diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebb8dabc5f..55eb8fa161 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -158,7 +158,7 @@ oldest-supported-compiler-job: GIT_SUBMODULE_STRATEGY: none # DO NOT change this version number without updating the README to reflect # the requirement bump. - COMPILER_VERSION: 9 + COMPILER_VERSION: 10 # We define one job to do the Docker container build @@ -319,7 +319,11 @@ report-job: docs-job: interruptible: true stage: build + before_script: + - sudo apt-get -q -y update + - sudo apt-get -q -y install --no-upgrade doxygen graphviz rsync git script: - doc/publish-docs.sh + after_script: [] diff --git a/BOTS.md b/BOTS.md new file mode 100644 index 0000000000..2c35275ac0 --- /dev/null +++ b/BOTS.md @@ -0,0 +1,75 @@ +# VG Project Notes + +## Building +- New `.cpp` files auto-discovered +- Build with `make -j8` or `make obj/whatever.o` to build just one .o. +- You may be getting errors from `clangd`. If these errors seem spurious, stop and demand a `clangd` that works properly. + +## Testing + +### Running Bash-TAP Tests +Use `prove -v` (not `bash`) to execute Bash-TAP tests. This provides proper test harness output and better error reporting. + +**Important**: Run `prove` from the `test/` directory: +```bash +cd test +prove -v t/26_deconstruct.t +``` + +### Running Unit Tests +To run all unit tests: +```bash +./bin/vg test +``` +- `./bin/vg test "[tag]"` runs tests matching a tag + +#### Writing Unit Tests +- Framework: Catch v2 (header-only) +- Include: `#include "catch.hpp"` (in `src/unittest/catch.hpp`) +- Macros: `TEST_CASE("name", "[tags]")`, `SECTION("name")`, `REQUIRE(cond)` +- Namespace: `vg::unittest` +- Directory: `src/unittest/` + +### Running All Tests +```bash +make test +``` + +## Writing Code + +### HandleGraph API +The interfaces in libhandlegraph model a bidirected sequence graph (where nodes have DNA sequences and edges can connect to either the start or end of each involved node). 
+ +#### Core types +- `handle_t` - opaque 64-bit value +- `nid_t` - node ID type +- `edge_t` = `pair<handle_t, handle_t>` + +#### Key HandleGraph methods +- `get_handle(nid_t, bool is_reverse=false)` → `handle_t` +- `get_id(handle_t)` → `nid_t` +- `get_is_reverse(handle_t)` → `bool` +- `flip(handle_t)` → `handle_t` (toggle orientation) +- `get_sequence(handle_t)` → `string` (in handle's orientation) +- `follow_edges(handle_t, bool go_left, iteratee)` - iterate neighbors +- `for_each_handle(iteratee, bool parallel=false)` - iterate all nodes +- `for_each_edge(iteratee, bool parallel=false)` - iterate all edges +- `has_edge(handle_t left, handle_t right)` → `bool` + +#### MutableHandleGraph additions +- `create_handle(string seq)` / `create_handle(string seq, nid_t id)` → `handle_t` +- `create_edge(handle_t left, handle_t right)` +- `destroy_handle(handle_t)` / `destroy_edge(handle_t, handle_t)` + +#### HandleGraph algorithms +- Things like `topological_sort.hpp` and `copy_graph.hpp` are in `deps/libhandlegraph/src/include/handlegraph/algorithms`. 
+ +#### bdsg::HashGraph +- Header: `deps/libbdsg/bdsg/include/bdsg/hash_graph.hpp` +- Implements MutablePathMutableHandleGraph +- Go-to handlegraph implementation to use +- In libbdsg + +### Utilities +- `reverse_complement(string)` → `string` in src/utility.hpp + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000000..1a1007d91a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +BOTS.md \ No newline at end of file diff --git a/Makefile b/Makefile index 6506208d58..6159297339 100644 --- a/Makefile +++ b/Makefile @@ -104,7 +104,8 @@ ifeq ($(shell uname -s),Darwin) LD_UTIL_RPATH_FLAGS="" # Homebrew installs a Protobuf that uses an Abseil that is built with C++17, so we need to build with at least C++17 - CXX_STANDARD?=17 + # C++20 for spaceship operator and ranges + CXX_STANDARD?=20 # We may need libraries from Macports ifeq ($(shell if [ -d /opt/local/lib ];then echo 1;else echo 0;fi), 1) @@ -229,8 +230,9 @@ else $(info Compiler $(CXX) is assumed to be GCC) # gbwtgraph uses inline variables and our oldest supported compiler has - # C++17, so we should use C++17 - CXX_STANDARD?=17 + # C++17, so we should use at least C++17. 
+ # C++20 for spaceship operator and ranges + CXX_STANDARD?=20 # Set an rpath for vg and dependency utils to find installed libraries LD_UTIL_RPATH_FLAGS="-Wl,-rpath,$(CWD)/$(LIB_DIR)" @@ -820,7 +822,7 @@ $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_D +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) - +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) src/sparsehash/internal/sparseconfig.h $(FILTER) && $(MAKE) install-data $(FILTER) $(INC_DIR)/sparsepp/spp.h: $(wildcard $(SPARSEPP_DIR)/sparsepp/*.h) +cp -r $(SPARSEPP_DIR)/sparsepp $(INC_DIR)/ diff --git a/README.md b/README.md index a3e1d5e4cd..2c616f69fe 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ On other distros, or if you do not have root access, you will need to perform th liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ libzstd-dev pybind11-dev python3-pybind11 libssl-dev kmc -At present, you will need GCC version 9 or greater, with support for C++17, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. +At present, you will need GCC version 10 or greater, with support for C++20, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. Other libraries may be required. Please report any build difficulties. 
diff --git a/deps/gbwt b/deps/gbwt index d127b9aff4..9e92e4f11b 160000 --- a/deps/gbwt +++ b/deps/gbwt @@ -1 +1 @@ -Subproject commit d127b9aff47f7212603e56ff3e0383a1b257e0aa +Subproject commit 9e92e4f11bafcb1df92df9adf5991199c0f09f61 diff --git a/deps/gbwtgraph b/deps/gbwtgraph index bcc248bff4..8649f806a1 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit bcc248bff469bcb6d69a9e37be1dab43580f2417 +Subproject commit 8649f806a1fb332ad9090a76c6f0b7583219ef0a diff --git a/deps/libbdsg b/deps/libbdsg index e74fb663a5..cf3dce919a 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit e74fb663a5f85bc1f76d159b2b3a3691ed85862f +Subproject commit cf3dce919a0085f4cfac9c290a0f750b578f2573 diff --git a/doc/publish-docs.sh b/doc/publish-docs.sh index c17bd438e6..c3c2f839fb 100755 --- a/doc/publish-docs.sh +++ b/doc/publish-docs.sh @@ -26,9 +26,26 @@ COMMIT_AUTHOR_EMAIL="anovak+vgdocbot@soe.ucsc.edu" # See git submodule foreach --recursive git clean -xfd -# Find all the submodules that Doxygen wants to look at and make sure we have -# those. -cat Doxyfile | grep "^INPUT *=" | cut -f2 -d'=' | tr ' ' '\n' | grep "^ *deps" | sed 's_ *\(deps/[^/]*\).*_\1_' | sort | uniq | xargs -n 1 git submodule update --init --recursive +# Find all the submodules that Doxygen wants to look at and make sure we have them, freshly. +# +# The CI workspace can carry these submodules with stale origin URLs (left pointing at a local mirror +# that is not a valid repo on this runner). `git submodule deinit` clears the working tree and the +# superproject config entry, but it leaves the cached git directory under .git/modules/ in place, +# and `git submodule update --init` then reuses that stale gitdir (stale file:// origin + stale refs) +# instead of cloning fresh -- so the pinned commit isn't found and git falls back to a file-transport +# fetch that is (correctly) blocked. 
So we remove both the working tree AND the cached gitdir, forcing a +# clean clone from the canonical https URL in .gitmodules. Every pinned commit is a branch tip on github +# and the runner has https access, so no local ("file") transport is ever needed; protocol.file.allow=never +# enforces that as defense-in-depth (CVE-2022-39253). We only recurse where Doxygen needs a nested tree: +# deps/libvgio/deps (whose only nested submodule is vgteam/libhandlegraph). +GITDIR=$(git rev-parse --git-dir) +DOXYGEN_DEPS=$(cat Doxyfile | grep "^INPUT *=" | cut -f2 -d'=' | tr ' ' '\n' | grep "^ *deps" | sed 's_ *\(deps/[^/]*\).*_\1_' | sort | uniq) +for dep in ${DOXYGEN_DEPS}; do + git submodule deinit -f -- "${dep}" || true + rm -rf "${dep}" "${GITDIR}/modules/${dep}" +done +echo "${DOXYGEN_DEPS}" | xargs -n 1 git -c protocol.file.allow=never submodule update --init +git -c protocol.file.allow=never submodule update --init --recursive deps/libvgio # Build the documentation. # Assumes we are running in the repo root. diff --git a/src/cactus.cpp b/src/cactus.cpp index 6179663968..49eab63294 100644 --- a/src/cactus.cpp +++ b/src/cactus.cpp @@ -999,8 +999,8 @@ VG cactus_to_vg(stCactusGraph* cactus_graph) { return vg_graph; } -VG cactusify(VG& graph) { - if (graph.size() == 0) { +VG cactusify(const PathHandleGraph& graph) { + if (graph.get_node_count() == 0) { return VG(); } auto parts = handle_graph_to_cactus(graph, unordered_set()); diff --git a/src/cactus.hpp b/src/cactus.hpp index 36d53f2fab..21cfd8ebc7 100644 --- a/src/cactus.hpp +++ b/src/cactus.hpp @@ -46,7 +46,7 @@ VG cactus_to_vg(stCactusGraph* cactus_graph); // Convert vg into vg formatted cactus representation // Input graph must be sorted! 
-VG cactusify(VG& graph); +VG cactusify(const PathHandleGraph& graph); } diff --git a/src/cluster.hpp b/src/cluster.hpp index df997cc51c..cd6deab517 100644 --- a/src/cluster.hpp +++ b/src/cluster.hpp @@ -212,8 +212,8 @@ class MEMClusterer { protected: - class HitNode; class HitEdge; + class HitNode; class HitGraph; class DPScoreComparator; @@ -232,7 +232,47 @@ class MEMClusterer { /// is closest to the optimal separation void deduplicate_cluster_pairs(vector, int64_t>>& cluster_pairs, int64_t optimal_separation); }; + +class MEMClusterer::HitEdge { +public: + HitEdge(size_t to_idx, int32_t weight, int64_t distance) : to_idx(to_idx), weight(weight), distance(distance) {} + HitEdge() = default; + ~HitEdge() = default; + + /// Index of the node that the edge points to + size_t to_idx; + /// Weight for dynamic programming + int32_t weight; + + /// Estimated distance + int64_t distance; +}; + +class MEMClusterer::HitNode { +public: + HitNode(const MaximalExactMatch& mem, pos_t start_pos, int32_t score) : mem(&mem), start_pos(start_pos), score(score) { } + HitNode() = default; + ~HitNode() = default; + + const MaximalExactMatch* mem; + + /// Position of GCSA hit in the graph + pos_t start_pos; + + /// Score of the exact match this node represents + int32_t score; + + /// Score used in dynamic programming + int32_t dp_score; + + /// Edges from this node that are colinear with the read + vector edges_from; + + /// Edges to this node that are colinear with the read + vector edges_to; +}; + class MEMClusterer::HitGraph { public: @@ -286,46 +326,6 @@ class MEMClusterer::HitGraph { UnionFind components; }; -class MEMClusterer::HitNode { -public: - HitNode(const MaximalExactMatch& mem, pos_t start_pos, int32_t score) : mem(&mem), start_pos(start_pos), score(score) { } - HitNode() = default; - ~HitNode() = default; - - const MaximalExactMatch* mem; - - /// Position of GCSA hit in the graph - pos_t start_pos; - - /// Score of the exact match this node represents - int32_t score; 
- - /// Score used in dynamic programming - int32_t dp_score; - - /// Edges from this node that are colinear with the read - vector edges_from; - - /// Edges to this node that are colinear with the read - vector edges_to; -}; - -class MEMClusterer::HitEdge { -public: - HitEdge(size_t to_idx, int32_t weight, int64_t distance) : to_idx(to_idx), weight(weight), distance(distance) {} - HitEdge() = default; - ~HitEdge() = default; - - /// Index of the node that the edge points to - size_t to_idx; - - /// Weight for dynamic programming - int32_t weight; - - /// Estimated distance - int64_t distance; -}; - struct MEMClusterer::DPScoreComparator { private: const vector& nodes; diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index e9dbfda99e..ebd15b4ed0 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -468,11 +468,20 @@ void cache_payloads( const handlegraph::HandleGraph* graph_ptr = (const handlegraph::HandleGraph*) &gbz.graph; + double total_zipcode_time = 0.0, total_decoder_time = 0.0; + std::atomic node_count = 0; gbz.graph.for_each_handle([&](const handle_t& handle) { nid_t node_id = gbz.graph.get_id(handle); - ZipCode zipcode; pos_t pos = make_pos_t(node_id, false, 0); - zipcode.fill_in_zipcode_from_pos(distance_index, pos, true, graph_ptr); + ZipCode zipcode; + zipcode.fill_in_zipcode_from_pos(distance_index, pos, false, graph_ptr); + zipcode.fill_in_full_decoder(); + if (++node_count % 10000 == 0 && progress) { + double telapsed = gbwt::readTimer() - start; + #pragma omp critical (cerr) + std::cerr << " Cached " << node_count << " nodes in " << telapsed << "s" << std::endl; + } + payload_t payload = zipcode.get_payload_from_zip(); if (payload == MIPayload::NO_CODE && oversized_zipcodes != nullptr) { // The zipcode is too large for the payload field. 
diff --git a/src/graph.cpp b/src/graph.cpp index beca52b5e1..3f23ffef18 100644 --- a/src/graph.cpp +++ b/src/graph.cpp @@ -2,93 +2,6 @@ namespace vg { -void sort_by_id_dedup_and_clean(Graph& graph) { - remove_duplicates(graph); // graph is sorted here - remove_orphan_edges(graph); -} - -void remove_duplicates(Graph& graph) { - remove_duplicate_nodes(graph); - remove_duplicate_edges(graph); -} - -void remove_duplicate_edges(Graph& graph) { - sort_edges_by_id(graph); - graph.mutable_edge()->erase(std::unique(graph.mutable_edge()->begin(), - graph.mutable_edge()->end(), - [](const Edge& a, const Edge& b) { - return make_tuple(a.from(), a.to(), a.from_start(), a.to_end()) - == make_tuple(b.from(), b.to(), b.from_start(), b.to_end()); - }), graph.mutable_edge()->end()); - -} - -void remove_duplicate_nodes(Graph& graph) { - sort_nodes_by_id(graph); - graph.mutable_node()->erase(std::unique(graph.mutable_node()->begin(), - graph.mutable_node()->end(), - [](const Node& a, const Node& b) { - return a.id() == b.id(); - }), graph.mutable_node()->end()); -} - -void remove_orphan_edges(Graph& graph) { - set ids; - for (auto& node : graph.node()) { - ids.insert(node.id()); - } - graph.mutable_edge()->erase(std::remove_if(graph.mutable_edge()->begin(), - graph.mutable_edge()->end(), - [&ids](const Edge& e) { - return !ids.count(e.from()) || !ids.count(e.to()); - }), graph.mutable_edge()->end()); -} - -void sort_by_id(Graph& graph) { - sort_nodes_by_id(graph); - sort_edges_by_id(graph); -} - -void sort_nodes_by_id(Graph& graph) { - std::sort(graph.mutable_node()->begin(), - graph.mutable_node()->end(), - [](const Node& a, const Node& b) { - return a.id() < b.id(); - }); -} - -void sort_edges_by_id(Graph& graph) { - std::sort(graph.mutable_edge()->begin(), - graph.mutable_edge()->end(), - [](const Edge& a, const Edge& b) { - return make_tuple(a.from(), a.to(), a.from_start(), a.to_end()) - < make_tuple(b.from(), b.to(), b.from_start(), b.to_end()); - }); -} - -bool 
is_id_sortable(const Graph& graph) { - for (auto& edge : graph.edge()) { - if (edge.from() >= edge.to()) return false; - } - return true; -} - -bool has_inversion(const Graph& graph) { - for (auto& edge : graph.edge()) { - if (edge.from_start() || edge.to_end()) return true; - } - return false; -} - -void flip_doubly_reversed_edges(Graph& graph) { - for (auto& edge : *graph.mutable_edge()) { - if (edge.from_start() && edge.to_end()) { - edge.set_from_start(false); - edge.set_to_end(false); - } - } -} - void from_handle_graph(const HandleGraph& from, Graph& to) { from.for_each_handle([&](const handle_t& h) { Node* node = to.add_node(); diff --git a/src/graph.hpp b/src/graph.hpp index 964e46cceb..c85afe88ab 100644 --- a/src/graph.hpp +++ b/src/graph.hpp @@ -11,39 +11,6 @@ namespace vg { using namespace std; -/// remove duplicates and sort by id -void sort_by_id_dedup_and_clean(Graph& graph); - -/// remove duplicate nodes and edges -void remove_duplicates(Graph& graph); - -/// remove duplicate edges -void remove_duplicate_edges(Graph& graph); - -/// remove duplicate nodes -void remove_duplicate_nodes(Graph& graph); - -/// remove edges that link to a node that is not in the graph -void remove_orphan_edges(Graph& graph); - -/// order the nodes and edges in the graph by id -void sort_by_id(Graph& graph); - -/// order the nodes in the graph by id -void sort_nodes_by_id(Graph& graph); - -/// order the edges in the graph by id pairs -void sort_edges_by_id(Graph& graph); - -/// returns true if the graph is id-sortable (no reverse links) -bool is_id_sortable(const Graph& graph); - -/// returns true if we find an edge that may specify an inversion -bool has_inversion(const Graph& graph); - -/// clean up doubly-reversed edges -void flip_doubly_reversed_edges(Graph& graph); - // transfer data from a HandleGraph into an empty Graph void from_handle_graph(const HandleGraph& from, Graph& to); diff --git a/src/multipath_mapper.cpp b/src/multipath_mapper.cpp index 
74ad7718c1..f90214123c 100644 --- a/src/multipath_mapper.cpp +++ b/src/multipath_mapper.cpp @@ -2448,7 +2448,7 @@ namespace vg { // in the left_idxs and right_idxs vectors int64_t target_len = 2 * seq_len - left_side.clip_length - right_side.clip_length; auto distance_diff = [&](size_t l, size_t r) { - return abs(get<2>(left_sites[left_idxs[l]]) + get<2>(right_sites[right_idxs[r]]) - target_len); + return std::abs(static_cast(get<2>(left_sites[left_idxs[l]]) + get<2>(right_sites[right_idxs[r]]) - target_len)); }; // sweep to identify pairs that most nearly align diff --git a/src/recombinator.cpp b/src/recombinator.cpp index a9aaed4b10..07915118ed 100644 --- a/src/recombinator.cpp +++ b/src/recombinator.cpp @@ -1585,7 +1585,7 @@ void add_path(const gbwt::GBWT& source, gbwt::size_type path_id, gbwt::GBWTBuild gbwt::PathName path_name = source.metadata.path(path_id); std::string sample_name = source.metadata.sample(path_name.sample); std::string contig_name = source.metadata.contig(path_name.contig); - if (sample_name == gbwtgraph::REFERENCE_PATH_SAMPLE_NAME) { + if (sample_name == gbwtgraph::GENERIC_PATH_SAMPLE_NAME) { metadata.add_generic_path(contig_name); } else { // Reference samples will be copied later. diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 01e48e62ea..7c79e424fc 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -2,6 +2,8 @@ //#define debug_snarl_traversal //#define debug_distances //#define debug_subgraph +//#define debug_hub_label_build +//#define debug_hub_label_storage #include "snarl_distance_index.hpp" @@ -92,7 +94,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //Stores unfinished records, as type of record and offset into appropriate vector //(temp_node/snarl/chain_records) - vector> stack; + vector stack; //There may be components of the root that are connected to each other. 
Each connected component will //get put into a (fake) root-level snarl, but we don't know what those components will be initially, @@ -113,7 +115,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( #ifdef debug_distance_indexing cerr << " Starting new chain at " << graph->get_id(chain_start_handle) << (graph->get_is_reverse(chain_start_handle) ? " reverse" : " forward") << endl; //We shouldn't have seen this node before - //assert(temp_index.temp_node_records[graph->get_id(chain_start_handle)-min_node_id].node_id == 0); + //assert(temp_index.get_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(chain_start_handle))).node_id == 0); #endif //Fill in node in chain @@ -127,7 +129,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //And the node record itself - auto& temp_node = temp_index.temp_node_records.at(node_id-temp_index.min_node_id); + auto& temp_node = temp_index.get_node(temp_chain.children.back()); temp_node.node_id = node_id; temp_node.node_length = graph->get_length(chain_start_handle); temp_node.reversed_in_parent = graph->get_is_reverse(chain_start_handle); @@ -141,13 +143,13 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( */ //Done with this chain - pair chain_index = stack.back(); + SnarlDistanceIndex::temp_record_ref_t chain_index = stack.back(); stack.pop_back(); #ifdef debug_distance_indexing assert(chain_index.first == SnarlDistanceIndex::TEMP_CHAIN); #endif - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records.at(chain_index.second); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(chain_index); nid_t node_id = graph->get_id(chain_end_handle); if (temp_chain_record.children.size() == 1 && node_id == temp_chain_record.start_node_id) { @@ -159,7 +161,8 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( #endif 
//Get the node - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records.at(node_id - temp_index.min_node_id); + SnarlDistanceIndex::temp_record_ref_t node_index = make_pair(SnarlDistanceIndex::TEMP_NODE, node_id); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.get_node(node_index); temp_node_record.reversed_in_parent = false; @@ -199,20 +202,21 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (nid_t next_id : reachable_nodes) { //For each node that this is connected to, check if we've already seen it and if we have, then //union this chain and that node's chain - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& node_record = temp_index.temp_node_records[next_id-temp_index.min_node_id]; + SnarlDistanceIndex::temp_record_ref_t next_index = make_pair(SnarlDistanceIndex::TEMP_NODE, next_id); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& node_record = temp_index.get_node(next_index); if (node_record.node_id != 0) { //If we've already seen this node, union it with the new one //If we can see it by walking out from this top-level chain, then it must also be a //top-level chain (or node pretending to be a chain) size_t other_i = node_record.parent.first == SnarlDistanceIndex::TEMP_CHAIN - ? temp_index.temp_chain_records[node_record.parent.second].root_snarl_index + ? 
temp_index.get_chain(node_record.parent).root_snarl_index : node_record.root_snarl_index; #ifdef debug_distance_indexing assert(other_i != std::numeric_limits::max()); #endif root_snarl_component_uf.union_groups(other_i, temp_node_record.root_snarl_index); //#ifdef debug_distance_indexing -// cerr << " Union this trivial with " << temp_index.temp_chain_records[node_record.parent.second].start_node_id << " " << temp_index.temp_chain_records[node_record.parent.second].end_node_id << endl; +// cerr << " Union this trivial with " << temp_index.get_chain(node_record.parent).start_node_id << " " << temp_index.get_chain(node_record.parent).end_node_id << endl; //#endif } else { new_component = false; @@ -226,7 +230,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } else { //The last thing on the stack is the parent of this chain, which must be a snarl temp_node_record.parent = stack.back(); - auto& parent_snarl_record = temp_index.temp_snarl_records.at(temp_node_record.parent.second); + auto& parent_snarl_record = temp_index.get_snarl(temp_node_record.parent); temp_node_record.rank_in_parent = parent_snarl_record.children.size() + 2; parent_snarl_record.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); } @@ -282,20 +286,25 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (nid_t next_id : reachable_nodes) { //For each node that this is connected to, check if we've already seen it and if we have, then //union this chain and that node's chain - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& node_record = temp_index.temp_node_records[next_id-temp_index.min_node_id]; + SnarlDistanceIndex::temp_record_ref_t next_index = make_pair(SnarlDistanceIndex::TEMP_NODE, next_id); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& node_record = temp_index.get_node(next_index); if (node_record.node_id != 0) { //If we've already seen this node, union it with the new one //If we can 
see it by walking out from this top-level chain, then it must also be a //top-level chain (or node pretending to be a chain) size_t other_i = node_record.parent.first == SnarlDistanceIndex::TEMP_CHAIN - ? temp_index.temp_chain_records[node_record.parent.second].root_snarl_index + ? temp_index.get_chain(node_record.parent).root_snarl_index : node_record.root_snarl_index; #ifdef debug_distance_indexing assert(other_i != std::numeric_limits::max()); #endif root_snarl_component_uf.union_groups(other_i, temp_chain_record.root_snarl_index); #ifdef debug_distance_indexing - cerr << " Union this chain with " << temp_index.temp_chain_records[node_record.parent.second].start_node_id << " " << temp_index.temp_chain_records[node_record.parent.second].end_node_id << endl; + if (node_record.parent.first == SnarlDistanceIndex::TEMP_CHAIN) { + cerr << " Union this chain with " << temp_index.get_chain(node_record.parent).start_node_id << " " << temp_index.get_chain(node_record.parent).end_node_id << endl; + } else { + cerr << " Union this chain with root " << node_record.root_snarl_index << endl; + } #endif } else { new_component = false; @@ -310,7 +319,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } else { //The last thing on the stack is the parent of this chain, which must be a snarl temp_chain_record.parent = stack.back(); - auto& parent_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.parent.second); + auto& parent_snarl_record = temp_index.get_snarl(temp_chain_record.parent); temp_chain_record.rank_in_parent = parent_snarl_record.children.size() + 2; parent_snarl_record.children.emplace_back(chain_index); } @@ -347,13 +356,13 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( * parent chain * Also create a node record */ - pair snarl_index = stack.back(); + SnarlDistanceIndex::temp_record_ref_t snarl_index = stack.back(); stack.pop_back(); #ifdef debug_distance_indexing assert(snarl_index.first == 
SnarlDistanceIndex::TEMP_SNARL); assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); #endif - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records[snarl_index.second]; + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(snarl_index); nid_t node_id = graph->get_id(snarl_end_handle); //Record the end node in the snarl @@ -362,12 +371,12 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_snarl_record.end_node_length = graph->get_length(snarl_end_handle); temp_snarl_record.node_count = temp_snarl_record.children.size(); bool any_edges_in_snarl = false; - graph->follow_edges(graph->get_handle(temp_snarl_record.start_node_id, temp_snarl_record.start_node_rev), false, [&](const handle_t next_handle) { + graph->follow_edges(graph->get_handle(temp_snarl_record.start_node_id, temp_snarl_record.start_node_rev), false, [&](const handle_t& next_handle) { if (graph->get_id(next_handle) != temp_snarl_record.end_node_id) { any_edges_in_snarl = true; } }); - graph->follow_edges(graph->get_handle(temp_snarl_record.end_node_id, !temp_snarl_record.end_node_rev), false, [&](const handle_t next_handle) { + graph->follow_edges(graph->get_handle(temp_snarl_record.end_node_id, !temp_snarl_record.end_node_rev), false, [&](const handle_t& next_handle) { if (graph->get_id(next_handle) != temp_snarl_record.start_node_id) { any_edges_in_snarl = true; } @@ -377,53 +386,50 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //This is a trivial snarl temp_snarl_record.is_trivial = true; +#ifdef debug_distance_indexing + cerr << " Ending and forgetting trivial snarl " << temp_index.structure_start_end_as_string(snarl_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; +#endif + //Add the end node to the chain #ifdef debug_distance_indexing 
assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); #endif temp_snarl_record.parent = stack.back(); - auto& temp_chain = temp_index.temp_chain_records.at(stack.back().second); + auto& temp_chain = temp_index.get_chain(stack.back()); temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); - //Remove the snarl record + //Remove the snarl record. + //This invalidates snarl_index!!! #ifdef debug_distance_indexing assert(temp_index.temp_snarl_records.size() == snarl_index.second+1); #endif temp_index.temp_snarl_records.pop_back(); } else { //This is the child of a chain + +#ifdef debug_distance_indexing + cerr << " Ending new snarl " << temp_index.structure_start_end_as_string(snarl_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; +#endif + #ifdef debug_distance_indexing assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); #endif temp_snarl_record.parent = stack.back(); - auto& temp_chain = temp_index.temp_chain_records.at(stack.back().second); + auto& temp_chain = temp_index.get_chain(stack.back()); temp_chain.children.emplace_back(snarl_index); temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); } - //Record the snarl as a child of its chain - //if (stack.empty()) { - // assert(false); - // //TODO: The snarl should always be the child of a chain - // //If this was the last thing on the stack, then this was a root - // //TODO: I'm not sure if this would get put into a chain or not - // temp_snarl_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); - // temp_index.components.emplace_back(snarl_index); - //} //Record the node itself. 
This gets done for the start of the chain, and ends of snarls - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records.at(node_id-temp_index.min_node_id); + SnarlDistanceIndex::temp_record_ref_t node_index = make_pair(SnarlDistanceIndex::TEMP_NODE, node_id); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.get_node(node_index); temp_node_record.node_id = node_id; temp_node_record.node_length = graph->get_length(snarl_end_handle); temp_node_record.reversed_in_parent = graph->get_is_reverse(snarl_end_handle); temp_node_record.parent = stack.back(); - - - -#ifdef debug_distance_indexing - cerr << " Ending new snarl " << temp_index.structure_start_end_as_string(snarl_index) - << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; -#endif }); /* @@ -451,7 +457,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (size_t chain_i : root_snarl_indexes) { //For each chain component of this root-level snarl if (temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_CHAIN){ - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records[temp_index.root_snarl_components[chain_i].second]; + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(temp_index.root_snarl_components[chain_i]); temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); temp_chain_record.rank_in_parent = temp_snarl_record.children.size(); temp_chain_record.reversed_in_parent = false; @@ -461,7 +467,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( #ifdef debug_distance_indexing assert(temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_NODE); #endif - 
SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records[temp_index.root_snarl_components[chain_i].second - temp_index.min_node_id]; + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.get_node(temp_index.root_snarl_components[chain_i]); temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); temp_node_record.rank_in_parent = temp_snarl_record.children.size(); temp_node_record.reversed_in_parent = false; @@ -484,11 +490,11 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( cerr << "Filling in the distances in snarls" << endl; #endif for (int i = temp_index.temp_chain_records.size()-1 ; i >= 0 ; i--) { - - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records[i]; + SnarlDistanceIndex::temp_record_ref_t chain_index = make_pair(SnarlDistanceIndex::TEMP_CHAIN, i); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(chain_index); #ifdef debug_distance_indexing assert(!temp_chain_record.is_trivial); - cerr << " At " << (temp_chain_record.is_trivial ? " trivial " : "") << " chain " << temp_index.structure_start_end_as_string(make_pair(SnarlDistanceIndex::TEMP_CHAIN, i)) << endl; + cerr << " At" << (temp_chain_record.is_trivial ? 
" trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) << endl; #endif //Add the first values for the prefix sum and backwards loop vectors @@ -505,7 +511,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( size_t curr_component = 0; //which component of the chain are we in size_t last_node_length = 0; for (size_t chain_child_i = 0 ; chain_child_i < temp_chain_record.children.size() ; chain_child_i++ ){ - const pair& chain_child_index = temp_chain_record.children[chain_child_i]; + const SnarlDistanceIndex::temp_record_ref_t& chain_child_index = temp_chain_record.children[chain_child_i]; //Go through each of the children in the chain, skipping nodes //The snarl may be trivial, in which case don't fill in the distances #ifdef debug_distance_indexing @@ -518,7 +524,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //all distances, then add distances to the chain that this is in //The parent chain will be the last thing in the stack SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = - temp_index.temp_snarl_records.at(chain_child_index.second); + temp_index.get_snarl(chain_child_index); //Fill in this snarl's distances populate_snarl_index(temp_index, chain_child_index, size_limit, only_top_level_chain_distances, graph); @@ -566,13 +572,13 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //If this is a node and the last thing was also a node, //then there was a trivial snarl SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = - temp_index.temp_node_records.at(chain_child_index.second-temp_index.min_node_id); + temp_index.get_node(chain_child_index); //Check if there is a loop in this node //Snarls get counted as trivial if they contain no nodes but they might still have edges size_t backward_loop = std::numeric_limits::max(); - graph->follow_edges(graph->get_handle(temp_node_record.node_id, 
!temp_node_record.reversed_in_parent), false, [&](const handle_t next_handle) { + graph->follow_edges(graph->get_handle(temp_node_record.node_id, !temp_node_record.reversed_in_parent), false, [&](const handle_t& next_handle) { if (graph->get_id(next_handle) == temp_node_record.node_id) { //If there is a loop going backwards (relative to the chain) back to the same node backward_loop = 0; @@ -590,7 +596,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } temp_chain_record.chain_components.emplace_back(curr_component); } - last_node_length = temp_index.temp_node_records.at(chain_child_index.second - temp_index.min_node_id).node_length; + last_node_length = temp_index.get_node(chain_child_index).node_length; //And update the chains max length temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, last_node_length); @@ -626,7 +632,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //If this is a looping chain, then check the first snarl for a loop if (temp_chain_record.children.at(1).first == SnarlDistanceIndex::TEMP_SNARL) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.children.at(1).second); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(temp_chain_record.children.at(1)); temp_chain_record.forward_loops[temp_chain_record.forward_loops.size()-1] = temp_snarl_record.distance_start_start; } } @@ -637,7 +643,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { auto& child = temp_chain_record.children.at(j); if (child.first == SnarlDistanceIndex::TEMP_SNARL){ - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); + 
SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(child); if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) && temp_chain_record.chain_components.at(node_i+1) != 0){ //If this is a new chain component, then add the loop distance from the snarl @@ -656,13 +662,13 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } else { if (last_node_length != 0) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = - temp_index.temp_node_records.at(child.second-temp_index.min_node_id); + temp_index.get_node(child); //Check if there is a loop in this node //Snarls get counted as trivial if they contain no nodes but they might still have edges size_t forward_loop = std::numeric_limits::max(); - graph->follow_edges(graph->get_handle(temp_node_record.node_id, temp_node_record.reversed_in_parent), false, [&](const handle_t next_handle) { + graph->follow_edges(graph->get_handle(temp_node_record.node_id, temp_node_record.reversed_in_parent), false, [&](const handle_t& next_handle) { if (graph->get_id(next_handle) == temp_node_record.node_id) { //If there is a loop going forward (relative to the chain) back to the same node forward_loop = 0; @@ -673,7 +679,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( 2*last_node_length)); node_i--; } - last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; + last_node_length = temp_index.get_node(child).node_length; } } @@ -692,7 +698,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (size_t i = 1 ; i < temp_chain_record.children.size()-1 ; i++ ) { auto& child = temp_chain_record.children.at(i); if (child.first == SnarlDistanceIndex::TEMP_SNARL) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = 
temp_index.temp_snarl_records.at(child.second); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(child); size_t new_loop_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( temp_chain_record.backward_loops.at(node_i-1), 2*temp_snarl_record.min_length), @@ -715,7 +721,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_chain_record.backward_loops.at(node_i) = std::min(old_loop_distance,new_loop_distance); node_i++; } - last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; + last_node_length = temp_index.get_node(child).node_length; } } } @@ -729,7 +735,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { auto& child = temp_chain_record.children.at(j); if (child.first == SnarlDistanceIndex::TEMP_SNARL){ - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(child); size_t new_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( temp_chain_record.forward_loops.at(node_i+1), 2* temp_snarl_record.min_length), @@ -751,7 +757,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_chain_record.forward_loops.at(node_i) = std::min(old_distance, new_distance); node_i--; } - last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; + last_node_length = temp_index.get_node(child).node_length; } } } @@ -767,9 +773,9 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( #ifdef debug_distance_indexing cerr << "Filling in the distances in root snarls and distances along chains" << endl; #endif - for (pair& component_index : temp_index.components) { 
+ for (SnarlDistanceIndex::temp_record_ref_t& component_index : temp_index.components) { if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(component_index.second); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(component_index); populate_snarl_index(temp_index, component_index, size_limit, only_top_level_chain_distances, graph); temp_snarl_record.min_length = std::numeric_limits::max(); } @@ -782,9 +788,44 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( return temp_index; } +/** + * Populate a row of the distance matrix. + * Also responsible for filling in min_length, distance_start_start, and distance_start_end on the TemporarySnarlRecord when a distance matrix is used. + */ +static void populate_distance_matrix_row(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const SnarlDistanceIndex::temp_record_ref_t& start_index, const HandleGraph* graph, size_t start_rank, bool is_internal_node, size_t size_limit); + +/** + * Fills in required distance matrix rows for each child. + * Only called for non-oversized snarls (size_limit == 0 || node_count <= size_limit); + * oversized snarls go through populate_hub_labeling instead. 
+ * - Normal snarl: all rows + * - size_limit == 0: no distances in index, so no rows + * - Top-level chain distances only: boundaries and tips only + */ +static void populate_distance_matrix_if_needed(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph, size_t size_limit, bool only_top_level_chain_distances); +/** + * Does three things: + * - Builds temp graph that hub labels will be built on + * - Builds the hub labels + * - Stores labels in temp_snarl_record + */ +static void populate_hub_labeling(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph); + +/** + * Determine if a snarl is regular or not. + * + * A regular snarl is a snarl that consists of only nodes or + * chains connected to the start and end, without any connections between + * multiple children, or any way to turn around. There may be an edge directly + * across. + * + * A simple snarl is always regular. + */ +static bool check_regularity(const SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph); -/*Fill in the snarl index. +/** + * Fill in the snarl index. * The index will already know its boundaries and everything knows their relationships in the * snarl tree. 
This needs to fill in the distances and the ranks of children in the snarl * The rank of a child is arbitrary, except that the start node will always be 0 and the end node @@ -792,21 +833,18 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( */ void populate_snarl_index( SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, - pair snarl_index, size_t size_limit, + SnarlDistanceIndex::temp_record_ref_t snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) { #ifdef debug_distance_indexing cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl; assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); #endif - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(snarl_index.second); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(snarl_index); temp_snarl_record.is_simple=true; - - - /*Helper function to find the ancestor of a node that is a child of this snarl */ - auto get_ancestor_of_node = [&](pair curr_index, - pair ancestor_snarl_index) { + auto get_ancestor_of_node = [&](SnarlDistanceIndex::temp_record_ref_t curr_index, + SnarlDistanceIndex::temp_record_ref_t ancestor_snarl_index) { //This is a child that isn't a node, so it must be a chain if (curr_index.second == temp_snarl_record.start_node_id || @@ -815,11 +853,11 @@ void populate_snarl_index( } //Otherwise, walk up until we hit the current snarl - pair parent_index = temp_index.temp_node_records.at(curr_index.second-temp_index.min_node_id).parent; + SnarlDistanceIndex::temp_record_ref_t parent_index = temp_index.get_node(curr_index).parent; while (parent_index != ancestor_snarl_index) { curr_index=parent_index; - parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? 
temp_index.temp_snarl_records.at(parent_index.second).parent - : temp_index.temp_chain_records.at(parent_index.second).parent; + parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? temp_index.get_snarl(parent_index).parent + : temp_index.get_chain(parent_index).parent; #ifdef debug_distance_indexing assert(parent_index.first != SnarlDistanceIndex::TEMP_ROOT); #endif @@ -829,7 +867,7 @@ void populate_snarl_index( }; // TODO: Copying the list - vector> all_children = temp_snarl_record.children; + vector all_children = temp_snarl_record.children; // Identify tips for (const auto& child : all_children) { @@ -931,23 +969,23 @@ void populate_snarl_index( // If the current child is the start bound, then get the start node pointing in current_graph_handle = topological_sort_start; } else { - pair current_index = all_children[current_child_index.first]; + SnarlDistanceIndex::temp_record_ref_t current_index = all_children[current_child_index.first]; if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { // If the current child is a node, then get the node pointing in the correct direction current_graph_handle = graph->get_handle(current_index.second, current_child_index.second); } else if (current_child_index.second) { // If the current child is a chain, and we're traversing the chain backwards - current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, - !temp_index.temp_chain_records[current_index.second].start_node_rev); + current_graph_handle = graph->get_handle(temp_index.get_chain(current_index).start_node_id, + !temp_index.get_chain(current_index).start_node_rev); } else { // Otherwise, the current child is a chain and we're traversing the chain forwards - current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, - temp_index.temp_chain_records[current_index.second].end_node_rev); + current_graph_handle = 
graph->get_handle(temp_index.get_chain(current_index).end_node_id, + temp_index.get_chain(current_index).end_node_rev); } } - + // Try all edges leaving this side - graph->follow_edges(current_graph_handle, false, [&](const handle_t next_handle) { + graph->follow_edges(current_graph_handle, false, [&](const handle_t& next_handle) { #ifdef debug_distance_indexing cerr << "Following forward edges from " << graph->get_id(current_graph_handle) << " to " << graph->get_id(next_handle) << endl; @@ -958,19 +996,18 @@ void populate_snarl_index( return true; } // Is next_handle a new source? Any unvisited predecessors? - pair next_index = - get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); - bool next_is_node = next_index.first == SnarlDistanceIndex::TEMP_NODE; - size_t next_rank = next_is_node - ? temp_index.temp_node_records.at(next_index.second - temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records[next_index.second].rank_in_parent; + SnarlDistanceIndex::temp_record_ref_t next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); + size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(next_index).rank_in_parent + : temp_index.get_chain(next_index).rank_in_parent; // Subtract 2 to get the index from the rank assert(next_rank >= 2); next_rank -= 2; assert(all_children[next_rank] == next_index); - bool next_rev = (next_is_node || temp_index.temp_chain_records[next_index.second].is_trivial) - ? graph->get_is_reverse(next_handle) - : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; + bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.get_chain(next_index).is_trivial + ? 
graph->get_is_reverse(next_handle) + : graph->get_id(next_handle) == temp_index.get_chain(next_index).end_node_id; if (visited_ranks.count(next_rank) != 0) { // If this is a loop, abort return true; @@ -979,16 +1016,16 @@ void populate_snarl_index( // Get the handle from the child represented by next_handle going the other way handle_t reverse_handle = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_handle(next_index.second, !next_rev) : - (next_rev ? graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, - temp_index.temp_chain_records[next_index.second].end_node_rev) - : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, - !temp_index.temp_chain_records[next_index.second].start_node_rev)); + (next_rev ? graph->get_handle(temp_index.get_chain(next_index).end_node_id, + temp_index.get_chain(next_index).end_node_rev) + : graph->get_handle(temp_index.get_chain(next_index).start_node_id, + !temp_index.get_chain(next_index).start_node_rev)); // Does this have no unseen incoming edges? Check as we go through incoming edges bool is_source = true; // Does this have no unseen incoming edges? - graph->follow_edges(reverse_handle, false, [&](const handle_t incoming_handle) { + graph->follow_edges(reverse_handle, false, [&](const handle_t& incoming_handle) { #ifdef debug_distance_indexing cerr << "Getting backwards edge to " << graph->get_id(incoming_handle) << endl; #endif @@ -998,16 +1035,15 @@ void populate_snarl_index( return true; } // The index of the snarl's child that next_handle represents - pair incoming_index = - get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle)), snarl_index); - bool incoming_is_node = incoming_index.first == SnarlDistanceIndex::TEMP_NODE; - size_t incoming_rank = incoming_is_node - ? 
temp_index.temp_node_records.at(incoming_index.second - temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; - - bool incoming_rev = incoming_is_node || temp_index.temp_chain_records[incoming_index.second].is_trivial - ? graph->get_is_reverse(incoming_handle) - : graph->get_id(incoming_handle) == temp_index.temp_chain_records[incoming_index.second].end_node_id; + SnarlDistanceIndex::temp_record_ref_t incoming_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle)), snarl_index); + size_t incoming_rank = incoming_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(incoming_index).rank_in_parent + : temp_index.get_chain(incoming_index).rank_in_parent; + + bool incoming_rev = incoming_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.get_chain(incoming_index).is_trivial + ? graph->get_is_reverse(incoming_handle) + : graph->get_id(incoming_handle) == temp_index.get_chain(incoming_index).end_node_id; // Subtract 2 to get the index from the rank assert(incoming_rank >= 2); incoming_rank -= 2; @@ -1046,9 +1082,9 @@ void populate_snarl_index( for (size_t new_rank = 0 ; new_rank < topological_sort_order.size() ; new_rank++) { size_t old_rank = topological_sort_order[new_rank]; if (all_children[old_rank].first == SnarlDistanceIndex::TEMP_NODE) { - temp_index.temp_node_records.at(all_children[old_rank].second-temp_index.min_node_id).rank_in_parent = new_rank+2; + temp_index.get_node(all_children[old_rank]).rank_in_parent = new_rank+2; } else { - temp_index.temp_chain_records[all_children[old_rank].second].rank_in_parent = new_rank+2; + temp_index.get_chain(all_children[old_rank]).rank_in_parent = new_rank+2; } const auto& old_is_tip = old_tippy_ranks.find(old_rank); if (old_is_tip != old_tippy_ranks.end()) { @@ -1063,30 +1099,143 @@ void populate_snarl_index( */ - if (size_limit != 0 && !only_top_level_chain_distances) { - //If we are saving 
distances - //Reserve enough space to store all possible distances - temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit - ? temp_snarl_record.node_count * 2 - : temp_snarl_record.node_count * temp_snarl_record.node_count); + // Add the start and end nodes to the list of children so that we include them in the traversal. + if (!temp_snarl_record.is_root_snarl) { + all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); + all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); + } + + if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { + temp_index.most_oversized_snarl_size = std::max(temp_index.most_oversized_snarl_size, temp_snarl_record.node_count); + temp_index.use_oversized_snarls = true; + temp_snarl_record.is_simple = false; + populate_hub_labeling(temp_index, snarl_index, temp_snarl_record, all_children, graph); + + if (!temp_snarl_record.is_root_snarl) { + // We need to query the hub labeling to fill in min_length, + // distance_start_start, and distance_start_end with the connectivity + // distances through the snarl, not including boundary nodes. + // + // Luckily we know the start is always child rank 0 forward, and the end + // is always child rank 1 forward. + // + // To exclude the boundary lengths we go from source port to non-source + // port. + // + // Root snarls have no boundary nodes (no rank 0/1), so these queries + // are meaningless for them. The root read path ignores these fields too. 
+ temp_snarl_record.min_length = promote_distance(hhl_query(temp_snarl_record.hub_labels.begin(), bgid(0, false, true), bgid(1, false, false))); + temp_snarl_record.distance_start_start = promote_distance(hhl_query(temp_snarl_record.hub_labels.begin(), bgid(0, false, true), bgid(0, true, false))); + temp_snarl_record.distance_end_end = promote_distance(hhl_query(temp_snarl_record.hub_labels.begin(), bgid(1, true, true), bgid(1, false, false))); + } + // TODO: Should this be here or should it be part of populate_hub_labeling()? Or its own function? } else { + if (size_limit == 0 || only_top_level_chain_distances) { temp_snarl_record.include_distances = false; + } + // Also fills in min_length, distance_start_start, and distance_start_end, and sets is_simple to false if snarl isn't simple + populate_distance_matrix_if_needed(temp_index, snarl_index, temp_snarl_record, all_children, graph, size_limit, only_top_level_chain_distances); } - if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { - temp_index.most_oversized_snarl_size = std::max(temp_index.most_oversized_snarl_size, temp_snarl_record.node_count); - temp_index.use_oversized_snarls = true; +#ifdef debug_distance_indexing + cerr << "snarl " << temp_index.structure_start_end_as_string(snarl_index) << " is_simple: " << temp_snarl_record.is_simple << endl; +#endif + + if (temp_snarl_record.is_simple) { + // If this is a simple snarl (one with only single nodes that connect to the start and end nodes), then + // we want to remember if the child nodes are reversed + for (size_t i = 0 ; i < temp_snarl_record.node_count ; i++) { + //Get the index of the child + const SnarlDistanceIndex::temp_record_ref_t& child_index = temp_snarl_record.children[i]; + //Which is a node +#ifdef debug_distance_indexing + assert(child_index.first == SnarlDistanceIndex::TEMP_NODE); +#endif + + //And get the record + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + 
temp_index.get_node(child_index); + size_t rank =temp_node_record.rank_in_parent; + + + + //Set the orientation of this node in the simple snarl + temp_node_record.reversed_in_parent = temp_node_record.distance_left_start == std::numeric_limits::max(); + } + + } + + // Decide if the snarl is regular. + temp_snarl_record.is_regular = check_regularity(temp_index, snarl_index, temp_snarl_record, all_children, graph); + + //Now that the distances are filled in, predict the size of the snarl in the index + temp_index.max_index_size += temp_snarl_record.get_max_record_length(); + if (temp_snarl_record.is_simple) { + temp_index.max_index_size -= (temp_snarl_record.children.size() * SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord::get_max_record_length()); } - //Add the start and end nodes to the list of children so that we include them in the traversal - if (!temp_snarl_record.is_root_snarl) { - all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); - all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); + // For simple snarl records, need 11 + 11 + number of bits for the number of children + temp_index.max_bits = std::max(temp_index.max_bits, 22 + SnarlDistanceIndex::bit_width(temp_snarl_record.children.size())); +} + +void populate_hub_labeling(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph) { + CHOverlay ov = make_boost_graph(temp_index, snarl_index, temp_snarl_record, all_children, graph); + +#ifdef debug_hub_label_build + // Dump CHOverlay graph to stderr for debugging + std::cerr << "=== CHOverlay Graph Dump ===" << std::endl; + std::cerr << ov << std::endl; + std::cerr << "=== End CHOverlay Dump ===" << std::endl; +#endif + + make_contraction_hierarchy(ov); + + vector> labels; 
labels.resize(num_vertices(ov)); + vector> labels_rev; labels_rev.resize(num_vertices(ov)); + create_labels(labels, labels_rev, ov); +#ifdef debug_hub_label_storage + std::cerr << "Hub labels unpacked:" << std::endl; + for (const auto& node_list : {labels, labels_rev}) { + std::cerr << "Labels for all nodes:" << std::endl; + for (size_t i = 0; i < node_list.size(); i++) { + std::cerr << "\tLabels for rank " << i << ":" << std::endl; + for (const HubRecord& label : node_list[i]) { + std::cerr << "\t\tHub: " << label.hub << " Dist: " << label.dist << std::endl; + } } + } +#endif + + // Put labels in temp_snarl_record + temp_snarl_record.hub_labels = pack_labels(labels, labels_rev); +#ifdef debug_hub_label_storage + std::cerr << "Hub labels as packed: "; + for (size_t i = 0; i < temp_snarl_record.hub_labels.size(); i++) { + if (i > 0) { + std::cerr << " | "; + } + std::cerr << temp_snarl_record.hub_labels[i]; + } + std::cerr << std::endl; +#endif +} - while (!all_children.empty()) { - const pair start_index = std::move(all_children.back()); - all_children.pop_back(); +void populate_distance_matrix_if_needed(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph, size_t size_limit, bool only_top_level_chain_distances) { + // This is only called for non-oversized snarls; oversized snarls go through populate_hub_labeling instead. +#ifdef debug_distance_indexing + assert(size_limit == 0 || temp_snarl_record.node_count <= size_limit); +#endif + if (size_limit != 0 && !only_top_level_chain_distances) { + //If we are saving distances + //Reserve enough space to store all possible distances. Since we are not oversized, node_count <= size_limit, + //so we always need the full node_count * node_count matrix. 
+ temp_snarl_record.distances.reserve(temp_snarl_record.node_count * temp_snarl_record.node_count); + } else { + temp_snarl_record.include_distances = false; + } + for (auto it = all_children.rbegin(); it != all_children.rend(); ++it) { + // Visit all the children in reverse order + const SnarlDistanceIndex::temp_record_ref_t& start_index = *it; bool is_internal_node = false; @@ -1094,21 +1243,44 @@ void populate_snarl_index( && start_index.second != temp_snarl_record.start_node_id && start_index.second != temp_snarl_record.end_node_id) || - (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && temp_index.temp_chain_records.at(start_index.second).is_trivial)) { - // This is an internal node + (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && temp_index.get_chain(start_index).is_trivial)) { + // If this is an internal node is_internal_node = true; - } else if (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && !temp_index.temp_chain_records.at(start_index.second).is_trivial) { + nid_t node_id = start_index.first == SnarlDistanceIndex::TEMP_NODE ? start_index.second : temp_index.get_chain(start_index).start_node_id; + SnarlDistanceIndex::temp_record_ref_t node_index {SnarlDistanceIndex::TEMP_NODE, node_id}; + size_t rank = start_index.first == SnarlDistanceIndex::TEMP_NODE ? 
temp_index.get_node(start_index).rank_in_parent + : temp_index.get_chain(start_index).rank_in_parent; + + bool has_edges = false; + graph->follow_edges(graph->get_handle(node_id, false), false, [&](const handle_t& next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.get_node(node_index).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, false); + temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl + } + has_edges = false; + graph->follow_edges(graph->get_handle(node_id, true), false, [&](const handle_t& next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.get_node(node_index).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, true); + temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl + } + } else if (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && !temp_index.get_chain(start_index).is_trivial) { // If this is an internal chain, then it isn't a simple snarl temp_snarl_record.is_simple=false; } bool start_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip - : temp_index.temp_chain_records.at(start_index.second).is_tip; + ? temp_index.get_node(start_index).is_tip + : temp_index.get_chain(start_index).is_tip; size_t start_rank = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records.at(start_index.second).rank_in_parent; + ? 
temp_index.get_node(start_index).rank_in_parent + : temp_index.get_chain(start_index).rank_in_parent; if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { @@ -1118,357 +1290,587 @@ void populate_snarl_index( } //TODO: //else { // assert(start_rank != 0 && start_rank != 1); - //} + //} - if ( (temp_snarl_record.node_count > size_limit || size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || (!start_is_tip && - start_rank != 0 && start_rank != 1))) { - //If we don't care about internal distances, and we also are not at a boundary or tip + //traversal start is not a tip or a boundary node + bool start_normal_child = (!start_is_tip && start_rank != 0 && start_rank != 1); + + if ( (size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || start_normal_child)) { + //We don't care about internal distances, and this child is a root child or a normal (non-boundary, non-tip) child //TODO: Why do we care about tips specifically? 
continue; } + //fill in all distances for a row + populate_distance_matrix_row(temp_index, snarl_index, temp_snarl_record, start_index, graph, start_rank, is_internal_node, size_limit); + } +} + + + +void populate_distance_matrix_row(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const SnarlDistanceIndex::temp_record_ref_t& start_index, const HandleGraph* graph, size_t start_rank, bool is_internal_node, size_t size_limit) { + /*Helper function to find the ancestor of a node that is a child of this snarl */ + auto get_ancestor_of_node = [&](SnarlDistanceIndex::temp_record_ref_t curr_index, + SnarlDistanceIndex::temp_record_ref_t ancestor_snarl_index) { - //Start from either direction for all nodes, but only going in for start and end - vector directions; - if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { - directions.emplace_back(temp_snarl_record.start_node_rev); - } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.end_node_id){ - directions.emplace_back(!temp_snarl_record.end_node_rev); - } else { - directions.emplace_back(true); - directions.emplace_back(false); + //This is a child that isn't a node, so it must be a chain + if (curr_index.second == temp_snarl_record.start_node_id || + curr_index.second == temp_snarl_record.end_node_id) { + return curr_index; } - for (bool start_rev : directions) { - //Start a dijkstra traversal from start_index going in the direction indicated by start_rev - //Record the distances to each node (child of the snarl) found - size_t reachable_node_count = 0; //How many nodes can we reach from this node side? 
+ //Otherwise, walk up until we hit the current snarl + SnarlDistanceIndex::temp_record_ref_t parent_index = temp_index.get_node(curr_index).parent; + while (parent_index != ancestor_snarl_index) { + curr_index=parent_index; + parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? temp_index.get_snarl(parent_index).parent + : temp_index.get_chain(parent_index).parent; #ifdef debug_distance_indexing - cerr << " Starting from child " << temp_index.structure_start_end_as_string(start_index) - << " going " << (start_rev ? "rev" : "fd") << endl; -#endif - - //Define a NetgraphNode as the value for the priority queue: - // , direction> - using NetgraphNode = pair, bool>>; - auto cmp = [] (const NetgraphNode a, const NetgraphNode b) { - return a.first > b.first; - }; - - //The priority queue of the next nodes to visit, ordered by the distance - std::priority_queue, decltype(cmp)> queue(cmp); - //The nodes we've already visited - unordered_set, bool>> visited_nodes; - visited_nodes.reserve(temp_snarl_record.node_count * 2); - - //Start from the current start node - queue.push(make_pair(0, make_pair(start_index, start_rev))); - - while (!queue.empty()) { - - //Get the current node from the queue and pop it out of the queue - size_t current_distance = queue.top().first; - pair current_index = queue.top().second.first; - bool current_rev = queue.top().second.second; - if (visited_nodes.count(queue.top().second)) { - queue.pop(); - continue; - } - visited_nodes.emplace(queue.top().second); + assert(parent_index.first != SnarlDistanceIndex::TEMP_ROOT); +#endif + } + + return curr_index; + }; + + //Start from either direction for all nodes, but only going in for start and end + vector directions; + if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { + directions.emplace_back(temp_snarl_record.start_node_rev); + } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == 
temp_snarl_record.end_node_id){ + directions.emplace_back(!temp_snarl_record.end_node_rev); + } else { + directions.emplace_back(true); + directions.emplace_back(false); + } + for (bool start_rev : directions) { + //Start a dijkstra traversal from start_index going in the direction indicated by start_rev + //Record the distances to each node (child of the snarl) found + size_t reachable_node_count = 0; //How many nodes can we reach from this node side? + +#ifdef debug_distance_indexing + cerr << " Starting from child " << temp_index.structure_start_end_as_string(start_index) + << " going " << (start_rev ? "rev" : "fd") << endl; +#endif + + //Define a NetgraphNode as the value for the priority queue: + // , direction> + using NetgraphNode = pair>; + auto cmp = [] (const NetgraphNode a, const NetgraphNode b) { + return a.first > b.first; + }; + + //The priority queue of the next nodes to visit, ordered by the distance + std::priority_queue, decltype(cmp)> queue(cmp); + //The nodes we've already visited + unordered_set> visited_nodes; + visited_nodes.reserve(temp_snarl_record.node_count * 2); + + //Start from the current start node + queue.push(make_pair(0, make_pair(start_index, start_rev))); + + while (!queue.empty()) { + + //Get the current node from the queue and pop it out of the queue + size_t current_distance = queue.top().first; + SnarlDistanceIndex::temp_record_ref_t current_index = queue.top().second.first; + bool current_rev = queue.top().second.second; + if (visited_nodes.count(queue.top().second)) { queue.pop(); + continue; + } + visited_nodes.emplace(queue.top().second); + queue.pop(); - //The handle that we need to follow to get the next reachable nodes - //If the current node is a node, then its just the node. Otherwise, it's the - //opposite side of the child chain - handle_t current_end_handle = current_index.first == SnarlDistanceIndex::TEMP_NODE ? - graph->get_handle(current_index.second, current_rev) : - (current_rev ? 
graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, - !temp_index.temp_chain_records[current_index.second].start_node_rev) - : graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, - temp_index.temp_chain_records[current_index.second].end_node_rev)); + //The handle that we need to follow to get the next reachable nodes + //If the current node is a node, then its just the node. Otherwise, it's the + //opposite side of the child chain + handle_t current_end_handle = current_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(current_index.second, current_rev) : + (current_rev ? graph->get_handle(temp_index.get_chain(current_index).start_node_id, + !temp_index.get_chain(current_index).start_node_rev) + : graph->get_handle(temp_index.get_chain(current_index).end_node_id, + temp_index.get_chain(current_index).end_node_rev)); #ifdef debug_distance_indexing - cerr << " at child " << temp_index.structure_start_end_as_string(current_index) << " going " - << (current_rev ? "rev" : "fd") << " at actual node " << graph->get_id(current_end_handle) - << (graph->get_is_reverse(current_end_handle) ? "rev" : "fd") << endl; + cerr << " at child " << temp_index.structure_start_end_as_string(current_index) << " going " + << (current_rev ? "rev" : "fd") << " at actual node " << graph->get_id(current_end_handle) + << (graph->get_is_reverse(current_end_handle) ? "rev" : "fd") << endl; #endif - graph->follow_edges(current_end_handle, false, [&](const handle_t next_handle) { - if (graph->get_id(current_end_handle) == graph->get_id(next_handle)){ - //If this loops onto the same node side then this isn't a simple snarl - temp_snarl_record.is_simple = false; - } else if ((current_index.first == SnarlDistanceIndex::TEMP_NODE ? current_index.second - : (current_rev ? 
temp_index.temp_chain_records[current_index.second].end_node_id - : temp_index.temp_chain_records[current_index.second].start_node_id)) - == graph->get_id(next_handle)){ - //If this loops to the other end of the chain then this isn't a simple snarl - temp_snarl_record.is_simple = false; - } else if (!temp_snarl_record.is_root_snarl && start_rank == 0 && - current_index != start_index && graph->get_id(next_handle) != temp_snarl_record.end_node_id) { - //If the starting point of this traversal was the start of the snarl, the current starting point is not the start node, - //and we found another child, then this is not a simple snarl - temp_snarl_record.is_simple = false; - } else if (!temp_snarl_record.is_root_snarl && start_rank == 1 && - current_index != start_index && graph->get_id(next_handle) != temp_snarl_record.start_node_id) { - //If the starting point of this traversal was the end of the snarl, the current starting point is not the end node, - //and we found another child, then this is not a simple snarl + graph->follow_edges(current_end_handle, false, [&](const handle_t& next_handle) { +#ifdef debug_distance_indexing + cerr << " see edge " << graph->get_id(current_end_handle) + << (graph->get_is_reverse(current_end_handle) ? "rev" : "fd") + << " -> " << graph->get_id(next_handle) + << (graph->get_is_reverse(next_handle) ? "rev" : "fd") << endl; +#endif + + if (graph->get_id(current_end_handle) == graph->get_id(next_handle)) { + //If this loops onto the same node side then this isn't a simple snarl + temp_snarl_record.is_simple = false; + } else if ((current_index.first == SnarlDistanceIndex::TEMP_NODE ? current_index.second + : (current_rev ? 
temp_index.get_chain(current_index).end_node_id + : temp_index.get_chain(current_index).start_node_id)) + == graph->get_id(next_handle)){ + //If this loops to the other end of the chain then this isn't a simple snarl + temp_snarl_record.is_simple = false; + } else if (!temp_snarl_record.is_root_snarl && start_rank == 0 && + current_index != start_index && graph->get_id(next_handle) != temp_snarl_record.end_node_id) { + //If the starting point of this traversal was the start of the snarl, the current starting point is not the start node, + //and we found another child, then this is not a simple snarl + temp_snarl_record.is_simple = false; + } else if (!temp_snarl_record.is_root_snarl && start_rank == 1 && + current_index != start_index && graph->get_id(next_handle) != temp_snarl_record.start_node_id) { + //If the starting point of this traversal was the end of the snarl, the current starting point is not the end node, + //and we found another child, then this is not a simple snarl + temp_snarl_record.is_simple = false; + } + + reachable_node_count++; + + SnarlDistanceIndex::temp_record_ref_t next_node_index = make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)); + + //At each of the nodes reachable from the current one, fill in the distance from the start + //node to the next node (current_distance). If this handle isn't leaving the snarl, + //add the next nodes along with the distance to the end of the next node + auto& node_record = temp_index.get_node(next_node_index); + + //The index of the snarl's child that next_handle represents + SnarlDistanceIndex::temp_record_ref_t next_index = + get_ancestor_of_node(next_node_index, snarl_index); + + bool next_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(start_index).is_tip + : temp_index.get_chain(start_index).is_tip; + + //The rank and orientation of next in the snarl + size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? 
node_record.rank_in_parent + : temp_index.get_chain(next_index).rank_in_parent; + if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.start_node_id) { +#ifdef debug_distance_indexing + std::cerr << " edge arrived at start" << std::endl; +#endif + next_rank = 0; + } else if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.end_node_id) { +#ifdef debug_distance_indexing + std::cerr << " edge arrived at end" << std::endl; +#endif + next_rank = 1; + } else { + //If the next thing wasn't a boundary node and this was an internal node, then it isn't a simple snarl + if (is_internal_node) { temp_snarl_record.is_simple = false; } + }//TODO: This won't be true of root snarls + //else { + // assert(next_rank != 0 && next_rank != 1); + //} + bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.get_chain(next_index).is_trivial + ? graph->get_is_reverse(next_handle) + : graph->get_id(next_handle) == temp_index.get_chain(next_index).end_node_id; + + /**Record the distance **/ + bool start_is_boundary = !temp_snarl_record.is_root_snarl && (start_rank == 0 || start_rank == 1); + bool next_is_boundary = !temp_snarl_record.is_root_snarl && (next_rank == 0 || next_rank == 1); + + pair start = start_is_boundary + ? make_pair(start_rank, false) : make_pair(start_rank, !start_rev); + pair next = next_is_boundary + ? make_pair(next_rank, false) : make_pair(next_rank, next_rev); + + if (size_limit == 0 && start_is_boundary && next_is_boundary) { + // If not measuring distances, we need to use + // distance_start_start and distance_end_end as + // connectivity flags so we can still detect reversals + // within chains and recognize regular snarls. 
+ if (start_rank == 0 && next_rank == 0) { + temp_snarl_record.distance_start_start = 0; +#ifdef debug_distance_indexing + cerr << " set loop indicator start start distance " << temp_snarl_record.distance_start_start << endl; +#endif + } else if (start_rank == 1 && next_rank == 1) { + temp_snarl_record.distance_end_end = 0; +#ifdef debug_distance_indexing + cerr << " set loop indicator end end distance " << temp_snarl_record.distance_start_start << endl; +#endif + } + } else if (size_limit != 0 && + (temp_snarl_record.node_count <= size_limit || start_is_boundary || next_is_boundary)) { + //If the snarl is too big, then we don't record distances between internal nodes + //If we are looking at all distances or we are looking at boundaries + bool added_new_distance = false; + + //Set the distance + if (start_is_boundary && next_is_boundary) { + //If it is between bounds of the snarl, then the snarl stores it + if (start_rank == 0 && next_rank == 0 && + temp_snarl_record.distance_start_start == std::numeric_limits::max()) { + temp_snarl_record.distance_start_start = current_distance; +#ifdef debug_distance_indexing + cerr << " set start start distance " << temp_snarl_record.distance_start_start << endl; +#endif + added_new_distance = true; + } else if (start_rank == 1 && next_rank == 1 && + temp_snarl_record.distance_end_end == std::numeric_limits::max()) { + temp_snarl_record.distance_end_end = current_distance; +#ifdef debug_distance_indexing + cerr << " set end end distance " << temp_snarl_record.distance_start_start << endl; +#endif + added_new_distance = true; + } else if (((start_rank == 0 && next_rank == 1) || (start_rank == 1 && next_rank == 0)) + && temp_snarl_record.min_length == std::numeric_limits::max()){ + temp_snarl_record.min_length = current_distance; + added_new_distance = true; - reachable_node_count++; - //At each of the nodes reachable from the current one, fill in the distance from the start - //node to the next node (current_distance). 
If this handle isn't leaving the snarl, - //add the next nodes along with the distance to the end of the next node - auto& node_record = temp_index.temp_node_records.at(graph->get_id(next_handle)-temp_index.min_node_id); - - //The index of the snarl's child that next_handle represents - pair next_index = - get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); - - bool next_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip - : temp_index.temp_chain_records.at(start_index.second).is_tip; - - //The rank and orientation of next in the snarl - size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE - ? node_record.rank_in_parent - : temp_index.temp_chain_records[next_index.second].rank_in_parent; - if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.start_node_id) { - next_rank = 0; - } else if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.end_node_id) { - next_rank = 1; - } else { - //If the next thing wasn't a boundary node and this was an internal node, then it isn't a simple snarl - if (is_internal_node) { - temp_snarl_record.is_simple = false; } - }//TODO: This won't be true of root snarls - //else { - // assert(next_rank != 0 && next_rank != 1); - //} - bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[next_index.second].is_trivial - ? 
graph->get_is_reverse(next_handle) - : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; - - /**Record the distance **/ - bool start_is_boundary = !temp_snarl_record.is_root_snarl && (start_rank == 0 || start_rank == 1); - bool next_is_boundary = !temp_snarl_record.is_root_snarl && (next_rank == 0 || next_rank == 1); - - if (size_limit != 0 && - (temp_snarl_record.node_count <= size_limit || start_is_boundary || next_is_boundary)) { - //If the snarl is too big, then we don't record distances between internal nodes - //If we are looking at all distances or we are looking at boundaries - bool added_new_distance = false; - - //Set the distance - pair start = start_is_boundary - ? make_pair(start_rank, false) : make_pair(start_rank, !start_rev); - pair next = next_is_boundary - ? make_pair(next_rank, false) : make_pair(next_rank, next_rev); - if (start_is_boundary && next_is_boundary) { - //If it is between bounds of the snarl, then the snarl stores it - if (start_rank == 0 && next_rank == 0 && - temp_snarl_record.distance_start_start == std::numeric_limits::max()) { - temp_snarl_record.distance_start_start = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && next_rank == 1 && - temp_snarl_record.distance_end_end == std::numeric_limits::max()) { - temp_snarl_record.distance_end_end = current_distance; - added_new_distance = true; - } else if (((start_rank == 0 && next_rank == 1) || (start_rank == 1 && next_rank == 0)) - && temp_snarl_record.min_length == std::numeric_limits::max()){ - temp_snarl_record.min_length = current_distance; + } else if (start_is_boundary){ + //If start is a boundary node + if (next_index.first == SnarlDistanceIndex::TEMP_NODE) { + //Next is a node + auto& temp_node_record = temp_index.get_node(next_index); + if (start_rank == 0 && !next_rev && + temp_node_record.distance_left_start == std::numeric_limits::max()) { + temp_node_record.distance_left_start = current_distance; 
added_new_distance = true; - + } else if (start_rank == 0 && next_rev && + temp_node_record.distance_right_start == std::numeric_limits::max()) { + temp_node_record.distance_right_start = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && !next_rev && + temp_node_record.distance_left_end == std::numeric_limits::max()) { + temp_node_record.distance_left_end = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && next_rev && + temp_node_record.distance_right_end == std::numeric_limits::max()) { + temp_node_record.distance_right_end = current_distance; + added_new_distance = true; } - } else if (start_is_boundary){ - //If start is a boundary node - if (next_index.first == SnarlDistanceIndex::TEMP_NODE) { - //Next is a node - auto& temp_node_record = temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id); - if (start_rank == 0 && !next_rev && - temp_node_record.distance_left_start == std::numeric_limits::max()) { - temp_node_record.distance_left_start = current_distance; - added_new_distance = true; - } else if (start_rank == 0 && next_rev && - temp_node_record.distance_right_start == std::numeric_limits::max()) { - temp_node_record.distance_right_start = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && !next_rev && - temp_node_record.distance_left_end == std::numeric_limits::max()) { - temp_node_record.distance_left_end = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && next_rev && - temp_node_record.distance_right_end == std::numeric_limits::max()) { - temp_node_record.distance_right_end = current_distance; - added_new_distance = true; - } - } else { - //Next is a chain - auto& temp_chain_record = temp_index.temp_chain_records.at(next_index.second); - if (start_rank == 0 && !next_rev && - temp_chain_record.distance_left_start == std::numeric_limits::max()) { - temp_chain_record.distance_left_start = current_distance; - added_new_distance 
= true; - } else if (start_rank == 0 && next_rev && - temp_chain_record.distance_right_start == std::numeric_limits::max()) { - temp_chain_record.distance_right_start = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && !next_rev && - temp_chain_record.distance_left_end == std::numeric_limits::max()) { - temp_chain_record.distance_left_end = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && next_rev && - temp_chain_record.distance_right_end == std::numeric_limits::max()) { - temp_chain_record.distance_right_end = current_distance; - added_new_distance = true; - } + } else { + //Next is a chain + auto& temp_chain_record = temp_index.get_chain(next_index); + if (start_rank == 0 && !next_rev && + temp_chain_record.distance_left_start == std::numeric_limits::max()) { + temp_chain_record.distance_left_start = current_distance; + added_new_distance = true; + } else if (start_rank == 0 && next_rev && + temp_chain_record.distance_right_start == std::numeric_limits::max()) { + temp_chain_record.distance_right_start = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && !next_rev && + temp_chain_record.distance_left_end == std::numeric_limits::max()) { + temp_chain_record.distance_left_end = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && next_rev && + temp_chain_record.distance_right_end == std::numeric_limits::max()) { + temp_chain_record.distance_right_end = current_distance; + added_new_distance = true; } - } else if (!next_is_boundary && !temp_snarl_record.distances.count(make_pair(start, next))) { - //Otherwise the snarl stores it in its distance - //If the distance isn't from an internal node to a bound and we haven't stored the distance yet + } + } else if (!next_is_boundary && !temp_snarl_record.distances.count(make_pair(start, next))) { + //Otherwise the snarl stores it in its distance + //If the distance isn't from an internal node to a bound and we 
haven't stored the distance yet - temp_snarl_record.distances[make_pair(start, next)] = current_distance; - added_new_distance = true; + temp_snarl_record.distances[make_pair(start, next)] = current_distance; + added_new_distance = true; #ifdef debug_distance_indexing - cerr << " Adding distance between ranks " << start.first << " " << start.second << " and " << next.first << " " << next.second << ": " << current_distance << endl; + cerr << " Adding distance between ranks " << start.first << " " << start.second << " and " << next.first << " " << next.second << ": " << current_distance << endl; #endif - } - if (added_new_distance) { - temp_snarl_record.max_distance = std::max(temp_snarl_record.max_distance, current_distance); - } } + if (added_new_distance) { + temp_snarl_record.max_distance = std::max(temp_snarl_record.max_distance, current_distance); + } + } - /**Add the next node to the priority queue**/ - - if (visited_nodes.count(make_pair(next_index, next_rev)) == 0 && + /**Add the next node to the priority queue**/ + + if (visited_nodes.count(make_pair(next_index, next_rev)) == 0 && + graph->get_id(next_handle) != temp_snarl_record.start_node_id && + graph->get_id(next_handle) != temp_snarl_record.end_node_id + ) { + //If this isn't leaving the snarl, + //then add the next node to the queue, along with the distance to traverse it + size_t next_node_length = next_index.first == SnarlDistanceIndex::TEMP_NODE ? 
graph->get_length(next_handle) : + temp_index.get_chain(next_index).min_length; + if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN && + temp_index.get_chain(next_index).chain_components.back() != 0) { + //If there are multiple components, then the chain is not start-end reachable so its length + //is actually infinite + next_node_length = std::numeric_limits::max(); + } + if (next_node_length != std::numeric_limits::max()) { + queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_length), + make_pair(next_index, next_rev))); + } + } + if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { + size_t loop_distance = next_rev ? temp_index.get_chain(next_index).backward_loops.back() + : temp_index.get_chain(next_index).forward_loops.front(); + if (loop_distance != std::numeric_limits::max() && + visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && graph->get_id(next_handle) != temp_snarl_record.start_node_id && graph->get_id(next_handle) != temp_snarl_record.end_node_id ) { - //If this isn't leaving the snarl, - //then add the next node to the queue, along with the distance to traverse it - size_t next_node_length = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_length(next_handle) : - temp_index.temp_chain_records[next_index.second].min_length; - if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN && - temp_index.temp_chain_records[next_index.second].chain_components.back() != 0) { - //If there are multiple components, then the chain is not start-end reachable so its length - //is actually infinite - next_node_length = std::numeric_limits::max(); - } - if (next_node_length != std::numeric_limits::max()) { - queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_length), - make_pair(next_index, next_rev))); - } - } - if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { - size_t loop_distance = next_rev ? 
temp_index.temp_chain_records[next_index.second].backward_loops.back() - : temp_index.temp_chain_records[next_index.second].forward_loops.front(); - if (loop_distance != std::numeric_limits::max() && - visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && - graph->get_id(next_handle) != temp_snarl_record.start_node_id && - graph->get_id(next_handle) != temp_snarl_record.end_node_id - ) { - //If the next node can loop back on itself, then add the next node in the opposite direction - size_t next_node_len = loop_distance + 2 * graph->get_length(next_handle); - queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_len), - make_pair(next_index, !next_rev))); - } + //If the next node can loop back on itself, then add the next node in the opposite direction + size_t next_node_len = loop_distance + 2 * graph->get_length(next_handle); + queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_len), + make_pair(next_index, !next_rev))); } + } #ifdef debug_distance_indexing - cerr << " reached child " << temp_index.structure_start_end_as_string(next_index) << "going " - << (next_rev ? "rev" : "fd") << " with distance " << current_distance << " for ranks " << start_rank << " " << next_rank << endl; + cerr << " reached child " << temp_index.structure_start_end_as_string(next_index) << " going " + << (next_rev ? 
"rev" : "fd") << " with distance " << current_distance << " for ranks " << start_rank << " " << next_rank << endl; #endif - }); - } - if (is_internal_node && reachable_node_count != 1) { - //If this is an internal node, then it must have only one edge for it to be a simple snarl - temp_snarl_record.is_simple = false; - } + }); + } + if (is_internal_node && reachable_node_count != 1) { + //If this is an internal node, then it must have only one edge for it to be a simple snarl + temp_snarl_record.is_simple = false; } + } - /** Check the minimum length of the snarl passing through this node **/ - if (start_rank != 0 && start_rank != 1) { - - size_t child_max_length = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).node_length - : temp_index.temp_chain_records.at(start_index.second).max_length; - //The distance through the whole snarl traversing this node forwards - //(This might actually be traversing it backwards but it doesn't really matter) - - size_t dist_start_left = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_left_start - : temp_index.temp_chain_records.at(start_index.second).distance_left_start; - size_t dist_end_right = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_right_end - : temp_index.temp_chain_records.at(start_index.second).distance_right_end; - size_t dist_start_right = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_right_start - : temp_index.temp_chain_records.at(start_index.second).distance_right_start; - size_t dist_end_left = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? 
temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_left_end - : temp_index.temp_chain_records.at(start_index.second).distance_left_end; - - size_t snarl_length_fd = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - dist_start_left, dist_end_right),child_max_length); - //The same thing traversing this node backwards - size_t snarl_length_rev = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - dist_start_right, dist_end_left), child_max_length); - //The max that isn't infinite - size_t max_length = - snarl_length_rev == std::numeric_limits::max() - ? snarl_length_fd - : (snarl_length_fd == std::numeric_limits::max() - ? snarl_length_rev - : std::max(snarl_length_rev, snarl_length_fd)); - if (max_length != std::numeric_limits::max()) { - temp_snarl_record.max_length = std::max(temp_snarl_record.max_length, max_length); - } - if ( temp_snarl_record.is_simple && - ! ((dist_start_left == 0 && dist_end_right == 0 && dist_end_left == std::numeric_limits::max() && dist_start_right == std::numeric_limits::max() ) || - (dist_start_left == std::numeric_limits::max() && dist_end_right == std::numeric_limits::max() && dist_end_left == 0 && dist_start_right == 0 ))){ - //If the snarl is simple, double check that this node is actually simple: that it can only be traversed going - //across the nsarl - temp_snarl_record.is_simple = false; - } + /** Check the minimum length of the snarl passing through this node **/ + if (start_rank != 0 && start_rank != 1) { + + size_t child_max_length = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(start_index).node_length + : temp_index.get_chain(start_index).max_length; + //The distance through the whole snarl traversing this node forwards + //(This might actually be traversing it backwards but it doesn't really matter) + + size_t dist_start_left = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? 
temp_index.get_node(start_index).distance_left_start + : temp_index.get_chain(start_index).distance_left_start; + size_t dist_end_right = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(start_index).distance_right_end + : temp_index.get_chain(start_index).distance_right_end; + size_t dist_start_right = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(start_index).distance_right_start + : temp_index.get_chain(start_index).distance_right_start; + size_t dist_end_left = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(start_index).distance_left_end + : temp_index.get_chain(start_index).distance_left_end; + + size_t snarl_length_fd = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + dist_start_left, dist_end_right),child_max_length); + //The same thing traversing this node backwards + size_t snarl_length_rev = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + dist_start_right, dist_end_left), child_max_length); + //The max that isn't infinite + size_t max_length = + snarl_length_rev == std::numeric_limits::max() + ? snarl_length_fd + : (snarl_length_fd == std::numeric_limits::max() + ? snarl_length_rev + : std::max(snarl_length_rev, snarl_length_fd)); + if (max_length != std::numeric_limits::max()) { + temp_snarl_record.max_length = std::max(temp_snarl_record.max_length, max_length); + } + if ( temp_snarl_record.is_simple && + ! 
((dist_start_left == 0 && dist_end_right == 0 && dist_end_left == std::numeric_limits::max() && dist_start_right == std::numeric_limits::max() ) || + (dist_start_left == std::numeric_limits::max() && dist_end_right == std::numeric_limits::max() && dist_end_left == 0 && dist_start_right == 0 ))){ + //If the snarl is simple, double check that this node is actually simple: that it can only be traversed going + //across the nsarl + temp_snarl_record.is_simple = false; } } +} +bool check_regularity(const SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph) { +#ifdef debug_distance_indexing + std::cerr << "Check if snarl " << temp_snarl_record.start_node_id << " to " << temp_snarl_record.end_node_id << " with " << all_children.size() << " children is regular" << std::endl; +#endif - //If this is a simple snarl (one with only single nodes that connect to the start and end nodes), then - // we want to remember if the child nodes are reversed + if (temp_snarl_record.is_root_snarl) { + // Roots can't be regular. +#ifdef debug_distance_indexing + std::cerr << "Snarl is not regular because it is a root snarl." << std::endl; +#endif + return false; + } if (temp_snarl_record.is_simple) { - for (size_t i = 0 ; i < temp_snarl_record.node_count ; i++) { - //Get the index of the child - const pair& child_index = temp_snarl_record.children[i]; - //Which is a node + // Simple snarls are always also regular. #ifdef debug_distance_indexing - assert(child_index.first == SnarlDistanceIndex::TEMP_NODE); + std::cerr << "Snarl is regular because it is simple." 
<< std::endl; #endif + return true; + } - //And get the record - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = - temp_index.temp_node_records[child_index.second-temp_index.min_node_id]; - size_t rank =temp_node_record.rank_in_parent; + // Get the snarl boundary nodes, facing out + handle_t start_out = graph->get_handle(temp_snarl_record.start_node_id, !temp_snarl_record.start_node_rev); + handle_t end_out = graph->get_handle(temp_snarl_record.end_node_id, temp_snarl_record.end_node_rev); + + // Define accessors to get bounding graph handles for children, facing out. + auto child_start_out = [&](const SnarlDistanceIndex::temp_record_ref_t& child_index) { + return child_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(child_index.second, true) : + graph->get_handle( + temp_index.get_chain(child_index).start_node_id, + !temp_index.get_chain(child_index).start_node_rev + ); + }; + auto child_end_out = [&](const SnarlDistanceIndex::temp_record_ref_t& child_index) { + return child_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(child_index.second, false) : + graph->get_handle( + temp_index.get_chain(child_index).end_node_id, + temp_index.get_chain(child_index).end_node_rev + ); + }; - + for (const SnarlDistanceIndex::temp_record_ref_t& child_index : all_children) { + // We should only have nodes and chains as children + assert(child_index.first == SnarlDistanceIndex::TEMP_NODE + || child_index.first == SnarlDistanceIndex::TEMP_CHAIN); + if (child_index.first == SnarlDistanceIndex::TEMP_NODE + && (child_index.second == temp_snarl_record.start_node_id + || child_index.second == temp_snarl_record.end_node_id)) { + // Don't think about children for the snarl bounds now; we handle the bounds later. 
+ continue; + } - //Set the orientation of this node in the simple snarl - temp_node_record.reversed_in_parent = temp_node_record.distance_left_start == std::numeric_limits::max(); + // Have we seen the snarl start? + bool saw_start = false; + // Have we seen the snarl end? + bool saw_end = false; + // Have we seen anything else, or a duplicate snarl boundary? + bool saw_other = false; + + auto handle_destination = [&](const handle_t& next_handle) { +#ifdef debug_distance_indexing + std::cerr << "\tConnects to " << graph->get_id(next_handle) << (graph->get_is_reverse(next_handle) ? "-" : "+") << std::endl; +#endif + + // Every edge out the end the child must go to a snarl boundary out + // that hasn't been reached yet. + if (next_handle == start_out && !saw_start) { + saw_start = true; +#ifdef debug_distance_indexing + std::cerr << "\t\tThis is a new connection to snarl start" << std::endl; +#endif + return true; + } else if (next_handle == end_out && !saw_end) { + saw_end = true; +#ifdef debug_distance_indexing + std::cerr << "\t\tThis is a new connection to snarl end" << std::endl; +#endif + return true; + } else { + saw_other = true; + // We don't care if we have an edge going the right way because + // we found an edge going the wrong way. +#ifdef debug_distance_indexing + std::cerr << "\t\tThis is an unwanted connection!" << std::endl; +#endif + return false; + } + }; + + // Check the edges off the child start + handle_t here = child_start_out(child_index); +#ifdef debug_distance_indexing + std::cerr << "Look right from " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") << std::endl; +#endif + graph->follow_edges(here, false, handle_destination); + + if (saw_other || !(saw_start != saw_end)) { + // We have an edge we shouldn't, or we don't connect to exactly one boundary. 
+#ifdef debug_distance_indexing + std::cerr << "\tWe must not be regular" << std::endl; +#endif + return false; + } + + // Check the edges off the child end + here = child_end_out(child_index); +#ifdef debug_distance_indexing + std::cerr << "Look right from " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") << std::endl; +#endif + graph->follow_edges(here, false, handle_destination); + + if (saw_other || !saw_start || !saw_end) { + // We have an edge we shouldn't, or we haven't reached both + // boundaries exactly once across the two ends of the child. +#ifdef debug_distance_indexing + std::cerr << "\tWe must not be regular" << std::endl; +#endif + return false; + } + + if (child_index.first == SnarlDistanceIndex::TEMP_CHAIN) { + // If a child is a chain, check it for loops +#ifdef debug_distance_indexing + std::cerr << "Check child chain for loops." << std::endl; +#endif + const SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(child_index); +#ifdef debug_distance_indexing + std::cerr << "Forward loops:"; + for (auto& l : temp_chain_record.forward_loops) { + std::cerr << " " << l; + } + std::cerr << std::endl; +#endif + if (!temp_chain_record.forward_loops.empty() && temp_chain_record.forward_loops.front() != std::numeric_limits::max()) { + // There's a forward loop in this child chain, so the snarl's not regular. +#ifdef debug_distance_indexing + std::cerr << "We are not regular because there's a forward loop in this child chain." << std::endl; +#endif + return false; + } + +#ifdef debug_distance_indexing + std::cerr << "Backward loops:"; + for (auto& l : temp_chain_record.backward_loops) { + std::cerr << " " << l; + } + std::cerr << std::endl; +#endif + + if (!temp_chain_record.backward_loops.empty() && temp_chain_record.backward_loops.back() != std::numeric_limits::max()) { + // There's a backward loop in this child chain, so the snarl's not regular. 
+#ifdef debug_distance_indexing + std::cerr << "We are not regular because there's a backward loop in this child chain." << std::endl; +#endif + return false; + } } } - //Now that the distances are filled in, predict the size of the snarl in the index - temp_index.max_index_size += temp_snarl_record.get_max_record_length(); - if (temp_snarl_record.is_simple) { - temp_index.max_index_size -= (temp_snarl_record.children.size() * SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord::get_max_record_length()); + // Now we know the children are fine; check for disallowed edges between + // the sentinels. + + handle_t start_in = graph->flip(start_out); + if (graph->has_edge(start_in, start_out)) { +#ifdef debug_distance_indexing + std::cerr << "We are not regular because we have a start-start loop." << std::endl; +#endif + return false; } - // For simple snarl records, need 11 + 11 + number of bits for the number of children - temp_index.max_bits = std::max(temp_index.max_bits, 22 + SnarlDistanceIndex::bit_width(temp_snarl_record.children.size())); + handle_t end_in = graph->flip(end_out); + if (graph->has_edge(end_in, end_out)) { +#ifdef debug_distance_indexing + std::cerr << "We are not regular because we have an end-end loop." << std::endl; +#endif + return false; + } + + // If we don't have any disallowed edges, and we don't have any children + // without the exact right connectivity, we must be regular. + + // We don't make sure we actually had any children. + +#ifdef debug_distance_indexing + std::cerr << "We are a regular snarl." 
<< std::endl; +#endif + + return true; } + //Given an alignment to a graph and a range, find the set of nodes in the //graph for which the minimum distance from the position to any position //in the node is within the given distance range @@ -1541,6 +1943,7 @@ cerr << "Start positon: "<< start_pos << endl; while (!distance_index.is_root(parent)) { #ifdef debug_subgraph cerr << "At child " << distance_index.net_handle_as_string(current_net) << " with distances " << current_distance_left << " " << current_distance_right << endl; + cerr << "Parent is " << distance_index.net_handle_as_string(parent) << " at offset " << SnarlDistanceIndex::get_record_offset(parent) << endl; #endif size_t max_parent_length = distance_index.maximum_length(parent); @@ -1568,7 +1971,7 @@ cerr << "Start positon: "<< start_pos << endl; if (distance_index.is_snarl(parent)) { //If this is the child of a snarl, then just traverse from the end of the node #ifdef debug_subgraph -cerr << "Start search in parent " << distance_index.net_handle_as_string(parent); + cerr << "Start search in parent " << distance_index.net_handle_as_string(parent); #endif if (current_distance_left != std::numeric_limits::max() ){ //If we can go left @@ -1615,7 +2018,7 @@ cerr << "Start search in parent " << distance_index.net_handle_as_string(parent) #endif } else { #ifdef debug_subgraph -cerr << "Start search along parent chain " << distance_index.net_handle_as_string(parent); + cerr << "Start search along parent chain " << distance_index.net_handle_as_string(parent); #endif //If this is the child of a chain, then traverse along the chain if (current_distance_left != std::numeric_limits::max()) { @@ -1630,6 +2033,9 @@ cerr << "Start search along parent chain " << distance_index.net_handle_as_strin subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); return; } else if (distance_index.is_snarl(parent)){ +#ifdef debug_subgraph + cerr << 
"Parent is a snarl of handle type " << SnarlDistanceIndex::get_handle_type(parent) << " at offset " << SnarlDistanceIndex::get_record_offset(parent) << endl; +#endif //TODO: This might be overkill. It prevents us from adding nodes that shouldn't be in the subgraph, but might be too slow //If we don't check the other direction, go through the loop and add everything whose distance is lower than the minimum //to seen_nodes @@ -1661,6 +2067,9 @@ cerr << "Start search along parent chain " << distance_index.net_handle_as_strin }); } } else if (distance_index.is_chain(parent)) { +#ifdef debug_subgraph + cerr << "Parent is a chain of handle type " << SnarlDistanceIndex::get_handle_type(parent) << " at offset " << SnarlDistanceIndex::get_record_offset(parent) << endl; +#endif //TODO: This is probably also overkill - walk a chain if there is a viable loop size_t distance_loop_right = distance_index.distance_in_parent(parent, current_net, current_net, super_graph, max_distance); size_t distance_loop_left = distance_index.distance_in_parent(parent, distance_index.flip(current_net), distance_index.flip(current_net), super_graph, max_distance); diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 43268d4b23..e502b9aa12 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -2,6 +2,7 @@ #define VG_SNARL_DISTANCE_HPP_INCLUDED #include +#include #include "snarls.hpp" #include #include "hash_map.hpp" @@ -36,7 +37,7 @@ void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGrap /// Fill in the temporary snarl record with distances void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, - pair snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ; + SnarlDistanceIndex::temp_record_ref_t snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ; SnarlDistanceIndex::TemporaryDistanceIndex 
make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances); diff --git a/src/snarls.cpp b/src/snarls.cpp index abaa507681..004021d6aa 100644 --- a/src/snarls.cpp +++ b/src/snarls.cpp @@ -10,6 +10,7 @@ #include "snarls.hpp" #include "vg/io/json2pb.h" #include "subgraph_overlay.hpp" +#include "crash.hpp" namespace vg { @@ -20,7 +21,7 @@ SnarlManager SnarlFinder::find_snarls_parallel() { } HandleGraphSnarlFinder::HandleGraphSnarlFinder(const HandleGraph* graph) : graph(graph) { - // Nothing to do! + crash_unless(graph != nullptr); } SnarlManager HandleGraphSnarlFinder::find_snarls_unindexed() { diff --git a/src/subcommand/bench_dist_query_main.cpp b/src/subcommand/bench_dist_query_main.cpp new file mode 100644 index 0000000000..3303ad1146 --- /dev/null +++ b/src/subcommand/bench_dist_query_main.cpp @@ -0,0 +1,233 @@ +/** \file bench_dist_query_main.cpp + * + * Defines the "vg bench-dist-query" subcommand, which benchmarks distance query speed across multiple indexes. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "subcommand.hpp" + +#include "../benchmark.hpp" +#include "../version.hpp" + +#include "../snarl_distance_index.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../algorithms/gfa_to_handle.hpp" +#include +#include +#include "../gbwtgraph_helper.hpp" + + + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_bench_dist_query(char** argv) { + std::cerr << "usage: " << argv[0] << " bench-dist-query -g -d [-d ...] 
[options] >report.tsv" << endl + << "options:" << endl + << " -g, --graph FILE path to input GBZ graph file" << endl + << " -d, --dist FILE path to distance index file (repeatable)" << endl + << " -q, --numQueries N number of queries to run (default: 10000)" << endl + << " -s, --save-queries FILE save generated queries to FILE for reproducibility" << endl + << " -Q, --load-queries FILE load queries from FILE instead of generating new ones" << endl + << " -p, --progress show progress" << endl + << " -h, --help print this help message to stderr and exit" << endl; +} + + +int main_bench_dist_query(int argc, char** argv) { + bool show_progress = false; + + string graph_path = ""; + vector dist_paths; + int num_queries = 10000; + string save_queries_path = ""; + string load_queries_path = ""; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"progress", no_argument, 0, 'p'}, + {"help", no_argument, 0, 'h'}, + {"graph", required_argument, 0, 'g'}, + {"dist", required_argument, 0, 'd'}, + {"numQueries", required_argument, 0, 'q'}, + {"save-queries", required_argument, 0, 's'}, + {"load-queries", required_argument, 0, 'Q'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long(argc, argv, "g:d:q:s:Q:ph?", + long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'g': + graph_path = optarg; + break; + case 'd': + dist_paths.push_back(optarg); + break; + case 'q': + { + num_queries = stoi(optarg); + } + break; + case 's': + save_queries_path = optarg; + break; + case 'Q': + load_queries_path = optarg; + break; + case 'p': + show_progress = true; + break; + case 'h': + case '?': + help_bench_dist_query(argv); + exit(1); + break; + default: + abort(); + } + } + + if (graph_path.empty()) { + cerr << "error: a GBZ graph file is required (-g)" << endl; + help_bench_dist_query(argv); + exit(1); + } + + if (dist_paths.empty()) { + cerr << "error: at least one 
distance index file is required (-d)" << endl; + help_bench_dist_query(argv); + exit(1); + } + + // Load GBZ graph + if (show_progress) { + cerr << "Loading GBZ graph from " << graph_path << "..." << endl; + } + gbwtgraph::GBZ gbz; + load_gbz(gbz, graph_path, show_progress); + const HandleGraph& graph = gbz.graph; + cerr << "Loaded graph with " << graph.get_node_count() << " nodes" << endl; + + // Collect all node IDs + vector all_node_ids; + graph.for_each_handle([&](handle_t h) { + all_node_ids.push_back(graph.get_id(h)); + }); + + using QueryEntry = pair, tuple>; + vector queries; + + if (!load_queries_path.empty()) { + if (show_progress) { + cerr << "Loading queries from " << load_queries_path << "..." << endl; + } + ifstream qf(load_queries_path); + if (!qf) { + cerr << "error: cannot open query file: " << load_queries_path << endl; + exit(1); + } + string line; + while (getline(qf, line)) { + if (line.empty()) continue; + istringstream iss(line); + nid_t id1, id2; int rev1, rev2; size_t off1, off2; + iss >> id1 >> rev1 >> off1 >> id2 >> rev2 >> off2; + QueryEntry q; + q.first = make_tuple(id1, (bool)rev1, off1); + q.second = make_tuple(id2, (bool)rev2, off2); + queries.push_back(q); + } + cerr << "Loaded " << queries.size() << " queries from " << load_queries_path << endl; + } else { + if (show_progress) { + cerr << "Generating " << num_queries << " queries..." << endl; + } + queries.resize(num_queries); + for (auto& query : queries) { + nid_t node1 = all_node_ids[rand() % all_node_ids.size()]; + nid_t node2 = all_node_ids[rand() % all_node_ids.size()]; + size_t len1 = graph.get_length(graph.get_handle(node1)); + size_t len2 = graph.get_length(graph.get_handle(node2)); + query.first = make_tuple(node1, rand() % 2 == 1, len1 > 0 ? rand() % len1 : 0); + query.second = make_tuple(node2, rand() % 2 == 1, len2 > 0 ? 
rand() % len2 : 0); + } + cerr << "Generated " << queries.size() << " queries" << endl; + } + + if (!save_queries_path.empty()) { + ofstream qf(save_queries_path); + if (!qf) { + cerr << "error: cannot open save file: " << save_queries_path << endl; + exit(1); + } + for (auto& query : queries) { + auto& [id1, rev1, off1] = query.first; + auto& [id2, rev2, off2] = query.second; + qf << id1 << "\t" << (int)rev1 << "\t" << off1 << "\t" + << id2 << "\t" << (int)rev2 << "\t" << off2 << "\n"; + } + cerr << "Saved " << queries.size() << " queries to " << save_queries_path << endl; + } + + // Output header + cout << "dist_index\tavg_query_us" << endl; + + // Benchmark each distance index + for (const auto& dist_path : dist_paths) { + if (show_progress) { + cerr << "Loading distance index from " << dist_path << "..." << endl; + } + SnarlDistanceIndex distance_index; + distance_index.deserialize(dist_path); + cerr << "Loaded distance index from " << dist_path << endl; + + // Pull the whole index into the OS page cache so timings reflect + // unavoidable query cost, not avoidable first-touch I/O + distance_index.preload(true); + + // Time all queries + auto start = chrono::high_resolution_clock::now(); + for (auto& query : queries) { + auto& [node1, node1_rev, node1_offset] = query.first; + auto& [node2, node2_rev, node2_offset] = query.second; + distance_index.minimum_distance( + node1, node1_rev, node1_offset, + node2, node2_rev, node2_offset, + false, + nullptr + ); + } + auto end = chrono::high_resolution_clock::now(); + + double total_us = chrono::duration(end - start).count(); + double avg_us = total_us / queries.size(); + + filesystem::path dist_fs_path(dist_path); + cout << dist_fs_path.filename().string() << "\t" << avg_us << endl; + cerr << dist_path << ": avg query time = " << avg_us << " us" << endl; + } + + return 0; +} + +// Register subcommand +static Subcommand vg_bench_dist_query("bench-dist-query", "benchmark distance query speed across multiple indexes", 
DEVELOPMENT, main_bench_dist_query); diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 70f83bc17e..6388981cf2 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -801,7 +801,7 @@ int main_call(int argc, char** argv) { unique_ptr alignment_emitter; if (gaf_output) { - alignment_emitter = vg::io::get_non_hts_alignment_emitter("-", "GAF", {}, get_thread_count(), graph); + alignment_emitter = vg::io::get_non_hts_alignment_emitter("-", "GAF", {}, vg::get_thread_count(), graph); // TODO: There should be a general function for emitting headers. See giraffe_main.cpp. io::GafAlignmentEmitter* gaf_emitter = dynamic_cast(alignment_emitter.get()); if (gbz_graph.get() != nullptr && gaf_emitter != nullptr) { diff --git a/src/subcommand/gampcompare_main.cpp b/src/subcommand/gampcompare_main.cpp index 01a5d59717..96bcd85ec0 100644 --- a/src/subcommand/gampcompare_main.cpp +++ b/src/subcommand/gampcompare_main.cpp @@ -215,8 +215,13 @@ int main_gampcompare(int argc, char** argv) { for (size_t j = 0; j < path_mapped_positions.size(); ++j) { if (path_true_positions[i].second == path_mapped_positions[j].second) { // there is a pair of positions on the same strand of the same path - abs_dist = min(abs_dist, - std::abs(static_cast(path_true_positions[i].first) - static_cast(path_mapped_positions[j].first))); + abs_dist = min( + abs_dist, + std::abs( + static_cast(path_true_positions[i].first) - + static_cast(path_mapped_positions[j].first) + ) + ); } } } diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp index f0fa15c20f..4bffd2b852 100644 --- a/src/subcommand/haplotypes_main.cpp +++ b/src/subcommand/haplotypes_main.cpp @@ -965,7 +965,7 @@ void validate_error_sequence(const Logger& logger, size_t chain_id, size_t subch } std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_t to) { - hash_set visited; + vg::hash_set visited; handle_t curr = from; while (curr != to) { if 
(visited.find(curr) != visited.end()) { @@ -989,7 +989,7 @@ std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_ // Returns true if the path from (start, offset) reaches the end without revisiting start or leaving the subchain. // The path may continue in subsequent fragments. bool trace_path( - const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, const hash_set& subchain_nodes, + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, const vg::hash_set& subchain_nodes, gbwt::size_type sequence_id, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end ) { gbwt::edge_type pos(start, offset); @@ -1132,8 +1132,8 @@ void validate_chain(const Logger& logger, // Sequences: normal subchains. if (subchain.type == Haplotypes::Subchain::normal) { std::vector da = r_index.decompressDA(subchain.start); - hash_set nodes = extract_subchain(graph, gbwtgraph::GBWTGraph::node_to_handle(subchain.start), gbwtgraph::GBWTGraph::node_to_handle(subchain.end)); - hash_set selected; + vg::hash_set nodes = extract_subchain(graph, gbwtgraph::GBWTGraph::node_to_handle(subchain.start), gbwtgraph::GBWTGraph::node_to_handle(subchain.end)); + vg::hash_set selected; for (size_t i = 0; i < da.size(); i++) { if (trace_path(*(graph.index), fragment_map, nodes, da[i], subchain.start, i, subchain.end)) { selected.insert(Haplotypes::sequence_type(da[i], i)); @@ -1159,7 +1159,7 @@ void validate_chain(const Logger& logger, std::string message = expected_got(da.size(), subchain.sequences.size()) + " sequences (prefix / suffix)"; validate_error_subchain(logger, chain_id, subchain_id, message); } - hash_set truth; + vg::hash_set truth; for (size_t i = 0; i < da.size(); i++) { truth.insert({ da[i], i }); } @@ -1180,7 +1180,7 @@ void validate_chain(const Logger& logger, // Kmers. 
if (subchain.type != Haplotypes::Subchain::full_haplotype) { - hash_set all_kmers; + vg::hash_set all_kmers; for (size_t i = 0; i < subchain.kmers.size(); i++) { all_kmers.insert(subchain.kmers[i]); } @@ -1188,14 +1188,14 @@ void validate_chain(const Logger& logger, std::string message = expected_got(subchain.kmers.size(), all_kmers.size()) + " kmers"; validate_error_subchain(logger, chain_id, subchain_id, message); } - hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) - hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) + vg::hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) + vg::hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) for (size_t i = 0; i < subchain.sequences.size(); i++) { std::vector haplotype = get_haplotype( graph, fragment_map, subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k() ); - hash_map unique_minimizers; // (kmer, used in the sequence) + vg::hash_map unique_minimizers; // (kmer, used in the sequence) for (const std::string& sequence : haplotype) { auto minimizers = minimizer_index.minimizers(sequence); for (auto& minimizer : minimizers) { @@ -1322,7 +1322,7 @@ void validate_haplotypes(const Logger& logger, if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { logger.info() << "Validating kmer specificity" << std::endl; } - hash_map> kmers; + vg::hash_map> kmers; size_t collisions = 0, total_kmers = 0; for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { const Haplotypes::TopLevelChain& chain = haplotypes.chains[chain_id]; diff --git a/src/subcommand/inject_main.cpp b/src/subcommand/inject_main.cpp index 268bbeeac0..de8ce07ed7 100644 --- a/src/subcommand/inject_main.cpp +++ b/src/subcommand/inject_main.cpp @@ -138,7 +138,7 @@ int main_inject(int argc, char** argv) { set_crash_context(aln.name()); if (add_identity) { // 
Calculate & save identity statistic - aln.set_identity(identity(aln.path())); + aln.set_identity(vg::identity(aln.path())); } if (rescore) { // Rescore the alignment diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index f2f69da695..755c45dcfd 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -94,7 +94,6 @@ int main_minimizer(int argc, char** argv) { logger.info() << "Loading SnarlDistanceIndex from " << config.distance_name << std::endl; } distance_index = vg::io::VPKG::load_one(config.distance_name); - distance_index->preload(true); } ZipCodeCollection oversized_zipcodes; diff --git a/src/subcommand/pack_main.cpp b/src/subcommand/pack_main.cpp index 8d6d7155a9..10146cd2ff 100644 --- a/src/subcommand/pack_main.cpp +++ b/src/subcommand/pack_main.cpp @@ -200,7 +200,7 @@ int main_pack(int argc, char** argv) { // use some naive heuristics to come up with bin count and batch size based on thread count // more bins: finer grained parallelism at cost of more mutexes and allocations // bigger batch size: more robustness to sorted input at cost of less parallelism - size_t num_threads = get_thread_count(); + size_t num_threads = vg::get_thread_count(); size_t batch_size = Packer::estimate_batch_size(num_threads); size_t bin_count = Packer::estimate_bin_count(num_threads); diff --git a/src/unittest/banded_global_aligner.cpp b/src/unittest/banded_global_aligner.cpp index 045e9bfa97..6b5fb4b3c8 100644 --- a/src/unittest/banded_global_aligner.cpp +++ b/src/unittest/banded_global_aligner.cpp @@ -10,7 +10,7 @@ #include "vg.hpp" #include "path.hpp" #include "banded_global_aligner.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "bdsg/hash_graph.hpp" #include "../algorithms/pad_band.hpp" @@ -3515,10 +3515,9 @@ namespace vg { SECTION( "Banded global aligner does not produce empty edits when there is an insertion an empty node") { string graph_json = R"({"edge": [{"to_end": true, 
"from_start": true, "to": 22, "from": 20}, {"to": 26, "from": 20}, {"to": 24, "from": 20}, {"to_end": true, "from_start": true, "to": 26, "from": 4}, {"to_end": true, "from_start": true, "to": 24, "from": 4}], "node": [{"sequence": "C", "id": 24}, {"sequence": "GAGA", "id": 20}, {"sequence": "T", "id": 26}, {"sequence": "GGAGTCT", "id": 4}, {"id": 22}]})"; - - Graph graph; - json2pb(graph, graph_json.c_str(), graph_json.size()); - VG vg_graph(graph); + + bdsg::HashGraph vg_graph; + vg::io::json2graph(graph_json, &vg_graph); TestAligner aligner_source; const Aligner& aligner = *aligner_source.get_regular_aligner(); diff --git a/src/unittest/cactus.cpp b/src/unittest/cactus.cpp index 7447ee247d..5e518db4ef 100644 --- a/src/unittest/cactus.cpp +++ b/src/unittest/cactus.cpp @@ -5,8 +5,9 @@ #include #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../cactus.hpp" +#include #include "catch.hpp" namespace vg { @@ -14,9 +15,7 @@ namespace unittest { using namespace std; TEST_CASE("We can convert a two-tailed graph to Cactus", "[cactus]") { - - VG graph; - + string graph_json = R"( {"node":[{"sequence":"GT","id":7575}, {"sequence":"TGTTAACAGCACAACATTTA","id":7580}, @@ -25,20 +24,18 @@ TEST_CASE("We can convert a two-tailed graph to Cactus", "[cactus]") { "edge":[{"from":7575,"to":7580,"from_start":true}, {"from":7575,"to":7576}]} )"; - - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - // Make sure we can make a Cactus graph and get something out. + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + // Make sure we can make a Cactus graph and get something out. auto cactusified = cactusify(graph); REQUIRE(cactusified.is_valid()); } TEST_CASE("We can convert a hairpin graph to Cactus", "[cactus]") { - VG graph; - + // Here's a graph where only the left side of node 2 is dangling, and the right side of node 1 has a self loop. 
string graph_json = R"( {"node": [{"sequence": "A", "id": 1}, @@ -46,12 +43,11 @@ TEST_CASE("We can convert a hairpin graph to Cactus", "[cactus]") { "edge": [{"from": 2, "to": 1}, {"from": 1, "to": 1, "to_end": true}]} )"; - - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - // Make sure we can make a Cactus graph and get something out. + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + // Make sure we can make a Cactus graph and get something out. auto cactusified = cactusify(graph); REQUIRE(cactusified.is_valid()); } diff --git a/src/unittest/chunker.cpp b/src/unittest/chunker.cpp index 24f7d3b645..3be2298c15 100644 --- a/src/unittest/chunker.cpp +++ b/src/unittest/chunker.cpp @@ -7,6 +7,8 @@ #include "vg.hpp" #include "xg.hpp" #include "path.hpp" +#include "../io/json2graph.hpp" +#include namespace vg { namespace unittest { @@ -83,13 +85,13 @@ TEST_CASE("basic graph chunking", "[chunk]") { )"; - // Load it into Protobuf - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - + // Load the graph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Pass it over to XG xg::XG index; - index.from_path_handle_graph(VG(chunk)); + index.from_path_handle_graph(graph); PathChunker chunker(&index); diff --git a/src/unittest/copy_graph.cpp b/src/unittest/copy_graph.cpp index 581b683130..4e7e878075 100644 --- a/src/unittest/copy_graph.cpp +++ b/src/unittest/copy_graph.cpp @@ -1,6 +1,7 @@ #include "catch.hpp" #include "../handle.hpp" #include "../vg.hpp" +#include "../io/json2graph.hpp" #include "xg.hpp" #include "bdsg/packed_graph.hpp" @@ -53,14 +54,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_handle_graph(&xg, &vg); - + 
REQUIRE(xg.get_node_count() == 1); REQUIRE(vg.get_node_count() == 1); } @@ -72,14 +74,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_handle_graph(&xg, &pg); - + REQUIRE(xg.get_node_count() == 1); REQUIRE(pg.get_node_count() == 1); } @@ -91,14 +94,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_handle_graph(&xg, &hg); - + REQUIRE(xg.get_node_count() == 1); REQUIRE(hg.get_node_count() == 1); } @@ -120,19 +124,20 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_handle_graph(&xg, &vg); - + REQUIRE(xg.get_node_count() == 4); REQUIRE(vg.get_node_count() == 4); REQUIRE(vg.edge_count() == 4); REQUIRE(vg.length() == 16); - + } TEST_CASE( "copy_handle_graph converter works on graphs with one reversing edge, xg to pg", "[handle][pg][xg]") { string graph_json = R"( @@ -151,14 +156,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_handle_graph(&xg, &pg); - + REQUIRE(xg.get_node_count() == 4); REQUIRE(pg.get_node_count() == 4); 
@@ -168,14 +174,14 @@ namespace vg { return true; }); REQUIRE(length == 16); - + int edge_count = 0; pg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 4); - + } TEST_CASE( "copy_handle_graph converter works on graphs with one reversing edge, xg to hg", "[handle][hg][xg]") { string graph_json = R"( @@ -194,14 +200,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_handle_graph(&xg, &hg); - + REQUIRE(xg.get_node_count() == 4); REQUIRE(hg.get_node_count() == 4); int length = 0; @@ -210,14 +217,14 @@ namespace vg { return true; }); REQUIRE(length == 16); - + int edge_count = 0; hg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 4); - + } TEST_CASE( "copy_handle_graph converter works on graphs with reversing edges and loops", "[handle][vg][xg]") { string graph_json = R"( @@ -239,14 +246,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_handle_graph(&xg, &vg); - + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); @@ -274,26 +282,27 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_handle_graph(&xg, &pg); - + 
REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(pg.get_node_count() == 4); - + int length = 0; pg.for_each_handle([&](const handle_t& here) { length += pg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; pg.for_each_edge([&](const edge_t& edge) { edge_count += 1; @@ -321,26 +330,27 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_handle_graph(&xg, &hg); - + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(hg.get_node_count() == 4); - + int length = 0; hg.for_each_handle([&](const handle_t& here) { length += hg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; hg.for_each_edge([&](const edge_t& edge) { edge_count += 1; @@ -382,16 +392,17 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_path_handle_graph(&xg, &vg); - - + + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); @@ -444,37 +455,38 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; 
handlealgs::copy_path_handle_graph(&xg, &pg); - - - + + + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(pg.get_node_count() == 4); - + int length = 0; pg.for_each_handle([&](const handle_t& here) { length += pg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; pg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 7); - - + + REQUIRE(pg.has_path("path1") == true); REQUIRE(pg.has_path("path2") == true); REQUIRE(pg.get_path_count() == 2); @@ -521,37 +533,38 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_path_handle_graph(&xg, &hg); - - - + + + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(hg.get_node_count() == 4); - - + + int length = 0; hg.for_each_handle([&](const handle_t& here) { length += hg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; hg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 7); - - + + REQUIRE(hg.has_path("path1") == true); REQUIRE(hg.has_path("path2") == true); REQUIRE(hg.get_path_count() == 2); diff --git a/src/unittest/dijkstra.cpp b/src/unittest/dijkstra.cpp index 2608567153..4e94414040 100644 --- a/src/unittest/dijkstra.cpp +++ b/src/unittest/dijkstra.cpp @@ -6,7 +6,7 @@ #include #include #include "../handle.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../vg.hpp" #include "catch.hpp" @@ -125,14 +125,12 @@ TEST_CASE("Dijkstra search handles early stopping correctly", "[dijkstra][algori 
TEST_CASE("Dijkstra search works on a particular problem graph", "[dijkstra][algorithms]") { string graph_json = R"( -{"node":[{"sequence":"A","id":"2454530"},{"sequence":"AGTGCTGGAGAGGATGTGGAGAAATAGGAAC","id":"2454529"},{"sequence":"C","id":"2454532"},{"sequence":"TTTTACACTGTTGGTGGGACTGTAAA","id":"2454533"},{"sequence":"A","id":"2454527"},{"sequence":"C","id":"2454528"},{"sequence":"G","id":"2454531"},{"sequence":"C","id":"2454534"},{"sequence":"T","id":"2454535"},{"sequence":"GGGTAATAA","id":"2454526"},{"sequence":"TAGTTCAACCATTGTGGAAGACTGTGGCAATT","id":"2454536"}],"edge":[{"from":"2454530","to":"2454532"},{"from":"2454530","to":"2454533"},{"from":"2454529","to":"2454530"},{"from":"2454529","to":"2454531"},{"from":"2454532","to":"2454533"},{"from":"2454533","to":"2454534"},{"from":"2454533","to":"2454535"},{"from":"2454527","to":"2454529"},{"from":"2454528","to":"2454529"},{"from":"2454531","to":"2454532"},{"from":"2454531","to":"2454533"},{"from":"2454534","to":"2454536"},{"from":"2454535","to":"2454536"},{"from":"2454526","to":"2454527"},{"from":"2454526","to":"2454528"}],"path":[{"name":"21","mapping":[{"position":{"node_id":"2454526"},"edit":[{"from_length":9,"to_length":9}],"rank":"3049077"},{"position":{"node_id":"2454528"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049078"},{"position":{"node_id":"2454529"},"edit":[{"from_length":31,"to_length":31}],"rank":"3049079"},{"position":{"node_id":"2454531"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049080"},{"position":{"node_id":"2454532"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049081"},{"position":{"node_id":"2454533"},"edit":[{"from_length":26,"to_length":26}],"rank":"3049082"},{"position":{"node_id":"2454535"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049083"},{"position":{"node_id":"2454536"},"edit":[{"from_length":32,"to_length":32}],"rank":"3049084"}]}]} 
+{"node":[{"sequence":"A","id":"2454530"},{"sequence":"AGTGCTGGAGAGGATGTGGAGAAATAGGAAC","id":"2454529"},{"sequence":"C","id":"2454532"},{"sequence":"TTTTACACTGTTGGTGGGACTGTAAA","id":"2454533"},{"sequence":"A","id":"2454527"},{"sequence":"C","id":"2454528"},{"sequence":"G","id":"2454531"},{"sequence":"C","id":"2454534"},{"sequence":"T","id":"2454535"},{"sequence":"GGGTAATAA","id":"2454526"},{"sequence":"TAGTTCAACCATTGTGGAAGACTGTGGCAATT","id":"2454536"}],"edge":[{"from":"2454530","to":"2454532"},{"from":"2454530","to":"2454533"},{"from":"2454529","to":"2454530"},{"from":"2454529","to":"2454531"},{"from":"2454532","to":"2454533"},{"from":"2454533","to":"2454534"},{"from":"2454533","to":"2454535"},{"from":"2454527","to":"2454529"},{"from":"2454528","to":"2454529"},{"from":"2454531","to":"2454532"},{"from":"2454531","to":"2454533"},{"from":"2454534","to":"2454536"},{"from":"2454535","to":"2454536"},{"from":"2454526","to":"2454527"},{"from":"2454526","to":"2454528"}],"path":[{"name":"21","mapping":[{"position":{"node_id":"2454526"},"edit":[{"from_length":9,"to_length":9}],"rank":"3049077"},{"position":{"node_id":"2454528"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049078"},{"position":{"node_id":"2454529"},"edit":[{"from_length":31,"to_length":31}],"rank":"3049079"},{"position":{"node_id":"2454531"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049080"},{"position":{"node_id":"2454532"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049081"},{"position":{"node_id":"2454533"},"edit":[{"from_length":26,"to_length":26}],"rank":"3049082"},{"position":{"node_id":"2454535"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049083"},{"position":{"node_id":"2454536"},"edit":[{"from_length":32,"to_length":32}],"rank":"3049084"}]}]} )"; - - Graph g; - json2pb(g, graph_json); - - // Wrap the graph in a HandleGraph - VG graph(g); + + // Load the graph + HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Decide where to start handle_t start = 
graph.get_handle(2454536, true); diff --git a/src/unittest/gbwt_extender.cpp b/src/unittest/gbwt_extender.cpp index ec61517cfa..5d25bf956e 100644 --- a/src/unittest/gbwt_extender.cpp +++ b/src/unittest/gbwt_extender.cpp @@ -5,7 +5,7 @@ #include "../gbwt_extender.hpp" #include "../gbwt_helper.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../utility.hpp" #include "../vg.hpp" @@ -90,10 +90,9 @@ gbwt::GBWT build_gbwt_index() { // Build a GBWTGraph using the provided GBWT index. gbwtgraph::GBWTGraph build_gbwt_graph(const gbwt::GBWT& gbwt_index) { - Graph graph; - json2pb(graph, gapless_extender_graph.c_str(), gapless_extender_graph.size()); - VG vg_graph(graph); - return gbwtgraph::GBWTGraph(gbwt_index, vg_graph, nullptr); + bdsg::HashGraph graph; + vg::io::json2graph(gapless_extender_graph, &graph); + return gbwtgraph::GBWTGraph(gbwt_index, graph, nullptr); } void same_position(const Position& pos, const Position& correct) { diff --git a/src/unittest/genotypekit.cpp b/src/unittest/genotypekit.cpp index af9bc2a4d8..b5d460c59a 100644 --- a/src/unittest/genotypekit.cpp +++ b/src/unittest/genotypekit.cpp @@ -10,6 +10,8 @@ #include "../traversal_finder.hpp" #include "xg.hpp" #include "../haplotype_extracter.hpp" +#include "../io/json2graph.hpp" +#include namespace Catch { @@ -62,10 +64,10 @@ namespace vg { namespace unittest { TEST_CASE("sites can be found with Cactus", "[genotype]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -90,7 +92,7 @@ TEST_CASE("sites can be found with Cactus", "[genotype]") { {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, "to": 9} - + ], "path": [ {"name": "hint", "mapping": [ @@ -101,14 +103,13 @@ TEST_CASE("sites can be found with Cactus", "[genotype]") { ]} ] } - + )"; - + // Make an actual graph + // Note: Using VG here because the test uses VG-specific methods like get_node() and get_edge() VG graph; - Graph chunk; - json2pb(chunk, 
graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Make a CactusSnarlFinder unique_ptr finder(new CactusSnarlFinder(graph)); @@ -196,10 +197,10 @@ TEST_CASE("sites can be found with Cactus", "[genotype]") { } TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -224,7 +225,7 @@ TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integ {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, "to": 9} - + ], "path": [ {"name": "hint", "mapping": [ @@ -235,14 +236,13 @@ TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integ ]} ] } - + )"; - + // Make an actual graph + // Note: Using VG here because the test uses VG-specific methods like get_node() and get_edge() VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -329,7 +329,7 @@ TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integ } TEST_CASE("IntegratedSnarlFinder works when cactus graph contains back-to-back cycles along root path", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -351,17 +351,15 @@ TEST_CASE("IntegratedSnarlFinder works when cactus graph contains back-to-back c {"from": 3, "to": 5}, {"from": 4, "to": 6}, {"from": 5, "to": 6} - + ] } )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -375,18 +373,16 @@ TEST_CASE("IntegratedSnarlFinder works when cactus 
graph contains back-to-back c } TEST_CASE("IntegratedSnarlFinder works on an all bridge edge Y graph with specific numbering", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( {"node":[{"id":"2","sequence":"G"},{"id":"3","sequence":"G"},{"id":"4","sequence":"G"},{"id":"5","sequence":"G"},{"id":"6","sequence":"G"},{"id":"11","sequence":"G"}], - "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} + "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -403,18 +399,16 @@ TEST_CASE("IntegratedSnarlFinder works on an all bridge edge Y graph with specif } TEST_CASE("IntegratedSnarlFinder roots correctly an all bridge edge Y graph with winning longest path", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( {"node":[{"id":"2","sequence":"G"},{"id":"3","sequence":"G"},{"id":"4","sequence":"GG"},{"id":"5","sequence":"G"},{"id":"6","sequence":"G"},{"id":"11","sequence":"GG"}], - "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} + "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -452,7 +446,7 @@ TEST_CASE("IntegratedSnarlFinder roots 
correctly an all bridge edge Y graph with } TEST_CASE("IntegratedSnarlFinder works when cactus graph contains longer back-to-back cycles along root path", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -482,17 +476,15 @@ TEST_CASE("IntegratedSnarlFinder works when cactus graph contains longer back-to {"from": 32, "to": 5}, {"from": 4, "to": 6}, {"from": 5, "to": 6} - + ] } )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -506,50 +498,48 @@ TEST_CASE("IntegratedSnarlFinder works when cactus graph contains longer back-to } TEST_CASE("IntegratedSnarlFinder works on a complex bundle-y region with a nested snarl", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( {"edge": [{"from": "129672", "to": "129673"}, - {"from": "129662", "to": "129663"}, - {"from": "129662", "to": "129664"}, - {"from": "129664", "to": "129665"}, - {"from": "129664", "to": "129666"}, - {"from": "129666", "to": "129668"}, - {"from": "129666", "to": "129669"}, - {"from": "129666", "to": "129667"}, - {"from": "129667", "to": "129668"}, - {"from": "129667", "to": "129669"}, - {"from": "129669", "to": "129670"}, - {"from": "129669", "to": "129673"}, - {"from": "129671", "to": "129672"}, - {"from": "129668", "to": "129670"}, - {"from": "129668", "to": "129673"}, - {"from": "129665", "to": "129668"}, - {"from": "129665", "to": "129669"}, - {"from": "129665", "to": "129667"}, - {"from": "129670", "to": "129671"}, - {"from": "129670", "to": "129672"}, - {"from": "129663", "to": "129665"}, - {"from": "129663", "to": "129666"}], - "node": [{"id": "129672", "sequence": "AT"}, - {"id": "129662", "sequence": "CAGGTCAAACTGTGAT"}, - {"id": "129664", "sequence": "T"}, - {"id": 
"129666", "sequence": "T"}, - {"id": "129667", "sequence": "G"}, - {"id": "129669", "sequence": "G"}, - {"id": "129671", "sequence": "T"}, - {"id": "129668", "sequence": "A"}, - {"id": "129665", "sequence": "A"}, - {"id": "129670", "sequence": "A"}, - {"id": "129673", "sequence": "ATATATATATACTTATTGTAAAAATCTTTAGA"}, + {"from": "129662", "to": "129663"}, + {"from": "129662", "to": "129664"}, + {"from": "129664", "to": "129665"}, + {"from": "129664", "to": "129666"}, + {"from": "129666", "to": "129668"}, + {"from": "129666", "to": "129669"}, + {"from": "129666", "to": "129667"}, + {"from": "129667", "to": "129668"}, + {"from": "129667", "to": "129669"}, + {"from": "129669", "to": "129670"}, + {"from": "129669", "to": "129673"}, + {"from": "129671", "to": "129672"}, + {"from": "129668", "to": "129670"}, + {"from": "129668", "to": "129673"}, + {"from": "129665", "to": "129668"}, + {"from": "129665", "to": "129669"}, + {"from": "129665", "to": "129667"}, + {"from": "129670", "to": "129671"}, + {"from": "129670", "to": "129672"}, + {"from": "129663", "to": "129665"}, + {"from": "129663", "to": "129666"}], + "node": [{"id": "129672", "sequence": "AT"}, + {"id": "129662", "sequence": "CAGGTCAAACTGTGAT"}, + {"id": "129664", "sequence": "T"}, + {"id": "129666", "sequence": "T"}, + {"id": "129667", "sequence": "G"}, + {"id": "129669", "sequence": "G"}, + {"id": "129671", "sequence": "T"}, + {"id": "129668", "sequence": "A"}, + {"id": "129665", "sequence": "A"}, + {"id": "129670", "sequence": "A"}, + {"id": "129673", "sequence": "ATATATATATACTTATTGTAAAAATCTTTAGA"}, {"id": "129663", "sequence": "G"}]} )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -579,23 +569,21 @@ TEST_CASE("IntegratedSnarlFinder works on a complex bundle-y region with a 
neste } TEST_CASE("CactusSnarlFinder safely handles a single node graph", "[genotype][cactus-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "GATTACA"} ] } - + )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a CactusSnarlFinder unique_ptr finder(new CactusSnarlFinder(graph)); @@ -607,15 +595,13 @@ TEST_CASE("CactusSnarlFinder safely handles a single node graph", "[genotype][ca } TEST_CASE("IntegratedSnarlFinder safely handles a completely empty graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = "{}"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -625,7 +611,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a completely empty graph", "[gen } TEST_CASE("IntegratedSnarlFinder safely handles a single node graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -638,10 +624,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node graph", "[genotype )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -651,7 +635,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node graph", "[genotype } TEST_CASE("IntegratedSnarlFinder produces all the correct types of single-node chains", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string 
graph_json = R"( @@ -673,10 +657,8 @@ TEST_CASE("IntegratedSnarlFinder produces all the correct types of single-node c )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder IntegratedSnarlFinder finder(graph); @@ -736,7 +718,7 @@ TEST_CASE("IntegratedSnarlFinder produces all the correct types of single-node c } TEST_CASE("IntegratedSnarlFinder safely handles a path when forced to root at one end", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -757,10 +739,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a path when forced to root at on )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -770,7 +750,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a path when forced to root at on } TEST_CASE("IntegratedSnarlFinder safely handles a single node connected component in a larger graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -787,10 +767,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node connected componen )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -813,7 +791,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node connected componen } TEST_CASE("IntegratedSnarlFinder safely handles a single node cycle", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph 
const string graph_json = R"( @@ -828,10 +806,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node cycle", "[genotype )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -844,7 +820,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node cycle", "[genotype } TEST_CASE("IntegratedSnarlFinder safely handles a totally connected graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -866,10 +842,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a totally connected graph", "[ge )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -882,7 +856,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a totally connected graph", "[ge } TEST_CASE("IntegratedSnarlFinder prefers to root at a bridge edge path in a tie", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -903,10 +877,8 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a bridge edge path in a tie" )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -935,7 +907,7 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a bridge edge path in a tie" } TEST_CASE("IntegratedSnarlFinder prefers to root at a cycle that is 1 bp longer", "[genotype][integrated-snarl-finder]") { - + // Build a toy 
graph const string graph_json = R"( @@ -956,10 +928,8 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a cycle that is 1 bp longer" )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -988,7 +958,7 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a cycle that is 1 bp longer" } TEST_CASE("IntegratedSnarlFinder prefers to root at a chain with an up-weighted node", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -1009,10 +979,8 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a chain with an up-weighted )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder that adds 10 bp to node 4's apparent length unique_ptr finder(new IntegratedSnarlFinder(graph, {{4, 10}})); @@ -1041,7 +1009,7 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a chain with an up-weighted } TEST_CASE("IntegratedSnarlFinder sees tips as disqualifying ultrabubbles", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -1066,10 +1034,8 @@ TEST_CASE("IntegratedSnarlFinder sees tips as disqualifying ultrabubbles", "[gen )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -1098,10 +1064,10 @@ TEST_CASE("IntegratedSnarlFinder sees tips as disqualifying ultrabubbles", "[gen } TEST_CASE("CactusSnarlFinder throws an error instead of crashing when 
the graph has no edges", "[genotype][cactus-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -1115,14 +1081,12 @@ TEST_CASE("CactusSnarlFinder throws an error instead of crashing when the graph {"id": 9, "sequence": "A"} ] } - + )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a CactusSnarlFinder unique_ptr finder(new CactusSnarlFinder(graph)); @@ -1183,7 +1147,7 @@ TEST_CASE("fixed priors can be assigned to genotypes", "[genotype]") { TEST_CASE("TrivialTraversalFinder can find traversals", "[genotype]") { // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -1208,7 +1172,7 @@ TEST_CASE("TrivialTraversalFinder can find traversals", "[genotype]") { {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, "to": 9} - + ], "path": [ {"name": "hint", "mapping": [ @@ -1219,14 +1183,12 @@ TEST_CASE("TrivialTraversalFinder can find traversals", "[genotype]") { ]} ] } - + )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a site Snarl site; @@ -1329,12 +1291,10 @@ TEST_CASE("CactusSnarlFinder can differentiate ultrabubbles from snarls", "[geno ] } )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1381,10 +1341,8 @@ TEST_CASE("CactusSnarlFinder can differentiate ultrabubbles from snarls", "[geno )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + 
bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1454,10 +1412,8 @@ TEST_CASE("IntegratedSnarlFinder can differentiate ultrabubbles from snarls", "[ )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls IntegratedSnarlFinder cubs(graph); @@ -1504,10 +1460,8 @@ TEST_CASE("IntegratedSnarlFinder can differentiate ultrabubbles from snarls", "[ )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls IntegratedSnarlFinder cubs(graph); @@ -1581,11 +1535,9 @@ TEST_CASE("RepresentativeTraversalFinder finds traversals correctly", "[genotype } )"; - // Make an actual graph + // Load the graph. Needs to be a vg because we will give it to a SupportAugmentedGraph later. VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1713,11 +1665,9 @@ TEST_CASE("RepresentativeTraversalFinder finds traversals of simple inversions", } )"; - // Make an actual graph + // Load the graph. Needs to be a vg because we will give it to a SupportAugmentedGraph later. 
VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1774,11 +1724,11 @@ TEST_CASE("GBWTTraversalFinder finds traversals for GBWT threads", "[genotype][g string graph_json = R"({"node": [{"id": 1, "sequence": "CAAATAAGGCTT"}, {"id": 2, "sequence": "G"}, {"id": 3, "sequence": "GGAAATTTTC"}, {"id": 4, "sequence": "C"}, {"id": 5, "sequence": "TGGAGTTCTATTATATTCC"}, {"id": 6, "sequence": "G"}, {"id": 7, "sequence": "A"}, {"id": 8, "sequence": "ACTCTCTGGTTCCTG"}, {"id": 9, "sequence": "A"}, {"id": 10, "sequence": "G"}, {"id": 11, "sequence": "TGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCA"}], "edge": [{"from": 1, "to": 2}, {"from": 1, "to": 3}, {"from": 2, "to": 3}, {"from": 3, "to": 4}, {"from": 3, "to": 5}, {"from": 4, "to": 5}, {"from": 5, "to": 6}, {"from": 5, "to": 7}, {"from": 6, "to": 8}, {"from": 7, "to": 8}, {"from": 8, "to": 9}, {"from": 8, "to": 10}, {"from": 9, "to": 11}, {"from": 10, "to": 11}]})"; // Load the JSON - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(vg::VG(proto_graph)); + xg_index.from_path_handle_graph(graph); gbwt::Verbosity::set(gbwt::Verbosity::SILENT); diff --git a/src/unittest/genotyper.cpp b/src/unittest/genotyper.cpp index e2e9f7a142..4228b16ee3 100644 --- a/src/unittest/genotyper.cpp +++ b/src/unittest/genotyper.cpp @@ -7,6 +7,7 @@ #include "../snarls.hpp" #include "../cactus_snarl_finder.hpp" #include "../traversal_finder.hpp" +#include "../io/json2graph.hpp" namespace vg { namespace unittest { @@ -41,15 +42,6 @@ TEST_CASE("traversals can be found from reads", "[genotyper]") { {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, 
"to": 9} - - ], - "path": [ - {"name": "hint", "mapping": [ - {"position": {"node_id": 1}, "rank" : 1 }, - {"position": {"node_id": 6}, "rank" : 2 }, - {"position": {"node_id": 8}, "rank" : 3 }, - {"position": {"node_id": 9}, "rank" : 4 } - ]} ] } @@ -57,9 +49,7 @@ TEST_CASE("traversals can be found from reads", "[genotyper]") { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Find the snarls SnarlManager manager = CactusSnarlFinder(graph).find_snarls(); diff --git a/src/unittest/haplotypes.cpp b/src/unittest/haplotypes.cpp index e441bbe197..9e4e04475b 100644 --- a/src/unittest/haplotypes.cpp +++ b/src/unittest/haplotypes.cpp @@ -4,8 +4,10 @@ #include "catch.hpp" #include "haplotypes.hpp" +#include "../io/json2graph.hpp" #include "xg.hpp" #include "vg.hpp" +#include #include @@ -66,7 +68,7 @@ TEST_CASE("We can represent appropriate graphs according to linear reference", " )"; thread_t SNP_thread = {tm[1], tm[3], tm[4]}; - + string del_graph_json = R"( {"node":[ {"id":1,"sequence":"AAA"}, @@ -89,22 +91,24 @@ TEST_CASE("We can represent appropriate graphs according to linear reference", " ]} ]} )"; - + thread_t del_ref_thread = {tm[1], tm[2], tm[4]}; thread_t del_thread = {tm[1], tm[4]}; - - vg::Graph SNP_proto_graph; - json2pb(SNP_proto_graph, SNP_graph_json.c_str(), SNP_graph_json.size()); + + // Build the SNP graph + bdsg::HashGraph SNP_graph; + vg::io::json2graph(SNP_graph_json, &SNP_graph); // Build the xg index xg::XG SNP_xg_index; - SNP_xg_index.from_path_handle_graph(vg::VG(SNP_proto_graph)); + SNP_xg_index.from_path_handle_graph(SNP_graph); vg::path_handle_t SNP_ref_path_handle = SNP_xg_index.get_path_handle("reference"); - - vg::Graph del_proto_graph; - json2pb(del_proto_graph, del_graph_json.c_str(), del_graph_json.size()); + + // Build the del graph + bdsg::HashGraph del_graph; + vg::io::json2graph(del_graph_json, &del_graph); // 
Build the xg index xg::XG del_xg_index; - del_xg_index.from_path_handle_graph(vg::VG(del_proto_graph)); + del_xg_index.from_path_handle_graph(del_graph); vg::path_handle_t del_ref_path_handle = del_xg_index.get_path_handle("reference"); // NEGATIVE SNVs @@ -159,18 +163,20 @@ TEST_CASE("We can represent appropriate graphs according to linear reference", " thread_t double_thread = {tm[1], tm[2], tm[4]}; - vg::Graph long_proto_graph; - json2pb(long_proto_graph, long_graph_json.c_str(), long_graph_json.size()); + // Build the long graph + bdsg::HashGraph long_graph; + vg::io::json2graph(long_graph_json, &long_graph); // Build the xg index xg::XG long_xg_index; - long_xg_index.from_path_handle_graph(vg::VG(long_proto_graph)); + long_xg_index.from_path_handle_graph(long_graph); vg::path_handle_t long_ref_path_handle = long_xg_index.get_path_handle("reference"); - - vg::Graph double_proto_graph; - json2pb(double_proto_graph, double_graph_json.c_str(), double_graph_json.size()); + + // Build the double graph + bdsg::HashGraph double_graph; + vg::io::json2graph(double_graph_json, &double_graph); // Build the xg index xg::XG double_xg_index; - double_xg_index.from_path_handle_graph(vg::VG(double_proto_graph)); + double_xg_index.from_path_handle_graph(double_graph); vg::path_handle_t double_ref_path_handle = double_xg_index.get_path_handle("reference"); string matching_test_file = "matching_test.slls"; @@ -382,13 +388,13 @@ TEST_CASE("We can score haplotypes using GBWT", "[haplo-score][gbwt]") { TEST_CASE("We can recognize a required crossover", "[hapo-score][gbwt]") { // This graph is the start of xy2 from test/small string graph_json = R"({"node": [{"id": 1, "sequence": "CAAATAAGGCTT"}, {"id": 2, "sequence": "G"}, {"id": 3, "sequence": "GGAAATTTTC"}, {"id": 4, "sequence": "C"}, {"id": 5, "sequence": "TGGAGTTCTATTATATTCC"}, {"id": 6, "sequence": "G"}, {"id": 7, "sequence": "A"}, {"id": 8, "sequence": "ACTCTCTGGTTCCTG"}, {"id": 9, "sequence": "A"}, {"id": 10, "sequence": 
"G"}, {"id": 11, "sequence": "TGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCA"}], "edge": [{"from": 1, "to": 2}, {"from": 1, "to": 3}, {"from": 2, "to": 3}, {"from": 3, "to": 4}, {"from": 3, "to": 5}, {"from": 4, "to": 5}, {"from": 5, "to": 6}, {"from": 5, "to": 7}, {"from": 6, "to": 8}, {"from": 7, "to": 8}, {"from": 8, "to": 9}, {"from": 8, "to": 10}, {"from": 9, "to": 11}, {"from": 10, "to": 11}]})"; - - // Load the JSON - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(vg::VG(proto_graph)); + xg_index.from_path_handle_graph(graph); gbwt::Verbosity::set(gbwt::Verbosity::SILENT); gbwt::DynamicGBWT* gbwt_index = new gbwt::DynamicGBWT; diff --git a/src/unittest/indexed_vg.cpp b/src/unittest/indexed_vg.cpp index 7f74d92193..27504dea9f 100644 --- a/src/unittest/indexed_vg.cpp +++ b/src/unittest/indexed_vg.cpp @@ -40,7 +40,7 @@ TEST_CASE("An IndexedVG can be created for a single node", "[handle][indexed-vg] ] })"; - // Load the JSON + // Load the JSON to Protobuf specifically. 
Graph proto_graph; json2pb(proto_graph, graph_json.c_str(), graph_json.size()); diff --git a/src/unittest/mapper.cpp b/src/unittest/mapper.cpp index 2caf42d076..17f81fe17b 100644 --- a/src/unittest/mapper.cpp +++ b/src/unittest/mapper.cpp @@ -1,9 +1,10 @@ /// \file mapper.cpp -/// +/// /// unit tests for the mapper #include #include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include #include #include "../mapper.hpp" @@ -25,14 +26,10 @@ TEST_CASE( "Mapper can map to a one-node graph", "[mapping][mapper]" ) { ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -245,14 +242,10 @@ TEST_CASE( "Mapper finds optimal mapping for read starting with node-border MEM" {"position":{"node_id":1444},"rank":1059}, {"position":{"node_id":1445},"rank":1060}]}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -311,14 +304,10 @@ TEST_CASE( "Mapper can annotate positions correctly on both strands", "[mapper][ ]} ]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); diff --git a/src/unittest/minimizer_mapper.cpp b/src/unittest/minimizer_mapper.cpp index 3ecd5de147..84628276e4 100644 --- a/src/unittest/minimizer_mapper.cpp +++ b/src/unittest/minimizer_mapper.cpp @@ -3,8 +3,8 @@ /// unit tests for the 
minimizer mapper #include -#include "vg/io/json2pb.h" #include "../io/json2graph.hpp" +#include #include #include "../minimizer_mapper.hpp" #include "../build_index.hpp" @@ -450,15 +450,13 @@ TEST_CASE("MinimizerMapper can map an empty string between odd points", "[giraff {"id": "55511925", "sequence": "CTTCCTTCC"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence(""); - + pos_t left_anchor {55511921, false, 5}; // This is on the final base of the node pos_t right_anchor {55511925, false, 6}; @@ -480,7 +478,7 @@ TEST_CASE("MinimizerMapper can map an empty string between odd points", "[giraff TEST_CASE("MinimizerMapper can map with an initial deletion", "[giraffe][mapping][right_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [ {"from": "1", "to": "2"}, @@ -492,12 +490,10 @@ TEST_CASE("MinimizerMapper can map with an initial deletion", "[giraffe][mapping {"id": "3", "sequence": "CATTAG"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence("CATTAG"); @@ -527,7 +523,7 @@ TEST_CASE("MinimizerMapper can map with an initial deletion", "[giraffe][mapping TEST_CASE("MinimizerMapper can map with an initial deletion on a multi-base node", "[giraffe][mapping][right_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [ {"from": "1", "to": "2"}, @@ -539,12 +535,10 @@ TEST_CASE("MinimizerMapper can map with an initial deletion on a multi-base node {"id": "3", "sequence": "CATTAG"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), 
graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence("CATTAG"); @@ -574,7 +568,7 @@ TEST_CASE("MinimizerMapper can map with an initial deletion on a multi-base node TEST_CASE("MinimizerMapper can map right off the past-the-end base", "[giraffe][mapping][right_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [ {"from": "1", "to": "2"}, @@ -586,15 +580,13 @@ TEST_CASE("MinimizerMapper can map right off the past-the-end base", "[giraffe][ {"id": "3", "sequence": "CATTAG"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence("CATTAG"); - + pos_t left_anchor {1, false, 1}; // This is the past-end position pos_t right_anchor = empty_pos_t(); @@ -635,15 +627,13 @@ TEST_CASE("MinimizerMapper can compute longest detectable gap in range", "[giraf TEST_CASE("MinimizerMapper can find a significant indel instead of a tempting softclip", "[giraffe][mapping][left_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [{"from": "30788083", "to": "30788088"}, {"from": "30788083", "to": "30788084"}, {"from": "30788074", "to": "30788075"}, {"from": "30788074", "to": "30788076"}, {"from": "30788079", "to": "30788080"}, {"from": "30788079", "to": "30788081"}, {"from": "30788086", "to": "30788088"}, {"from": "30788086", "to": "30788087", "to_end": true}, {"from": "30788075", "to": "30788077"}, {"from": "30788073", "to": "30788074"}, {"from": "30788078", "to": "30788079"}, {"from": "30788077", "to": "30788078"}, {"from": "30788084", "to": "30788088"}, {"from": "30788084", "to": "30788085"}, {"from": "30788076", "to": "30788077"}, {"from": "30788087", "from_start": true, "to": "30788088"}, {"from": "30788081", "to": "30788082"}, {"from": 
"30788080", "to": "30788082"}, {"from": "30788082", "to": "30788088"}, {"from": "30788082", "to": "30788083"}, {"from": "30788085", "to": "30788086"}], "node": [{"id": "30788083", "sequence": "AAA"}, {"id": "30788074", "sequence": "AAAAAAAATACAAAAAATTAGC"}, {"id": "30788079", "sequence": "CGCCACTGCACTCCAGCCTGGGC"}, {"id": "30788086", "sequence": "AAAAAAA"}, {"id": "30788075", "sequence": "T"}, {"id": "30788073", "sequence": "GAAAGAGAGTTGTTTAAATTCCATAGTTAGGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCCGTCTCTACTA"}, {"id": "30788078", "sequence": "G"}, {"id": "30788077", "sequence": "GGGCGTGGTAGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATC"}, {"id": "30788084", "sequence": "A"}, {"id": "30788088", "sequence": "AATTCCATAGTTAGAAAAATAAGACATATCAGGTTTTCAAAAAGTGTAGCCATTTTCTGTTTCTAAAAGGGACACTTAAAGTGAAA"}, {"id": "30788076", "sequence": "C"}, {"id": "30788087", "sequence": "T"}, {"id": "30788081", "sequence": "A"}, {"id": "30788080", "sequence": "G"}, {"id": "30788082", "sequence": "ACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAA"}, {"id": "30788085", "sequence": "AA"}] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); Alignment aln; aln.set_sequence("TTGAAAACCTGATATGTCTTATTTTTCTAACTATGGAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCTACCACGCCCGGCTAATTTTTTGTATTTTTTTT"); @@ -854,9 +844,8 @@ TEST_CASE("MinimizerMapper can extract a strand-split dagified local graph witho {"id": "60245278", "sequence": "GATTACAGATTACA"}] } )"; - vg::Graph graph_chunk; - json2pb(graph_chunk, graph_json.c_str(), graph_json.size()); - vg::VG graph(graph_chunk); + bdsg::HashGraph graph; + 
vg::io::json2graph(graph_json, &graph); TestMinimizerMapper::with_dagified_local_graph(make_pos_t(60245283, false, 10), empty_pos_t(), 50, graph, [&](DeletableHandleGraph& dagified_graph, const handle_t& left_anchor_handle, const handle_t& right_anchor_handle, const std::function(const handle_t&)>& dagified_handle_to_base) { // The graph started as a stick diff --git a/src/unittest/multipath_alignment_graph.cpp b/src/unittest/multipath_alignment_graph.cpp index bea5f687aa..d78e19d6f1 100644 --- a/src/unittest/multipath_alignment_graph.cpp +++ b/src/unittest/multipath_alignment_graph.cpp @@ -3,7 +3,8 @@ /// unit tests for the multipath mapper's MultipathAlignmentGraph #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include #include "../cactus_snarl_finder.hpp" #include "../integrated_snarl_finder.hpp" @@ -47,13 +48,9 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG vg; - vg.extend(proto_graph); - + bdsg::HashGraph vg; + ::vg::io::json2graph(graph_json, &vg); + // Make snarls on it CactusSnarlFinder bubble_finder(vg); IntegratedSnarlFinder snarl_finder(vg); diff --git a/src/unittest/multipath_mapper.cpp b/src/unittest/multipath_mapper.cpp index be6d3b6194..bc1dc4cdd9 100644 --- a/src/unittest/multipath_mapper.cpp +++ b/src/unittest/multipath_mapper.cpp @@ -4,7 +4,9 @@ #include #include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include +#include #include "../multipath_mapper.hpp" #include "../build_index.hpp" #include "xg.hpp" @@ -111,7 +113,7 @@ TEST_CASE( "MultipathMapper::read_coverage works", "[multipath][mapping][multipa } TEST_CASE( "MultipathMapper::query_cluster_graphs works", "[multipath][mapping][multipathmapper]" ) { - + string graph_json = R"({ "node": [{"id": 1, "sequence": "GATTACA"}], "path": [ @@ -120,14 +122,10 @@ TEST_CASE( 
"MultipathMapper::query_cluster_graphs works", "[multipath][mapping][ ]} ] })"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -135,17 +133,17 @@ TEST_CASE( "MultipathMapper::query_cluster_graphs works", "[multipath][mapping][ // Make pointers to fill in gcsa::GCSA* gcsaidx = nullptr; gcsa::LCPArray* lcpidx = nullptr; - + // Build the GCSA index build_gcsa_lcp(graph, gcsaidx, lcpidx, 16, 3); - + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); - + xg_index.from_path_handle_graph(graph); + // Make a multipath mapper to map against the graph. TestMultipathMapper mapper(&xg_index, gcsaidx, lcpidx); - + // Make an Alignment that we're pretending we're doing Alignment aln; aln.set_sequence("GATTACA"); @@ -264,7 +262,7 @@ TEST_CASE( "MultipathMapper::query_cluster_graphs works", "[multipath][mapping][ } TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][multipathmapper]" ) { - + string graph_json = R"({ "node": [{"id": 1, "sequence": "GATTACA"}], "path": [ @@ -273,14 +271,10 @@ TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][ ]} ] })"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -291,11 +285,11 @@ TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][ // Build the GCSA index build_gcsa_lcp(graph, gcsaidx, lcpidx, 16, 3); - + // Build the xg index xg::XG 
xg_index; xg_index.from_path_handle_graph(graph); - + // Make a multipath mapper to map against the graph. MultipathMapper mapper(&xg_index, gcsaidx, lcpidx); // Lower the max mapping quality so that it thinks it can find unambiguous mappings of @@ -422,16 +416,12 @@ TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][ } TEST_CASE( "MultipathMapper can work on a bigger graph", "[multipath][mapping][multipathmapper]" ) { - + string graph_json = R"({"node":[{"sequence":"CTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATG","id":12},{"sequence":"A","id":2},{"sequence":"CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTT","id":3},{"sequence":"TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTT","id":4},{"sequence":"CAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTA","id":5},{"sequence":"CTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTT","id":6},{"sequence":"ACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAAC","id":7},{"sequence":"ACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTA","id":8},{"sequence":"AGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAA","id":9},{"sequence":"CAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATC","id":10},{"sequence":"TTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAG","id":11}],"path":[{"name":"x","mapping":[{"position":{"node_id":3},"edit":[{"from_length":100,"to_length":100}],"rank":1},{"position":{"node_id":4},"edit":[{"from_length":100,"to_length":100}],"rank":2},{"position":{"node_id":5},"edit":[{"from_length":100,"to_leng
th":100}],"rank":3},{"position":{"node_id":6},"edit":[{"from_length":100,"to_length":100}],"rank":4},{"position":{"node_id":7},"edit":[{"from_length":100,"to_length":100}],"rank":5},{"position":{"node_id":8},"edit":[{"from_length":100,"to_length":100}],"rank":6},{"position":{"node_id":9},"edit":[{"from_length":100,"to_length":100}],"rank":7},{"position":{"node_id":10},"edit":[{"from_length":100,"to_length":100}],"rank":8},{"position":{"node_id":11},"edit":[{"from_length":100,"to_length":100}],"rank":9},{"position":{"node_id":12},"edit":[{"from_length":100,"to_length":100}],"rank":10},{"position":{"node_id":2},"edit":[{"from_length":1,"to_length":1}],"rank":11}]}],"edge":[{"from":12,"to":2},{"from":3,"to":4},{"from":4,"to":5},{"from":5,"to":6},{"from":6,"to":7},{"from":7,"to":8},{"from":8,"to":9},{"from":9,"to":10},{"from":10,"to":11},{"from":11,"to":12}]})"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -442,11 +432,11 @@ TEST_CASE( "MultipathMapper can work on a bigger graph", "[multipath][mapping][m // Build the GCSA index build_gcsa_lcp(graph, gcsaidx, lcpidx, 16, 3); - + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); - + xg_index.from_path_handle_graph(graph); + // Make a multipath mapper to map against the graph. 
TestMultipathMapper mapper(&xg_index, gcsaidx, lcpidx); // Lower the max mapping quality so that it thinks it can find unambiguous mappings of diff --git a/src/unittest/packed_structs.cpp b/src/unittest/packed_structs.cpp index 9c0075751e..512e638620 100644 --- a/src/unittest/packed_structs.cpp +++ b/src/unittest/packed_structs.cpp @@ -69,7 +69,7 @@ using namespace std; case APPEND: for (size_t k = 0; k < appends_per_op; k++) { std_vec.push_back(next_val); - dyn_vec.append(next_val); + dyn_vec.push_back(next_val); next_val++; } @@ -79,7 +79,7 @@ using namespace std; if (!std_vec.empty()) { for (size_t k = 0; k < pops_per_op; k++) { std_vec.pop_back(); - dyn_vec.pop(); + dyn_vec.pop_back(); } } @@ -161,7 +161,7 @@ using namespace std; case APPEND: for (size_t k = 0; k < appends_per_op; k++) { std_vec.push_back(next_val); - dyn_vec.append(next_val); + dyn_vec.push_back(next_val); next_val = val_distr(prng); } @@ -171,7 +171,7 @@ using namespace std; if (!std_vec.empty()) { for (size_t k = 0; k < pops_per_op; k++) { std_vec.pop_back(); - dyn_vec.pop(); + dyn_vec.pop_back(); } } @@ -252,7 +252,7 @@ using namespace std; case APPEND_LEFT: for (size_t k = 0; k < appends_per_op; k++) { std_deq.push_front(next_val); - suc_deq.append_front(next_val); + suc_deq.push_front(next_val); next_val++; } @@ -269,7 +269,7 @@ using namespace std; case APPEND_RIGHT: for (size_t k = 0; k < appends_per_op; k++) { std_deq.push_back(next_val); - suc_deq.append_back(next_val); + suc_deq.push_back(next_val); next_val++; } diff --git a/src/unittest/path_component_index.cpp b/src/unittest/path_component_index.cpp index 058f4bf9c1..edd3a6013a 100644 --- a/src/unittest/path_component_index.cpp +++ b/src/unittest/path_component_index.cpp @@ -8,7 +8,8 @@ #include "path_component_index.hpp" #include "xg.hpp" #include "vg.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include namespace vg { @@ -17,14 +18,14 @@ namespace unittest { TEST_CASE("Path component memoization 
produces expected results", "[pathcomponent]") { string graph_json = R"({"node": [{"sequence": "AAACCC", "id": 1}, {"sequence": "CACACA", "id": 2}, {"sequence": "CACACA", "id": 3}, {"sequence": "TTTTGG", "id": 4}, {"sequence": "ACGTAC", "id": 5}], "path": [{"name": "one", "mapping": [{"position": {"node_id": 1}, "rank": 1}, {"position": {"node_id": 2}, "rank": 2}]}, {"name": "three", "mapping": [{"position": {"node_id": 2}, "rank": 1}, {"position": {"node_id": 3}, "rank": 2}]}, {"name": "two", "mapping": [{"position": {"node_id": 4}, "rank": 1}, {"position": {"node_id": 5}, "rank": 2}]}], "edge": [{"from": 1, "to": 2}, {"from": 2, "to": 3}, {"from": 4, "to": 5}]})"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(graph); unordered_set comp_1; diff --git a/src/unittest/path_index.cpp b/src/unittest/path_index.cpp index b70152ae2d..e1facc2977 100644 --- a/src/unittest/path_index.cpp +++ b/src/unittest/path_index.cpp @@ -5,9 +5,9 @@ #include #include -#include "vg/io/json2pb.h" -#include +#include "../io/json2graph.hpp" #include "../path_index.hpp" +#include #include "catch.hpp" namespace vg { @@ -58,15 +58,11 @@ const string path_index_graph_1 = R"( TEST_CASE("PathIndex can be created", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -78,13 +74,9 @@ TEST_CASE("PathIndex can be created", "[pathindex]") { TEST_CASE("PathIndex translation can change a node ID", "[pathindex]") { // Load the graph - Graph graph; - json2pb(graph, 
path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -115,15 +107,11 @@ TEST_CASE("PathIndex translation can change a node ID", "[pathindex]") { } TEST_CASE("PathIndex translation can divide a node", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -174,15 +162,11 @@ TEST_CASE("PathIndex translation can divide a node", "[pathindex]") { } TEST_CASE("PathIndex translation can create reverse strand mappings", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -235,15 +219,11 @@ TEST_CASE("PathIndex translation can create reverse strand mappings", "[pathinde } TEST_CASE("PathIndex translation can handle translations articulated for the reverse strand", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -300,15 +280,11 @@ TEST_CASE("PathIndex translation can handle translations articulated for the rev } TEST_CASE("PathIndex translation can divide the last node", "[pathindex]") { - + // Load the 
graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); diff --git a/src/unittest/phase_unfolder.cpp b/src/unittest/phase_unfolder.cpp index 0c79972941..36cfbca9de 100644 --- a/src/unittest/phase_unfolder.cpp +++ b/src/unittest/phase_unfolder.cpp @@ -12,7 +12,8 @@ #include #include "../phase_unfolder.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include "xg.hpp" #include "catch.hpp" @@ -210,10 +211,10 @@ const std::string unfolder_graph_path = R"( TEST_CASE("PhaseUnfolder can unfold XG paths", "[phaseunfolder][indexing]") { // Build an XG index with a path. - Graph graph_with_path; - json2pb(graph_with_path, unfolder_graph_path.c_str(), unfolder_graph_path.size()); + bdsg::HashGraph graph_with_path; + vg::io::json2graph(unfolder_graph_path, &graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_with_path)); + xg_index.from_path_handle_graph(graph_with_path); // Build an empty GBWT index. gbwt::GBWT gbwt_index; @@ -224,9 +225,7 @@ TEST_CASE("PhaseUnfolder can unfold XG paths", "[phaseunfolder][indexing]") { // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. @@ -255,10 +254,10 @@ TEST_CASE("PhaseUnfolder can unfold XG paths", "[phaseunfolder][indexing]") { TEST_CASE("PhaseUnfolder can restore XG paths", "[phaseunfolder][indexing]") { // Build an XG index with a path. 
- Graph graph_with_path; - json2pb(graph_with_path, unfolder_graph_path.c_str(), unfolder_graph_path.size()); + bdsg::HashGraph graph_with_path; + vg::io::json2graph(unfolder_graph_path, &graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_with_path)); + xg_index.from_path_handle_graph(graph_with_path); // Build an empty GBWT index. gbwt::GBWT gbwt_index; @@ -269,9 +268,7 @@ TEST_CASE("PhaseUnfolder can restore XG paths", "[phaseunfolder][indexing]") { // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. @@ -299,10 +296,10 @@ TEST_CASE("PhaseUnfolder can restore XG paths", "[phaseunfolder][indexing]") { TEST_CASE("PhaseUnfolder can unfold GBWT threads", "[phaseunfolder][indexing]") { // Build an XG index without a path. - Graph graph_without_path; - json2pb(graph_without_path, unfolder_graph.c_str(), unfolder_graph.size()); + bdsg::HashGraph graph_without_path; + vg::io::json2graph(unfolder_graph, &graph_without_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_without_path)); + xg_index.from_path_handle_graph(graph_without_path); // Build a GBWT with three threads including a duplicate. We want to have // only one instance of short_path unfolded, but we want separate copies @@ -335,9 +332,7 @@ TEST_CASE("PhaseUnfolder can unfold GBWT threads", "[phaseunfolder][indexing]") // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. 
@@ -366,10 +361,10 @@ TEST_CASE("PhaseUnfolder can unfold GBWT threads", "[phaseunfolder][indexing]") TEST_CASE("PhaseUnfolder can unfold both XG paths and GBWT threads", "[phaseunfolder][indexing]") { // Build an XG index with a path. - Graph graph_with_path; - json2pb(graph_with_path, unfolder_graph_path.c_str(), unfolder_graph_path.size()); + bdsg::HashGraph graph_with_path; + vg::io::json2graph(unfolder_graph_path, &graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_with_path)); + xg_index.from_path_handle_graph(graph_with_path); // Build a GBWT with three threads including a duplicate. We want to have // only one instance of short_path unfolded, but we want separate copies @@ -402,9 +397,7 @@ TEST_CASE("PhaseUnfolder can unfold both XG paths and GBWT threads", "[phaseunfo // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. @@ -501,10 +494,10 @@ const std::string unfolder_graph_simple_path = R"( TEST_CASE("PhaseUnfolder can merge shared prefixes and suffixes", "[phaseunfolder][indexing]") { // Build an XG index. - Graph simple_graph; - json2pb(simple_graph, unfolder_graph_simple.c_str(), unfolder_graph_simple.size()); + bdsg::HashGraph simple_graph; + vg::io::json2graph(unfolder_graph_simple, &simple_graph); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(simple_graph)); + xg_index.from_path_handle_graph(simple_graph); // Build a GBWT with both possible threads. gbwt::vector_type upper_path { @@ -536,9 +529,7 @@ TEST_CASE("PhaseUnfolder can merge shared prefixes and suffixes", "[phaseunfolde // Build a VG graph. 
VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph_simple.c_str(), unfolder_graph_simple.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph_simple, &vg_graph); // Remove the bubble, including its endpoints. std::set to_remove { 3, 4, 5, 6 }; @@ -566,10 +557,10 @@ TEST_CASE("PhaseUnfolder can merge shared prefixes and suffixes", "[phaseunfolde TEST_CASE("PhaseUnfolder can extend short threads", "[phaseunfolder][indexing]") { // Build an XG index. - Graph simple_graph_with_path; - json2pb(simple_graph_with_path, unfolder_graph_simple_path.c_str(), unfolder_graph_simple_path.size()); + bdsg::HashGraph simple_graph_with_path; + vg::io::json2graph(unfolder_graph_simple_path, &simple_graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(simple_graph_with_path)); + xg_index.from_path_handle_graph(simple_graph_with_path); // Build a GBWT for the fragment that is different from the reference. gbwt::vector_type short_fragment { @@ -586,9 +577,7 @@ TEST_CASE("PhaseUnfolder can extend short threads", "[phaseunfolder][indexing]") // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph_simple.c_str(), unfolder_graph_simple.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph_simple, &vg_graph); // Remove the bubble, including its endpoints. std::set to_remove { 3, 4, 5, 6 }; diff --git a/src/unittest/randomly_flipped_nodes.cpp b/src/unittest/randomly_flipped_nodes.cpp new file mode 100644 index 0000000000..455bdd18ae --- /dev/null +++ b/src/unittest/randomly_flipped_nodes.cpp @@ -0,0 +1,179 @@ +#include "catch.hpp" +#include "../handle.hpp" +#include "../utility.hpp" +#include + +#include "support/randomly_flipped_nodes.hpp" +#include "support/randomness.hpp" +#include "support/random_graph.hpp" + +#include +#include + +namespace vg { +namespace unittest { + +/// Get the canonicalized set of edge sequence pairs from a graph. 
+/// Each edge is represented as a pair of sequences (left_seq, right_seq) read +/// in the orientation of the edge. To canonicalize, we compare each pair +/// against its reverse complement (RC(right_seq), RC(left_seq)) and keep the +/// lexicographically smaller one. +/// +/// This doesn't fully constrain the graph, but if this doesn't match what it's +/// supposed to, it can tell us that the graph smells off and is wrong. +static set> canonical_edge_pairs(const HandleGraph& graph) { + set> result; + graph.for_each_edge([&](const edge_t& edge) { + string left_seq = graph.get_sequence(edge.first); + string right_seq = graph.get_sequence(edge.second); + + // The reverse complement pair: RC(right) on the left, RC(left) on the right + string rc_right = reverse_complement(right_seq); + string rc_left = reverse_complement(left_seq); + + pair forward_pair = {left_seq, right_seq}; + pair rc_pair = {rc_right, rc_left}; + + // Use the lexicographically smaller one as canonical + if (rc_pair < forward_pair) { + result.insert(rc_pair); + } else { + result.insert(forward_pair); + } + return true; + }); + return result; +} + +/// Make sure that observed and expected graphs are not obviously not +/// isomorphic. 
+static void validate_graph(const HandleGraph& observed, const HandleGraph& expected, const set>& expected_edges) { + REQUIRE(observed.get_node_count() == expected.get_node_count()); + REQUIRE(observed.get_edge_count() == expected.get_edge_count()); + + auto observed_edges = canonical_edge_pairs(observed); + REQUIRE(observed_edges == expected_edges); +} + +TEST_CASE("randomly_flipped_nodes preserves graph structure on a simple linear graph", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + std::string stick_sequence = "GGACTGACTCGCATGTCGAGCGACTCGCGCGAGCTATCGTAGTACGCGAGTCATATTATATTATCACG"; + size_t node_length = 3; + handle_t prev_handle; + for (size_t i = 0; i < stick_sequence.size(); i += node_length) { + handle_t h = graph.create_handle(stick_sequence.substr(i, node_length)); + if (i > 0) { + graph.create_edge(prev_handle, h); + } + prev_handle = h; + } + + auto original_edges = canonical_edge_pairs(graph); + + SECTION("flipping no nodes preserves edges exactly") { + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 0.0, gen); + validate_graph(flipped, graph, original_edges); + } + + SECTION("flipping all nodes preserves canonical edge pairs") { + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 1.0, gen); + validate_graph(flipped, graph, original_edges); + } + + SECTION("flipping 50% of nodes preserves canonical edge pairs") { + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } +} + +TEST_CASE("randomly_flipped_nodes preserves structure on graph with reversing edges", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("GATT", 1); + handle_t h2 = graph.create_handle("ACA", 2); + handle_t h3 = graph.create_handle("CGAT", 3); + handle_t h4 = graph.create_handle("TCGAA", 4); + + // Forward edges + graph.create_edge(h1, h2); 
+ graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + // Reversing edge: 4 fwd -> 3 rev + graph.create_edge(h4, graph.flip(h3)); + + auto original_edges = canonical_edge_pairs(graph); + + default_random_engine gen(test_seed_source()); + for (int i = 0; i < 10; i++) { + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } +} + +TEST_CASE("randomly_flipped_nodes preserves structure on graph with self-loops", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGT", 1); + handle_t h2 = graph.create_handle("TTCC", 2); + + graph.create_edge(h1, h2); + // Self-loop on h1: fwd -> fwd + graph.create_edge(h1, h1); + // Inverting self-loop on h2: fwd -> rev + graph.create_edge(h2, graph.flip(h2)); + + auto original_edges = canonical_edge_pairs(graph); + + default_random_engine gen(test_seed_source()); + for (int i = 0; i < 10; i++) { + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } +} + +TEST_CASE("randomly_flipped_nodes preserves structure on random graphs", "[randomly_flipped_nodes]") { + for (int trial = 0; trial < 5; trial++) { + bdsg::HashGraph graph; + random_graph(100, 10, 10, &graph); + + auto original_edges = canonical_edge_pairs(graph); + + default_random_engine gen(test_seed_source()); + for (int i = 0; i < 5; i++) { + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } + } +} + +TEST_CASE("randomly_flipped_nodes preserves node IDs", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + graph.create_handle("AAA", 5); + graph.create_handle("CCC", 10); + graph.create_handle("GGG", 15); + graph.create_edge(graph.get_handle(5), graph.get_handle(10)); + graph.create_edge(graph.get_handle(10), graph.get_handle(15)); + + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + + 
REQUIRE(flipped.has_node(5)); + REQUIRE(flipped.has_node(10)); + REQUIRE(flipped.has_node(15)); +} + +TEST_CASE("randomly_flipped_nodes actually flips node sequences", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("AAAC", 1); // RC = GTTT + + default_random_engine gen(test_seed_source()); + // Guarantee a flip + auto flipped = randomly_flipped_nodes(graph, 1.0, gen); + + // The forward sequence should be the RC of the original + REQUIRE(flipped.get_sequence(flipped.get_handle(1)) == "GTTT"); +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/readfilter.cpp b/src/unittest/readfilter.cpp index cc1562f3f3..6d84fa0a38 100644 --- a/src/unittest/readfilter.cpp +++ b/src/unittest/readfilter.cpp @@ -5,6 +5,9 @@ #include "catch.hpp" #include "readfilter.hpp" #include "xg.hpp" +#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include namespace vg { namespace unittest { @@ -44,13 +47,13 @@ TEST_CASE("reads with ambiguous ends can be trimmed", "[filter]") { )"; - // Load it into Protobuf - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - + // Load the graph + bdsg::HashGraph chunk; + vg::io::json2graph(graph_json, &chunk); + // Pass it over to XG xg::XG index; - index.from_path_handle_graph(VG(chunk)); + index.from_path_handle_graph(chunk); // Make a ReadFilter; ReadFilter filter; diff --git a/src/unittest/sampler.cpp b/src/unittest/sampler.cpp index d8bb95b650..cda0147f57 100644 --- a/src/unittest/sampler.cpp +++ b/src/unittest/sampler.cpp @@ -6,11 +6,10 @@ #include #include -#include "vg/io/json2pb.h" -#include +#include "../io/json2graph.hpp" +#include #include "../sampler.hpp" #include "../xg.hpp" -#include "../vg.hpp" #include "catch.hpp" namespace vg { @@ -28,13 +27,9 @@ TEST_CASE( "Sampler can sample from a 1-node graph", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG 
- VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); @@ -118,13 +113,9 @@ TEST_CASE( "position_at works", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); @@ -195,13 +186,9 @@ TEST_CASE( "Sampler can sample from a loop-containing path", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); @@ -259,13 +246,9 @@ TEST_CASE( "Sampler can across reversing edges", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); diff --git a/src/unittest/snarl_decomposition_fuzzer.cpp b/src/unittest/snarl_decomposition_fuzzer.cpp new file mode 100644 index 0000000000..38742be20c --- /dev/null +++ b/src/unittest/snarl_decomposition_fuzzer.cpp @@ -0,0 +1,339 @@ +#include "catch.hpp" +#include "../handle.hpp" +#include + +#include "support/snarl_decomposition_fuzzer.hpp" + +#include +#include + +namespace vg { +namespace unittest { + +using ET = DecompositionEventType; +using Event = DecompositionEvent; + +TEST_CASE("ReplaySnarlFinder replays events faithfully", "[snarl_decomposition_fuzzer]") { + // Build a 
small graph to get real handles + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + graph.create_handle("AA", 5); + + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + ReplaySnarlFinder finder(&graph, events); + std::vector captured = capture_events(finder, graph); + + REQUIRE(captured == events); +} + +TEST_CASE("SnarlDecompositionFuzzer passes through when nothing is flipped", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + graph.create_handle("AA", 5); + + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + // No chains to flip + SnarlDecompositionFuzzer fuzzer(&graph, &replay, {}); + + std::vector captured = capture_events(fuzzer, graph); + + REQUIRE(captured == events); +} + +TEST_CASE("SnarlDecompositionFuzzer flips an outer chain", "[snarl_decomposition_fuzzer]") { + // Graph: + // Chain: 1fwd -> snarl(1fwd, 4fwd) -> snarl(4fwd, 5fwd) -> 5fwd + // Inside first snarl: chain 2rev->3rev + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + graph.create_handle("AA", 5); + + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 4, false}, + 
{ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flip outer chain only") { + // Flip the outer chain (1fwd -> 5fwd) + std::unordered_set flips {1, 5}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Expected after flipping the outer chain: + // Flipping a chain reverses everything inside it, including children. + // The nested chain 2rev->3rev gets reversed to 3fwd->2fwd as + // part of the parent flip. + std::vector expected = { + {ET::BEGIN_CHAIN, 5, true}, + {ET::BEGIN_SNARL, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 3, false}, + {ET::END_CHAIN, 2, false}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip outer and nested chain") { + // Flip outer chain (1fwd->5fwd) AND nested chain (2rev->3rev) + std::unordered_set flips {1, 5, 2, 3}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Expected: outer chain flipped (reversing everything, including + // the nested chain to 3fwd->2fwd), AND THEN the nested chain is + // flipped again back to its original orientation 2rev->3rev. 
+ std::vector expected = { + {ET::BEGIN_CHAIN, 5, true}, + {ET::BEGIN_SNARL, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip nested chain only") { + // Flip only the nested chain (2rev->3rev), outer stays + std::unordered_set flips {2, 3}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Outer chain not flipped, nested chain flipped + std::vector expected = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 3, false}, + {ET::END_CHAIN, 2, false}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + REQUIRE(captured == expected); + } +} + +TEST_CASE("SnarlDecompositionFuzzer handles empty chain", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + graph.create_handle("ACGT", 1); + + // An empty chain: begin and end with same handle, no snarls inside + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::END_CHAIN, 1, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flipping an empty chain") { + std::unordered_set flips {1}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + std::vector expected = { + {ET::BEGIN_CHAIN, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } +} + +TEST_CASE("SnarlDecompositionFuzzer handles multiple top-level chains", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + + // Two top-level chains in the root snarl + std::vector events = { + // Chain 1: 1fwd -> snarl -> 2fwd + {ET::BEGIN_CHAIN, 1, 
false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::END_SNARL, 2, false}, + {ET::END_CHAIN, 2, false}, + // Chain 2: 3fwd -> snarl -> 4fwd + {ET::BEGIN_CHAIN, 3, false}, + {ET::BEGIN_SNARL, 3, false}, + {ET::END_SNARL, 4, false}, + {ET::END_CHAIN, 4, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flip only first chain") { + std::unordered_set flips {1, 2}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + std::vector expected = { + {ET::BEGIN_CHAIN, 2, true}, + {ET::BEGIN_SNARL, 2, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + {ET::BEGIN_CHAIN, 3, false}, + {ET::BEGIN_SNARL, 3, false}, + {ET::END_SNARL, 4, false}, + {ET::END_CHAIN, 4, false}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip both chains") { + std::unordered_set flips {1, 2, 3, 4}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + std::vector expected = { + {ET::BEGIN_CHAIN, 2, true}, + {ET::BEGIN_SNARL, 2, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + {ET::BEGIN_CHAIN, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::END_SNARL, 3, true}, + {ET::END_CHAIN, 3, true}, + }; + + REQUIRE(captured == expected); + } +} + +TEST_CASE("SnarlDecompositionFuzzer handles deeply nested chains", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + for (nid_t i = 1; i <= 8; i++) { + graph.create_handle("A", i); + } + + // Outer chain: 1->6 + // Snarl(1,4) + // Inner chain: 2->3 + // Snarl(2,3) [leaf snarl, no children] + // Snarl(4,6) + // Inner chain: 5->5 [empty/trivial] + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, false}, + {ET::BEGIN_SNARL, 2, false}, + {ET::END_SNARL, 3, false}, + {ET::END_CHAIN, 3, false}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::BEGIN_CHAIN, 5, false}, + {ET::END_CHAIN, 5, false}, + 
{ET::END_SNARL, 6, false}, + {ET::END_CHAIN, 6, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flip outer chain only") { + std::unordered_set flips {1, 6}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Inner chain and its snarls should flip too. + std::vector expected = { + {ET::BEGIN_CHAIN, 6, true}, + {ET::BEGIN_SNARL, 6, true}, + {ET::BEGIN_CHAIN, 5, true}, + {ET::END_CHAIN, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 3, true}, + {ET::BEGIN_SNARL, 3, true}, + {ET::END_SNARL, 2, true}, + {ET::END_CHAIN, 2, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip outer and inner chain") { + std::unordered_set flips {1, 6, 2, 3}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Outer chain should flip but inner chain should flip back + std::vector expected = { + {ET::BEGIN_CHAIN, 6, true}, + {ET::BEGIN_SNARL, 6, true}, + {ET::BEGIN_CHAIN, 5, true}, + {ET::END_CHAIN, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 2, false}, + {ET::BEGIN_SNARL, 2, false}, + {ET::END_SNARL, 3, false}, + {ET::END_CHAIN, 3, false}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 36a1b9b74e..d257e719eb 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -9,23 +9,34 @@ #include #include #include -#include "vg/io/json2pb.h" -#include +#include "../io/json2graph.hpp" +#include #include "catch.hpp" #include "support/random_graph.hpp" #include "support/randomness.hpp" +#include "support/randomly_flipped_nodes.hpp" +#include 
"support/snarl_decomposition_fuzzer.hpp" #include "../snarl_distance_index.hpp" #include "../integrated_snarl_finder.hpp" #include "../genotypekit.hpp" #include "../traversal_finder.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include "xg.hpp" +#include +#include //#define debug namespace vg { namespace unittest { + + // TODO: Having *any* operator<< overloads in vg::unittest seems to hide + // the ones that are just in vg, somehow. + using vg::operator<<; + + static pair, unordered_set > pb_contents( VG& graph, const pair, unordered_set >& contents) { pair, unordered_set > ret; @@ -192,7 +203,82 @@ namespace vg { REQUIRE(distance_index.minimum_distance(2, true, 0, 2, true, 1) == 1); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Can distance index nested chain without loop", "[snarl_distance]" ) { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("G"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("T"); + handle_t h4 = graph.create_handle("T"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("C"); + handle_t h7 = graph.create_handle("A"); + + // Wire it up as a stick + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + graph.create_edge(h5, h6); + graph.create_edge(h6, h7); + + // Allow skipping a run of nodes to make a snarl with a child chain + graph.create_edge(h2, h5); + + IntegratedSnarlFinder snarl_finder(graph); + + SECTION("Snarl classifications are correct") { + SECTION("Distance index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, 
&snarl_finder, 0); + REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + } + } + TEST_CASE( "Can distance index nested chain with a loop hiding in the middle", "[snarl_distance]" ) { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("G"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("T"); + handle_t h4 = graph.create_handle("T"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("C"); + handle_t h7 = graph.create_handle("A"); + + // Wire it up as a stick + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + graph.create_edge(h5, h6); + graph.create_edge(h6, h7); + + // Allow skipping a run of nodes to make a snarl with a child chain that has a few nodes in it + graph.create_edge(h1, h6); + + // Allow turning around with an edge hiding somewhere in the middle of the chain + graph.create_edge(h3, graph.flip(h3)); + + IntegratedSnarlFinder snarl_finder(graph); + + SECTION("Snarl classifications are correct") { + SECTION("Distance index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + } + } + TEST_CASE( "Can distance index nested chain with a loop", "[snarl_distance]" ) { VG graph; @@ -230,7 +316,8 @@ namespace vg { Edge* e17 = graph.create_edge(n11, n12); Edge* e18 = graph.create_edge(n12, n13); - 
graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); + //get the snarls IntegratedSnarlFinder snarl_finder(graph); SECTION("Traversal of chain") { @@ -248,16 +335,13 @@ namespace vg { fill_in_distance_index(&distance_index, &graph, &snarl_finder); REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n3->id()))))); REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n8->id()))))); - REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), true)); - REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), false)); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))))); } SECTION("Distanceless index") { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); - REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n3->id()))), true, &graph)); - REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n8->id()))), true, &graph)); - REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), true, &graph)); - // TODO: This isn't true because it would be too much work to recursively check all children using only the graph - //REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), false, &graph)); + 
REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n3->id()))))); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n8->id()))))); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))))); } } SECTION("Minimum distances are correct") { @@ -3577,12 +3661,9 @@ namespace vg { // } // )"; // - // VG graph; - // // // Load up the graph - // Graph g; - // json2pb(g, graph_json.c_str(), graph_json.size()); - // graph.extend(g); + // VG graph; + // vg::io::json2graph(graph_json, &graph); // // // Define the one snarl // Snarl snarl1; @@ -3709,12 +3790,9 @@ namespace vg { // string snarl2_json = R"({"type": 1, "end": {"node_id": 187209, "backward": true}, "start": {"node_id": 178895, "backward": true}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; // string snarl3_json = R"({"type": 1, "end": {"node_id": 178896}, "start": {"node_id": 178895}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; // - // VG graph; - // // // Load up the graph - // Graph g; - // json2pb(g, graph_json.c_str(), graph_json.size()); - // graph.extend(g); + // VG graph; + // vg::io::json2graph(graph_json, &graph); // // // Load the snarls // Snarl snarl1, snarl2, snarl3; @@ -3885,9 +3963,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); // We need to see the path. 
REQUIRE(graph.paths.size() == 1); @@ -4145,9 +4221,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4258,9 +4332,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4407,9 +4479,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4536,9 +4606,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4645,9 +4713,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4749,9 +4815,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4919,9 +4983,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder 
snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -5042,9 +5104,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -6624,6 +6684,25 @@ namespace vg { } } + TEST_CASE( "Tiny oversized snarl", "[snarl_distance]" ) { + VG graph; + handle_t n1 = graph.create_handle("GCA"); + handle_t n2 = graph.create_handle("T"); + handle_t n3 = graph.create_handle("G"); + handle_t n4 = graph.create_handle("CTGA"); + + graph.create_edge(n1, n2); + graph.create_edge(n1, n3); + graph.create_edge(n2, n3); + graph.create_edge(n2, n4); + graph.create_edge(n3, n4); + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 1); + + REQUIRE(distance_index.minimum_distance(2, false, 0, 3, false, 0, false, &graph) == 1); + } + TEST_CASE( "Oversized snarl","[snarl_distance]" ) { VG graph; @@ -7372,6 +7451,9 @@ namespace vg { } + // TODO: This test case doesn't do anything (runs 0 iterations). + // When I tell it to actually run iterations, it fails. + // Has it ever worked? 
TEST_CASE("random test subgraph", "[snarl_distance][snarl_distance_subgraph]") { int64_t min = 20; int64_t max = 50; @@ -7480,7 +7562,7 @@ namespace vg { << distance_index.minimum_distance(nodeID1, false, 0, node_id, true, 0) << " (" << dist_start_fd << " " << dist_end_fd << " " << dist_start_bk << " " << dist_end_bk << ") " << " is in the subgraph but shouldn't be " << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); } REQUIRE((start_forward || end_forward || in_forward || start_backward || end_backward || in_backward)); } else { @@ -7491,7 +7573,7 @@ namespace vg { << distance_index.minimum_distance(nodeID1, false, 0,node_id, true, 0) << " (" << dist_start_fd << " " << dist_end_fd << " " << dist_start_bk << " " << dist_end_bk << ") " << " is not in the subgraph but should be " << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(!(start_forward || end_forward || in_forward || start_backward || end_backward || in_backward)); } } @@ -7556,31 +7638,49 @@ namespace vg { */ TEST_CASE( "Distance index can traverse all the snarls in random graphs", - "[snarl_distance_random]" ) { + "[snarl_distance][snarl_distance_random]" ) { // Each actual graph takes a fairly long time to do so we randomize sizes... 
- default_random_engine generator(test_seed_source()); + std::default_random_engine generator(test_seed_source()); for (size_t repeat = 0; repeat < 1000; repeat++) { - uniform_int_distribution bases_dist(100, 1000); + std::uniform_int_distribution bases_dist(100, 1000); size_t bases = bases_dist(generator); - uniform_int_distribution variant_bases_dist(1, bases/20); + std::uniform_int_distribution variant_bases_dist(1, bases/20); size_t variant_bases = variant_bases_dist(generator); - uniform_int_distribution variant_count_dist(1, bases/30); + std::uniform_int_distribution variant_count_dist(1, bases/30); size_t variant_count = variant_count_dist(generator); + + std::uniform_real_distribution flip_dist(0.0, 1.0); + double node_flip_fraction = flip_dist(generator); + double chain_flip_fraction = flip_dist(generator); - uniform_int_distribution snarl_size_limit_dist(500, 1000); + std::uniform_int_distribution snarl_size_limit_dist(2, 1000); size_t size_limit = snarl_size_limit_dist(generator); - + #ifdef debug - cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events" << endl; + cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events with " << node_flip_fraction << " nodes flipped and " << chain_flip_fraction << " of chains flipped, with size limit " << size_limit << endl; #endif - - VG graph; - random_graph(bases, variant_bases, variant_count, &graph); - IntegratedSnarlFinder finder(graph); + + // Generate a base graph + VG base_graph; + random_graph(bases, variant_bases, variant_count, &base_graph); + + // Flip some fraction of the nodes to their local reverse orientation + bdsg::HashGraph graph = randomly_flipped_nodes(base_graph, node_flip_fraction, generator); + + // Find snarls + IntegratedSnarlFinder base_finder(graph); + + // Flip some fraction of the chains to their opposite orientation. 
+ // Note that we can't flip the snarls because the snarl decomposition + // requires snarls to be articulated as forward along their + // chains. + SnarlDecompositionFuzzer finder(&graph, &base_finder, chain_flip_fraction, generator); + + // Build the index SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &finder, size_limit); @@ -7640,7 +7740,7 @@ namespace vg { cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? "rev" : "fd") << offset2 << endl; cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; cerr << "serializing graph to test_graph.vg" << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(false); } if (max_distance < snarl_distance){ @@ -7648,11 +7748,10 @@ namespace vg { cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? "rev" : "fd") << offset2 << endl; cerr << "minimum: " << snarl_distance << " maximum: " << max_distance << endl; cerr << "serializing graph to test_graph.vg" << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(false); } REQUIRE((snarl_distance >= dijkstra_distance || snarl_distance == std::numeric_limits::max())); - graph.serialize_to_file("test_graph.vg"); if (!traceback.first.empty() && ! traceback.second.empty()) { size_t traceback_distance = 0; for (auto x : traceback.first){ @@ -7699,7 +7798,7 @@ namespace vg { cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? 
"rev" : "fd") << offset2 << endl; cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; cerr << "serializing graph to test_graph.vg" << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(false); } REQUIRE((snarl_distance >= dijkstra_distance || snarl_distance == std::numeric_limits::max())); @@ -7766,9 +7865,179 @@ namespace vg { */ } - - - } + + + } + + TEST_CASE( "Distance index hub labeling matches Dijkstra on random graphs", + "[snarl_distance][snarl_distance_random_hub_labels]" ) { + + // Force hub labeling on essentially every snarl by pinning size_limit + // to 1, and bias graphs toward snarlier topologies so the hub-label + // path actually has interior nodes to label. + + std::default_random_engine generator(test_seed_source()); + + for (size_t repeat = 0; repeat < 1000; repeat++) { + + std::uniform_int_distribution bases_dist(200, 1500); + size_t bases = bases_dist(generator); + std::uniform_int_distribution variant_bases_dist(1, bases/15); + size_t variant_bases = variant_bases_dist(generator); + std::uniform_int_distribution variant_count_dist(bases/15, bases/8); + size_t variant_count = variant_count_dist(generator); + + std::uniform_real_distribution flip_dist(0.0, 1.0); + double node_flip_fraction = flip_dist(generator); + double chain_flip_fraction = flip_dist(generator); + + // Anything > 1 trips the populate_hub_labeling branch. 
+ const size_t size_limit = 1; + +#ifdef debug + cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events with " << node_flip_fraction << " nodes flipped and " << chain_flip_fraction << " of chains flipped, with size limit " << size_limit << endl; +#endif + + // Generate a base graph + VG base_graph; + random_graph(bases, variant_bases, variant_count, &base_graph); + + // Flip some fraction of the nodes to their local reverse orientation + bdsg::HashGraph graph = randomly_flipped_nodes(base_graph, node_flip_fraction, generator); + + // Find snarls + IntegratedSnarlFinder base_finder(graph); + + // Flip some fraction of the chains to their opposite orientation. + SnarlDecompositionFuzzer finder(&graph, &base_finder, chain_flip_fraction, generator); + + // Build the index + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &finder, size_limit); + + //Make sure that the distance index found all the nodes + for (id_t id = graph.min_node_id() ; id <= graph.max_node_id() ; id++) { + if (graph.has_node(id)) { + handle_t handle = graph.get_handle(id); + REQUIRE(graph.get_length(handle) == + distance_index.node_length(distance_index.get_net(handle, &graph))); + } + } + + for (size_t repeat_positions = 0 ; repeat_positions < 500 ; repeat_positions++) { + //Pick random pairs of positions and find the distance between them + id_t node_id1 = 0; + id_t node_id2 = 0; + uniform_int_distribution random_node_ids(graph.min_node_id(),graph.max_node_id()); + default_random_engine generator(test_seed_source()); + while (node_id1 == 0) { + id_t new_id = random_node_ids(generator); + if (graph.has_node(new_id)) { + node_id1 = new_id; + } + } + while (node_id2 == 0) { + id_t new_id = random_node_ids(generator); + if (graph.has_node(new_id)) { + node_id2 = new_id; + } + } + + REQUIRE(graph.has_node(node_id1)); + REQUIRE(graph.has_node(node_id2)); + + + offset_t offset1 = 
uniform_int_distribution(0,graph.get_length(graph.get_handle(node_id1)) - 1)(generator); + offset_t offset2 = uniform_int_distribution(0,graph.get_length(graph.get_handle(node_id2)) - 1)(generator); + bool rev1 = uniform_int_distribution(0,1)(generator) == 0; + bool rev2 = uniform_int_distribution(0,1)(generator) == 0; + + + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + + //Find actual distance + size_t dijkstra_distance = std::numeric_limits::max(); + if (node_id1 == node_id2 && offset1 <= offset2 && rev1 == rev2) { + dijkstra_distance = offset2 - offset1; + + pair>,vector>> traceback; + size_t snarl_distance = distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph, &traceback); + size_t max_distance = distance_index.maximum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2); + if (snarl_distance != dijkstra_distance){ + cerr << "Failed random hub-label test" << endl; + cerr << "Snarl size limit: " << size_limit << endl; + cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? "rev" : "fd") << offset2 << endl; + cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; + cerr << "serializing graph to test_graph.vg" << endl; + vg::io::save_handle_graph(&graph, "test_graph.vg"); + REQUIRE(false); + } + if (max_distance < snarl_distance){ + cerr << "Failed random hub-label test" << endl; + cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? "rev" : "fd") << offset2 << endl; + cerr << "minimum: " << snarl_distance << " maximum: " << max_distance << endl; + cerr << "serializing graph to test_graph.vg" << endl; + vg::io::save_handle_graph(&graph, "test_graph.vg"); + REQUIRE(false); + } + REQUIRE((snarl_distance >= dijkstra_distance || snarl_distance == std::numeric_limits::max())); + if (!traceback.first.empty() && ! 
traceback.second.empty()) { + size_t traceback_distance = 0; + for (auto x : traceback.first){ + if (std::get<1>(x) != std::numeric_limits::max() && std::get<1>(x) != std::numeric_limits::min()) { + traceback_distance += std::abs(std::get<1>(x)); + } else if (std::get<2>(x) != std::numeric_limits::max() && std::get<2>(x) != std::numeric_limits::min()){ + traceback_distance += std::abs(std::get<2>(x)); + } + } + for (size_t i = 0 ; i < traceback.second.size()-1 ; i++) { + auto x = traceback.second[i]; + + if (std::get<1>(x) != std::numeric_limits::max() && std::get<1>(x) != std::numeric_limits::min()) { + traceback_distance += std::abs(std::get<1>(x)); + } else if (std::get<2>(x) != std::numeric_limits::max() && std::get<2>(x) != std::numeric_limits::min()){ + traceback_distance += std::abs(std::get<2>(x)); + } + } + REQUIRE(snarl_distance == traceback_distance); + } else { + REQUIRE(snarl_distance == std::numeric_limits::max()); + } + + } else if (node_id1 == node_id2 ) { + //TODO: The dijkstra algorithm won't visit the start node twice + } else { + bool first = true; + handlegraph::algorithms::dijkstra(&graph, handle1, [&](const handle_t& reached, size_t distance) { + if (reached == handle2 && ! first) { + dijkstra_distance = distance; + dijkstra_distance += graph.get_length(graph.get_handle(node_id1)) - offset1; + dijkstra_distance += offset2; + return false; + } + first = false; + return true; + } + , false); + + size_t snarl_distance = distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph); + if (snarl_distance != dijkstra_distance){ + cerr << "Failed random hub-label test" << endl; + cerr << "Snarl size limit: " << size_limit << endl; + cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? 
"rev" : "fd") << offset2 << endl; + cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; + cerr << "serializing graph to test_graph.vg" << endl; + vg::io::save_handle_graph(&graph, "test_graph.vg"); + REQUIRE(false); + } + REQUIRE((snarl_distance >= dijkstra_distance || snarl_distance == std::numeric_limits::max())); + } + } + } + } + //TEST_CASE("Failed unit test", "[failed]") { // //Load failed random graph // ifstream vg_stream("test_graph.hg"); @@ -7789,8 +8058,372 @@ namespace vg { // return true; // }); //} + + TEST_CASE( "Distance index can query a troublesome oversized snarl", + "[snarl_distance]" ) { + + std::string graph_json = R"({ + "node": [ + {"id": "19","sequence": "A"}, + {"id": "20","sequence": "A"}, + {"id": "21","sequence": "A"}, + {"id": "22","sequence": "A"}, + {"id": "23","sequence": "A"} + ], "edge": [ + {"from": "19","to": "20"}, + {"from": "19","to": "22"}, + {"from": "20","to": "21"}, + {"from": "20","to": "23"}, + {"from": "21","to": "22"}, + {"from": "22","to": "23"} + ] + })"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + id_t node_id1 = 19; bool rev1 = false ; size_t offset1 = 0; + id_t node_id2 = 23; bool rev2 = false ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t dijkstra_distance = std::numeric_limits::max(); + handlegraph::algorithms::dijkstra(&graph, handle1, [&](const handle_t& reached, size_t distance) { + if (reached == handle2) { + dijkstra_distance = distance; + dijkstra_distance += graph.get_length(graph.get_handle(node_id1)) - offset1; + dijkstra_distance += offset2; + return false; + } + return true; + } + , false); + + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, 
rev2, offset2, false, &graph) == dijkstra_distance); + } + + TEST_CASE( "Distance index can query out of a SNP with a reversing allele as an oversided snarl", + "[snarl_distance]" ) { + + // This is a snarl from 1 to 2, where 4 and 5 are a SNP, and 3 + // lets you double back to the start + std::string graph_json = R"({ + "node": [ + {"id": "1","sequence": "AAAAA"}, + {"id": "2","sequence": "AAAAA"}, + {"id": "3","sequence": "A"}, + {"id": "4","sequence": "A"}, + {"id": "5","sequence": "A"} + ], "edge": [ + {"from": "1","to": "3"}, + {"from": "1","to": "4"}, + {"from": "1","to": "5"}, + {"from": "3","to": "1", "to_end": true}, + {"from": "4","to": "2"}, + {"from": "5","to": "2"} + ] + })"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + // We want to be able to get out of the snarl from node 4, which we definitely can. 
+ id_t node_id1 = 4; bool rev1 = false ; size_t offset1 = 1; + id_t node_id2 = 2; bool rev2 = false ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t true_distance = 0; + + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == true_distance); + + // And out of the snarl to the left from 3 reverse to 1 reverse should also be 0 + node_id1 = 3; rev1 = true; offset1 = 1; + node_id2 = 1; rev2 = true; offset2 = 0; + true_distance = 0; + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == true_distance); + + } + + TEST_CASE( "Distance index can query within a fiddly snarl", + "[snarl_distance]" ) { + + std::string graph_json = R"({"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "3", "to_end": true}, {"from": "1", "to": "4"}, {"from": "1", "to": "5"}, {"from": "4", "to": "5", "to_end": true}, {"from": "2", "from_start": true, "to": "4", "to_end": true}], "node": [{"id": "5", "sequence": "A"}, {"id": "1", "sequence": "AAAAA"}, {"id": "4", "sequence": "A"}, {"id": "2", "sequence": "AAAAA"}, {"id": "3", "sequence": "A"}]})"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + id_t node_id1 = 4; bool rev1 = false ; size_t offset1 = 1; + id_t node_id2 = 5; bool rev2 = true ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t true_distance = 0; + + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == true_distance); + } + + TEST_CASE( "Distance index can query into a child snarl in reverse", + 
"[snarl_distance]" ) { + + std::string graph_json = R"({"node":[{"id":"79","sequence":"A"},{"id":"16","sequence":"A"},{"id":"60","sequence":"A"},{"id":"37","sequence":"A"},{"id":"40","sequence":"A"},{"id":"53","sequence":"A"},{"id":"59","sequence":"A"},{"id":"63","sequence":"A"},{"id":"18","sequence":"A"},{"id":"38","sequence":"A"},{"id":"62","sequence":"A"}],"edge":[{"from":"16","to":"53"},{"from":"16","from_start":true,"to":"79","to_end":true},{"from":"60","to":"62"},{"from":"60","from_start":true,"to":"79","to_end":true},{"from":"37","from_start":true,"to":"63","to_end":true},{"from":"37","from_start":true,"to":"40"},{"from":"53","to":"60"},{"from":"59","to":"63"},{"from":"59","from_start":true,"to":"60","to_end":true},{"from":"18","to":"53"},{"from":"18","to":"38"},{"from":"18","from_start":true,"to":"79","to_end":true},{"from":"18","from_start":true,"to":"37","to_end":true},{"from":"38","to":"63","to_end":true},{"from":"38","to":"40"},{"from":"62","to":"63"}]})"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + id_t node_id1 = 16; bool rev1 = false ; size_t offset1 = 1; + id_t node_id2 = 62; bool rev2 = true ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t dijkstra_distance = std::numeric_limits::max(); + handlegraph::algorithms::dijkstra(&graph, handle1, [&](const handle_t& reached, size_t distance) { + if (reached == handle2) { + dijkstra_distance = distance; + dijkstra_distance += graph.get_length(graph.get_handle(node_id1)) - offset1; + dijkstra_distance += offset2; + return false; + } + return true; + } + , false); + + size_t index_distance = distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph); + + REQUIRE(index_distance 
== dijkstra_distance); + } + + + TEST_CASE( "Distance index can query all possible 3-node-with-legs snarls", + "[snarl_distance]" ) { + + // We're going to generate all possible snarls you can get by + // starting with the boundary nodes, taking up to 3 nodes and + // connecting them, one nodeside at a time, onto the existing + // nodes. + // + // Combinatorics says this is a manageable number; each nodeside + // picks from one of the previous nodesides and attaches to it. + + /// Call the callback with each possible combination of choices of + /// previous items. + /// + /// start_size is the number of items present before we start + /// making choices; the first entry can choose from start_size + /// items. + /// + /// end_size is the total number of items to think about, including + /// those in start_size. + /// + /// Calls the callback with all possible vectors of length + /// (end_size - start_size) matching these constraints. + auto for_all_choice_combinations = [](size_t start_size, size_t end_size, const std::function&)>& callback) { + + std::vector choices(end_size - start_size, 0); + while (true) { +#ifdef debug + std::cerr << "Consider combination:"; + for (auto& item : choices) { + std::cerr << " " << item; + } + std::cerr << std::endl; +#endif + callback(choices); + choices.back()++; + for (size_t i = end_size - 1; i >= start_size; i--) { + if (choices.at(i - start_size) >= i) { + // We've reached the point where we want to pick from a + // choice not available at this point. + // At i=2 we can choose between 0 and 1, so we carry at i. + if (i == start_size) { + // We've counted all possibilities + return; + } else { + // Carry and reset to 0. + choices.at(i - start_size - 1)++; + choices.at(i - start_size) = 0; + } + } else { + // No more carrying to do + break; + } + } + } + }; + + // How big should a snarl be allowed to be before being oversized? + size_t size_limit = 2; + // How many content nodes should be inside the snarl? 
+ const size_t MAX_NODES = 3; + // How many node sides do we need to worry about, including the boundary sentinels? + size_t max_node_sides = MAX_NODES * 2 + 2; + for_all_choice_combinations(2, max_node_sides, [&](const std::vector& choices) { + // Build the choices into a graph. + + bdsg::HashGraph graph; + // Make the bounding nodes heavy so they are likely to root the snarl + handle_t start_node = graph.create_handle("AAAAA"); + handle_t end_node = graph.create_handle("AAAAA"); + + std::vector connect_to; + connect_to.reserve(max_node_sides); + // Choice 0 is start node, arriving reading out + connect_to.push_back(graph.flip(start_node)); + // Choice 1 is end node reading out + connect_to.push_back(end_node); + + for (size_t i = 0; i < choices.size(); i += 2) { + // Make a node + handle_t new_node = graph.create_handle("A"); + // Make sure to remember it so it can choose itself + connect_to.push_back(new_node); + connect_to.push_back(graph.flip(new_node)); + // Connect its left and right to each pair of choices. + graph.create_edge(graph.flip(new_node), connect_to.at(choices.at(i))); + graph.create_edge(new_node, connect_to.at(choices.at(i + 1))); + } + + // TODO: It might be more efficient to un-build the things that + // change between graphs instead of rebuilding from scratch for + // every case. + + // Skip graphs where the choices mean the graph isn't actually + // connected, because then it can't be recognized as a snarl + // probably. + std::vector> components = handlegraph::algorithms::weakly_connected_components(&graph); + if (components.size() > 1) { + return; + } + + // Now index the graph for query + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &finder, size_limit); + + // Compute the truth all-to-all distances, between outgoing + // side of first handle and incoming side of second. + // Both handles are oriented along the connecting path. 
+ // TODO: We compute/store both triangles of the matrix; can we avoid one somehow? + std::unordered_map> dijkstra_distances; + graph.for_each_handle([&](const handle_t& base) { + for (const handle_t& here : {base, graph.flip(base)}) { + if (here == graph.flip(start_node) || here == end_node) { + // Skip traversals looking out of the snarl + return; + } + dijkstra_distances.emplace(here, handlegraph::algorithms::find_shortest_paths(&graph, here)); + } + }); + + // The Dijkstra traversal always sees a handle to itself at + // distance 0. We need to get the real back-to-self distance, + // if any, and fill that in. + graph.for_each_handle([&](const handle_t& base) { + for (const handle_t& here : {base, graph.flip(base)}) { + if (here == graph.flip(start_node) || here == end_node) { + // Skip traversals looking out of the snarl + return; + } + + // The place we need to arrive at is ourselves, since + // both start and end are oriented along the connecting + // path here. + + size_t loop_distance = std::numeric_limits::max(); + // See if we can get back here from any of the places we can get + graph.follow_edges(here, false, [&](const handle_t next) { + if (next == here) { + // We found a real self loop + loop_distance = 0; + return false; + } + auto found_index = dijkstra_distances.find(next); + if (found_index == dijkstra_distances.end()) { + // This destination can't get anywhere. + // This should be impossible since the Dijkstra always will point a node at itself. + return true; + } + auto found_distance = found_index->second.find(here); + if (found_distance == found_index->second.end()) { + // This destination can't get back to us + return true; + } + // If we find a way back, min in its distance. + loop_distance = std::min(loop_distance, graph.get_length(next) + found_distance->second); + return true; + }); + +#ifdef debug + std::cerr << "Real self loop distance for " << graph.get_id(here) << (graph.get_is_reverse(here) ? 
"rev" : "fd") << " -> " << graph.get_id(here) << (graph.get_is_reverse(here) ? "rev" : "fd") << " is " << loop_distance << std::endl; +#endif + + if (loop_distance == std::numeric_limits::max()) { + // There's really no way back from this node to itself in the same orientation. Delete the entry the Dijkstra search adds. + dijkstra_distances.at(here).erase(here); + } else { + // There is a way back; store the value. + dijkstra_distances.at(here)[here] = loop_distance; + } + }; + }); + +#ifdef debug + for (auto& [start_handle, distances] : dijkstra_distances) { + for (auto& [end_handle, dijkstra_distance] : distances) { + cerr << "Dijkstra sees: " << graph.get_id(start_handle) << (graph.get_is_reverse(start_handle) ? "rev" : "fd") << graph.get_length(start_handle) << " -> " << graph.get_id(end_handle) << (graph.get_is_reverse(end_handle) ? "rev" : "fd") << 0 << " = " << dijkstra_distance << endl; + } + } +#endif + + // Now query all of the distances against the index + for (auto& [start_handle, distances] : dijkstra_distances) { + for (auto& [end_handle, dijkstra_distance] : distances) { + // Ask for distance between outgoing side of first handle and incoming side of second. + +#ifdef debug + cerr << "Measure: " << graph.get_id(start_handle) << (graph.get_is_reverse(start_handle) ? "rev" : "fd") << graph.get_length(start_handle) << " -> " << graph.get_id(end_handle) << (graph.get_is_reverse(end_handle) ? "rev" : "fd") << 0 << endl; +#endif + + size_t snarl_distance = distance_index.minimum_distance(graph.get_id(start_handle), graph.get_is_reverse(start_handle), graph.get_length(start_handle), graph.get_id(end_handle), graph.get_is_reverse(end_handle), 0, false, &graph); + + if (snarl_distance != dijkstra_distance) { + cerr << "Failed exhaustive test" << endl; + cerr << "Snarl size limit: " << size_limit << endl; + cerr << graph.get_id(start_handle) << (graph.get_is_reverse(start_handle) ? 
"rev" : "fd") << graph.get_length(start_handle) << " -> " << graph.get_id(end_handle) << (graph.get_is_reverse(end_handle) ? "rev" : "fd") << 0 << endl; + cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; + cerr << "serializing graph to test_graph.vg" << endl; + vg::io::save_handle_graph(&graph, "test_graph.vg"); + } + REQUIRE(snarl_distance == dijkstra_distance); + } + } + }); + + } + + TEST_CASE( "random minimum distance paths", - "[snarl_distance_random_paths]" ) { + "[snarl_distance][snarl_distance_random_paths]" ) { // Each actual graph takes a fairly long time to do so we randomize sizes... @@ -7809,7 +8442,7 @@ namespace vg { size_t size_limit = snarl_size_limit_dist(generator); #ifdef debug - cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events" << endl; + cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events with size limit " << size_limit << endl; #endif VG graph; @@ -7818,7 +8451,7 @@ namespace vg { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &finder, size_limit); - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); for (size_t repeat_positions = 0 ; repeat_positions < 500 ; repeat_positions++) { //Pick random pairs of positions and find the distance between them id_t node_id1 = 0; diff --git a/src/unittest/snarls.cpp b/src/unittest/snarls.cpp index c2f5030326..c7edf85b05 100644 --- a/src/unittest/snarls.cpp +++ b/src/unittest/snarls.cpp @@ -9,6 +9,8 @@ #include #include #include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include #include "catch.hpp" #include "support/random_graph.hpp" @@ -1697,14 +1699,12 @@ namespace vg { ] } )"; - + VG graph; - + // Load up the graph - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - 
graph.extend(g); - + vg::io::json2graph(graph_json, &graph); + // Define the one snarl Snarl snarl1; snarl1.mutable_start()->set_node_id(6462830); @@ -1830,14 +1830,12 @@ namespace vg { string snarl1_json = R"({"type": 1, "end": {"node_id": 187208}, "start": {"node_id": 178894}})"; string snarl2_json = R"({"type": 1, "end": {"node_id": 187209, "backward": true}, "start": {"node_id": 178895, "backward": true}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; string snarl3_json = R"({"type": 1, "end": {"node_id": 178896}, "start": {"node_id": 178895}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; - + VG graph; - + // Load up the graph - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - + vg::io::json2graph(graph_json, &graph); + // Load the snarls Snarl snarl1, snarl2, snarl3; json2pb(snarl1, snarl1_json.c_str(), snarl1_json.size()); @@ -1917,13 +1915,11 @@ namespace vg { } )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + // We need to see the path. 
REQUIRE(graph.paths.size() == 1); @@ -2045,10 +2041,8 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2061,7 +2055,7 @@ namespace vg { cerr << endl; }); #endif - + SECTION("Root node has 1 child bubble") { REQUIRE(snarl_manager.top_level_snarls().size() == 1); @@ -2127,15 +2121,13 @@ namespace vg { ]} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2246,15 +2238,13 @@ namespace vg { ]} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2354,18 +2344,16 @@ namespace vg { {"from": 2, "to": 4}, {"from": 2, "to": 3}, {"from": 2, "to": 2}, - {"from": 3, "to": 3} + {"from": 3, "to": 3} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2415,15 +2403,13 @@ namespace vg { ]} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2490,18 +2476,16 @@ namespace vg { "edge": [ {"from": 1, "to": 2}, {"from": 2, "to": 1} - + ] } - + )"; - + // Make an actual graph VG 
graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2555,15 +2539,13 @@ namespace vg { {"from": 3, "to": 6} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2767,15 +2749,13 @@ namespace vg { {"from": 9, "to": 10} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug snarl_manager.for_each_snarl_preorder([&](const Snarl* snarl) { @@ -3919,14 +3899,12 @@ namespace vg { {"position": {"node_id": 7, "is_reverse" : "true"}, "rank" : 5 } ]} ] - } + } )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); assert(graph.is_valid()); SECTION( "PathTraversalFinder can find simple forward traversals") { diff --git a/src/unittest/source_sink_overlay.cpp b/src/unittest/source_sink_overlay.cpp index 4c0ecbc20f..bf2aa3bc13 100644 --- a/src/unittest/source_sink_overlay.cpp +++ b/src/unittest/source_sink_overlay.cpp @@ -10,7 +10,8 @@ #include "../source_sink_overlay.hpp" #include "../kmer.hpp" #include "../vg.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include #include @@ -132,11 +133,9 @@ TEST_CASE("SourceSinkOverlay adds a source and a sink to a 1-node graph", "[over TEST_CASE("SourceSinkOverlay agrees with VG::add_start_end_markers in a tiny graph", "[overlay]") { const string graph_json = 
R"({"node":[{"sequence":"CAAATAAG","id":"1"},{"sequence":"A","id":"2"},{"sequence":"G","id":"3"},{"sequence":"T","id":"4"},{"sequence":"C","id":"5"},{"sequence":"TTG","id":"6"},{"sequence":"A","id":"7"},{"sequence":"G","id":"8"},{"sequence":"AAATTTTCTGGAGTTCTAT","id":"9"},{"sequence":"A","id":"10"},{"sequence":"T","id":"11"},{"sequence":"ATAT","id":"12"},{"sequence":"A","id":"13"},{"sequence":"T","id":"14"},{"sequence":"CCAACTCTCTG","id":"15"}],"edge":[{"from":"1","to":"2"},{"from":"1","to":"3"},{"from":"2","to":"4"},{"from":"2","to":"5"},{"from":"3","to":"4"},{"from":"3","to":"5"},{"from":"4","to":"6"},{"from":"5","to":"6"},{"from":"6","to":"7"},{"from":"6","to":"8"},{"from":"7","to":"9"},{"from":"8","to":"9"},{"from":"9","to":"10"},{"from":"9","to":"11"},{"from":"10","to":"12"},{"from":"11","to":"12"},{"from":"12","to":"13"},{"from":"12","to":"14"},{"from":"13","to":"15"},{"from":"14","to":"15"}],"path":[{"name":"x","mapping":[{"position":{"node_id":"1"},"edit":[{"from_length":8,"to_length":8}],"rank":"1"},{"position":{"node_id":"3"},"edit":[{"from_length":1,"to_length":1}],"rank":"2"},{"position":{"node_id":"5"},"edit":[{"from_length":1,"to_length":1}],"rank":"3"},{"position":{"node_id":"6"},"edit":[{"from_length":3,"to_length":3}],"rank":"4"},{"position":{"node_id":"8"},"edit":[{"from_length":1,"to_length":1}],"rank":"5"},{"position":{"node_id":"9"},"edit":[{"from_length":19,"to_length":19}],"rank":"6"},{"position":{"node_id":"11"},"edit":[{"from_length":1,"to_length":1}],"rank":"7"},{"position":{"node_id":"12"},"edit":[{"from_length":4,"to_length":4}],"rank":"8"},{"position":{"node_id":"14"},"edit":[{"from_length":1,"to_length":1}],"rank":"9"},{"position":{"node_id":"15"},"edit":[{"from_length":11,"to_length":11}],"rank":"10"}]}]})"; - - Graph graph; - json2pb(graph, graph_json); - - VG produced(graph); + + VG produced; + vg::io::json2graph(graph_json, &produced); id_t highest_id = produced.max_node_id(); id_t start_id = highest_id + 1; diff --git 
a/src/unittest/support/random_graph.hpp b/src/unittest/support/random_graph.hpp index 7597beeab9..e3e812d265 100644 --- a/src/unittest/support/random_graph.hpp +++ b/src/unittest/support/random_graph.hpp @@ -1,11 +1,16 @@ +#ifndef VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED +#define VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED +/** \file random_graph.hpp + * Utilities for randomizing graphs for test cases. + */ + + #include "handle.hpp" #include -#ifndef VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED -#define VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED -namespace vg{ -namespace unittest{ +namespace vg { +namespace unittest { /// Create a random graph by adding variation to a sequence of length seq_size /// variant_len is the mean length of a larger variation and variant_count diff --git a/src/unittest/support/randomly_flipped_nodes.hpp b/src/unittest/support/randomly_flipped_nodes.hpp new file mode 100644 index 0000000000..40b00bda26 --- /dev/null +++ b/src/unittest/support/randomly_flipped_nodes.hpp @@ -0,0 +1,83 @@ +#ifndef VG_UNITTEST_RANDOMLY_FLIPPED_NODES_HPP_INCLUDED +#define VG_UNITTEST_RANDOMLY_FLIPPED_NODES_HPP_INCLUDED + +/** + * \file randomly_flipped_nodes.hpp + * Utility for creating a copy of a HandleGraph with a random subset of nodes + * flipped in orientation. + */ + +#include +#include +#include "handle.hpp" + +namespace vg { +namespace unittest { + +/** + * Return a copy of the given graph with approximately p_flip fraction of its + * nodes reversed in their local forward orientation. When a node is flipped, + * its sequence is reverse-complemented and all edges that connected to its + * forward orientation now connect to its reverse orientation, and vice versa. + * + * The returned graph preserves node IDs. 
+ */ +template +bdsg::HashGraph randomly_flipped_nodes(const HandleGraph& source, double p_flip, URNG& generator) { + bdsg::HashGraph result; + + std::uniform_real_distribution dist(0.0, 1.0); + + // Track which nodes get flipped + std::unordered_set flipped; + + // Copy all nodes, flipping some + source.for_each_handle([&](const handle_t& handle) { + nid_t id = source.get_id(handle); + if (dist(generator) < p_flip) { + // Flip this node: store its reverse complement sequence as forward + result.create_handle(source.get_sequence(source.flip(handle)), id); + flipped.insert(id); + } else { + // Keep this node as-is + result.create_handle(source.get_sequence(handle), id); + } + }); + + // Copy all edges, adjusting for flipped nodes. + // An edge (left, right) means: leave left in its orientation, enter right + // in its orientation. If we flipped a node, we need to toggle the + // orientation on that side of the edge. + source.for_each_edge([&](const edge_t& edge) { + handle_t left = edge.first; + handle_t right = edge.second; + + nid_t left_id = source.get_id(left); + bool left_is_reverse = source.get_is_reverse(left); + + nid_t right_id = source.get_id(right); + bool right_is_reverse = source.get_is_reverse(right); + + // If we flipped a node, toggle the orientation for that side + if (flipped.count(left_id)) { + left_is_reverse = !left_is_reverse; + } + if (flipped.count(right_id)) { + right_is_reverse = !right_is_reverse; + } + + result.create_edge( + result.get_handle(left_id, left_is_reverse), + result.get_handle(right_id, right_is_reverse) + ); + + return true; + }); + + return result; +} + +} // namespace unittest +} // namespace vg + +#endif diff --git a/src/unittest/support/snarl_decomposition_fuzzer.cpp b/src/unittest/support/snarl_decomposition_fuzzer.cpp new file mode 100644 index 0000000000..263ad486cf --- /dev/null +++ b/src/unittest/support/snarl_decomposition_fuzzer.cpp @@ -0,0 +1,187 @@ +#include "snarl_decomposition_fuzzer.hpp" + +#include +#include 
+ +namespace vg { +namespace unittest { + +using ET = DecompositionEventType; + +SnarlDecompositionFuzzer::SnarlDecompositionFuzzer( + const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + const std::unordered_set& chains_to_flip) + : HandleGraphSnarlFinder(graph), wrapped(finder) +{ + + should_flip = [chains_to_flip, graph](nid_t node_id) -> bool { + return chains_to_flip.count(node_id); + }; +} + +void SnarlDecompositionFuzzer::traverse_decomposition( + const function& begin_chain, + const function& end_chain, + const function& begin_snarl, + const function& end_snarl) const +{ + // Step 1: Capture all events from the wrapped finder. + std::vector events = capture_events(*wrapped); + + if (events.empty()) { + return; + } + + // Step 2: Build pairing vector mapping each begin to its matching end + // and vice versa, using separate stacks for chains and snarls. + std::vector other_bound(events.size()); + { + stack chain_stack, snarl_stack; + for (size_t i = 0; i < events.size(); i++) { + switch (events[i].type) { + case ET::BEGIN_CHAIN: + chain_stack.push(i); + break; + case ET::END_CHAIN: + assert(!chain_stack.empty()); + other_bound[i] = chain_stack.top(); + other_bound[chain_stack.top()] = i; + chain_stack.pop(); + break; + case ET::BEGIN_SNARL: + snarl_stack.push(i); + break; + case ET::END_SNARL: + assert(!snarl_stack.empty()); + other_bound[i] = snarl_stack.top(); + other_bound[snarl_stack.top()] = i; + snarl_stack.pop(); + break; + } + } + } + + // Step 3: Walk through events with a cursor, flipping chains as needed. + // When we flip a chain, we jump to the other end and reverse direction, + // pushing the entry point onto a stack. When the cursor reaches a stack + // entry point, we jump back to the far end and restore direction. 
+ struct FlipEntry { + size_t entry_index; + bool original_reverse; + }; + std::stack flip_stack; + + auto emitter = event_emitter(begin_chain, end_chain, begin_snarl, end_snarl); + + bool reverse = false; + for (size_t cursor = 0; cursor != events.size(); cursor += reverse ? -1 : 1) { + // We know if we're entering a chain, we can't be at a stack pop point. + // So we can handle those cases separately. + + if (events[cursor].type == (reverse ? ET::END_CHAIN : ET::BEGIN_CHAIN) && + should_flip(graph->get_id(events[cursor].handle))) { + + // We're entering a chain, and this is a chain we want to flip. So + // flip before emitting anything. + + // Flip: remember where we entered, jump to the other end, + // reverse direction, emit the entry event there. + flip_stack.push({cursor, reverse}); + cursor = other_bound[cursor]; + reverse = !reverse; + } + + // Emit the event here + emitter(reverse ? flip(events[cursor], graph) : events[cursor]); + + if (!flip_stack.empty() && cursor == flip_stack.top().entry_index) { + // We've returned to the entry point of a flipped chain, so after + // emitting, go back to the entry orientation and jump to the other + // side, so we can advance out of it. 
+ + FlipEntry entry = flip_stack.top(); + flip_stack.pop(); + cursor = other_bound[entry.entry_index]; + reverse = entry.original_reverse; + } + } +} + +// ReplaySnarlFinder implementation + +ReplaySnarlFinder::ReplaySnarlFinder(const HandleGraph* graph, const std::vector& events) : HandleGraphSnarlFinder(graph) { + this->events.reserve(events.size()); + for (const DecompositionEvent& e : events) { + // Translate input events into handles + this->events.emplace_back(e.type, graph->get_handle(e.id, e.is_reverse)); + } +} + +void ReplaySnarlFinder::traverse_decomposition( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl) const +{ + auto emitter = event_emitter(begin_chain, end_chain, begin_snarl, end_snarl); + for (auto& event : events) { + emitter(event); + } +} + +std::function event_emitter( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl +) { + return [&](const DecompositionHandleEvent& event) { + switch (event.type) { + case ET::BEGIN_CHAIN: + begin_chain(event.handle); + break; + case ET::END_CHAIN: + end_chain(event.handle); + break; + case ET::BEGIN_SNARL: + begin_snarl(event.handle); + break; + case ET::END_SNARL: + end_snarl(event.handle); + break; + } + }; +} + +std::vector capture_events(const HandleGraphSnarlFinder& finder, const HandleGraph& graph) { + // Get all the events in terms of handles + std::vector handle_result = capture_events(finder); + // And translate them to IDs and orientations + std::vector result; + result.reserve(handle_result.size()); + for (DecompositionHandleEvent& e : handle_result) { + result.emplace_back(e.type, graph.get_id(e.handle), graph.get_is_reverse(e.handle)); + } + return result; +} + +std::vector capture_events(const HandleGraphSnarlFinder& finder) { + std::vector result; + // Mint out functions that push events of different types. 
+ auto event_pusher = [&result](ET event) { + return [event,&result](const handle_t& h) { + result.push_back({event, h}); + }; + }; + finder.traverse_decomposition( + event_pusher(ET::BEGIN_CHAIN), + event_pusher(ET::END_CHAIN), + event_pusher(ET::BEGIN_SNARL), + event_pusher(ET::END_SNARL) + ); + return result; +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/support/snarl_decomposition_fuzzer.hpp b/src/unittest/support/snarl_decomposition_fuzzer.hpp new file mode 100644 index 0000000000..91d92e97cb --- /dev/null +++ b/src/unittest/support/snarl_decomposition_fuzzer.hpp @@ -0,0 +1,197 @@ +#ifndef VG_UNITTEST_SNARL_DECOMPOSITION_FUZZER_HPP_INCLUDED +#define VG_UNITTEST_SNARL_DECOMPOSITION_FUZZER_HPP_INCLUDED + +/** + * \file snarl_decomposition_fuzzer.hpp + * Provides SnarlDecompositionFuzzer, which wraps a HandleGraphSnarlFinder and + * randomly flips chains in the snarl decomposition, and ReplaySnarlFinder, + * which replays a scripted sequence of decomposition events. + */ + +#include +#include +#include +#include +#include +#include "snarls.hpp" +#include "handle.hpp" + +namespace vg { +namespace unittest { + +/// Event types for snarl decomposition traversal. +enum class DecompositionEventType { + BEGIN_CHAIN = 0, + END_CHAIN, + BEGIN_SNARL, + END_SNARL +}; + +inline std::ostream& operator<<(std::ostream& out, const DecompositionEventType& t) { + int bits = (int)t; + return out << (bits & 1 ? "END" : "BEGIN") << "_" << (bits & 2 ? "SNARL" : "CHAIN"); +} + +/// Flip the polarity of an event type (start vs. end) +inline DecompositionEventType flip(const DecompositionEventType& t) { + // We can flip by toggling the low bit. + return (DecompositionEventType)((int) t ^ 1); +} + +/// A single event in a snarl decomposition traversal. +/// This is in terms of IDs and orientations because those are easier to write in test code. 
+struct DecompositionEvent { + DecompositionEventType type; + nid_t id; + bool is_reverse; + + inline bool operator==(const DecompositionEvent& other) const { + return type == other.type && id == other.id && is_reverse == other.is_reverse; + } + + inline bool operator!=(const DecompositionEvent& other) const { + return ! (*this == other); + } +}; + +inline std::ostream& operator<<(std::ostream& out, const DecompositionEvent& e) { + return out << e.type << "(" << e.id << (e.is_reverse ? "-" : "+") << ")"; +} + +/// A single event in a snarl decomposition traversal. +/// This is in terms of handles because those are easier to work with internally. +struct DecompositionHandleEvent { + DecompositionEventType type; + handle_t handle; +}; + +/// Flip the polarity of a whole event (event type between begin and end, and handle orientation) +inline DecompositionHandleEvent flip(const DecompositionHandleEvent& e, const HandleGraph* g) { + return {flip(e.type), g->flip(e.handle)}; +} + +/// Turn begin and end functions to call into a function that emits an event by +/// type. The provided functions must outlive the returned function. +std::function event_emitter( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl +); + +/// Capture all events emitted by a snarl finder, in terms of IDs and orientations. +std::vector capture_events(const HandleGraphSnarlFinder& finder, const HandleGraph& graph); + +/// Capture all events emitted by a snarl finder, in terms of handles. +std::vector capture_events(const HandleGraphSnarlFinder& finder); + +/** + * A HandleGraphSnarlFinder that wraps another HandleGraphSnarlFinder and + * randomly flips chains in the snarl decomposition. Flipping a chain reverses + * the entire chain including all children; if a child chain is also selected + * for flipping, it gets flipped again (canceling the parent's flip for that + * child). 
+ * + * For non-randomized testing, the specific chains to flip can be + * pre-identified and provided on construction. + */ +class SnarlDecompositionFuzzer : public HandleGraphSnarlFinder { +public: + /** + * Construct a fuzzer wrapping the given finder, flipping chains with + * probability p_flip using the given random generator. + * The graph pointer is needed to flip handles. + */ + template + SnarlDecompositionFuzzer(const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + double p_flip, URNG& generator); + + /** + * Construct a fuzzer wrapping the given finder, flipping the chains + * bounded by the given node IDs. + * + * You should provide both bounding IDs for each chain, but only the one + * that the chain is actually arrived at through during the traversal will + * really get used. + * + * Note that a node can bound at most one chain. + * + * This is mostly for testing the fuzzer itself. + */ + SnarlDecompositionFuzzer(const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + const std::unordered_set& chains_to_flip); + + virtual ~SnarlDecompositionFuzzer() = default; + + /** + * Traverse the snarl decomposition, flipping selected chains. + */ + virtual void traverse_decomposition( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl + ) const override; + +private: + /// The wrapped snarl finder + const HandleGraphSnarlFinder* wrapped; + + /// Function that decides whether to flip a chain, given either of its + /// bounding node IDs. May produce different results when called + /// multiple times with the same input. + std::function should_flip; +}; + +/** + * A HandleGraphSnarlFinder that replays a scripted sequence of decomposition + * events. Useful for testing SnarlDecompositionFuzzer without needing a real + * graph or snarl finder. 
+ */ +class ReplaySnarlFinder : public HandleGraphSnarlFinder { +public: + /** + * Construct a replay finder that will emit the given events. + */ + ReplaySnarlFinder(const HandleGraph* graph, const std::vector& events); + + virtual ~ReplaySnarlFinder() = default; + + /** + * Replay the scripted events. + */ + virtual void traverse_decomposition( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl + ) const override; + +private: + + using EventType = DecompositionEventType; + using Event = DecompositionHandleEvent; + + /// This stores events we are going to replay. + std::vector events; +}; + + +template +SnarlDecompositionFuzzer::SnarlDecompositionFuzzer( + const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + double p_flip, URNG& generator) + : HandleGraphSnarlFinder(graph), wrapped(finder) +{ + should_flip = [&generator, p_flip](nid_t ignored) -> bool { + return std::uniform_real_distribution(0.0, 1.0)(generator) < p_flip; + }; +} + +} // namespace unittest +} // namespace vg + +#endif diff --git a/src/unittest/variant_adder.cpp b/src/unittest/variant_adder.cpp index afe3353e4b..6fad7d82ab 100644 --- a/src/unittest/variant_adder.cpp +++ b/src/unittest/variant_adder.cpp @@ -9,7 +9,7 @@ #include "../utility.hpp" #include "../path.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include #include @@ -38,7 +38,7 @@ ref 5 rs1337 A G 29 PASS . GT // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -51,14 +51,10 @@ ref 5 rs1337 A G 29 PASS . GT ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder @@ -85,7 +81,7 @@ ref 5 rs1337 A G 29 PASS . 
GT 0/1 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -98,14 +94,10 @@ ref 5 rs1337 A G 29 PASS . GT 0/1 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -139,7 +131,7 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . GT 0/1 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -152,14 +144,10 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . GT 0/1 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -193,7 +181,7 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . GT 0/1 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -213,14 +201,10 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . 
GT 0/1 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); SECTION ("should work when the graph is as given") { @@ -280,7 +264,7 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 29 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -293,14 +277,10 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 29 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -323,14 +303,10 @@ TEST_CASE( "The smart aligner works on very large inserts", "[variantadder]" ) { string graph_json = R"({ "node": [{"id": 1, "sequence": "GCGCAAAAAAAAAAAAAAAAAAAAAGCGC"}] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -396,21 +372,17 @@ TEST_CASE( "The smart aligner should use mapping offsets on huge deletions", "[v {"from": 2, "to": 3} ] })"; - + // Make the graph have lots of As stringstream a_stream; for(size_t i = 0; i < 10000; i++) { a_stream << "A"; } graph_json = regex_replace(graph_json, std::regex("<10kAs>"), a_stream.str()); - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -484,21 +456,17 @@ TEST_CASE( "The smart aligner should find existing huge 
deletions", "[variantadd {"from": 2, "to": 3} ] })"; - + // Make the graph have lots of As stringstream a_stream; for(size_t i = 0; i < 10000; i++) { a_stream << "A"; } graph_json = regex_replace(graph_json, std::regex("<10kAs>"), a_stream.str()); - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -564,21 +532,17 @@ TEST_CASE( "The smart aligner should use deletion edits on medium deletions", "[ string graph_json = R"({ "node": [{"id": 1, "sequence": "GCGC<100As>GCGC"}] })"; - + // Make the graph have lots of As stringstream a_stream; for(size_t i = 0; i < 100; i++) { a_stream << "A"; } graph_json = regex_replace(graph_json, std::regex("<100As>"), a_stream.str()); - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); diff --git a/src/unittest/vg.cpp b/src/unittest/vg.cpp index 9beb3e1ca7..b2795b57cc 100644 --- a/src/unittest/vg.cpp +++ b/src/unittest/vg.cpp @@ -8,6 +8,7 @@ #include "../utility.hpp" #include "../algorithms/normalize.hpp" #include "../algorithms/disjoint_components.hpp" +#include "../io/json2graph.hpp" #include "handle.hpp" namespace vg { @@ -15,16 +16,6 @@ namespace unittest { using namespace std; -// Turn a JSON string into a VG graph -VG string_to_graph(const string& json) { - VG graph; - Graph chunk; - json2pb(chunk, json.c_str(), json.size()); - graph.merge(chunk); - - return graph; -} - TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { unordered_map > node_translation; @@ -44,7 +35,7 @@ TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { )"; - VG graph = string_to_graph(graph_json); + VG 
graph; vg::io::json2graph(graph_json, &graph); VG dag = graph.dagify(5, node_translation, 5, 0); @@ -69,7 +60,7 @@ TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); VG dag = graph.dagify(5, node_translation, 5, 0); @@ -93,7 +84,7 @@ TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); VG dag = graph.dagify(5, node_translation, 5, 0); @@ -123,7 +114,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -252,7 +243,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -327,7 +318,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -417,7 +408,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -574,7 +565,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = 
graph.unfold(10000, node_translation); @@ -742,7 +733,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(2, node_translation); @@ -904,7 +895,7 @@ TEST_CASE("expand_context_by_length() should respect barriers", "[vg][context]") } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); SECTION("barriers on either end of the seed node should stop anything being extracted") { @@ -962,7 +953,7 @@ TEST_CASE("add_nodes_and_edges() should connect all nodes", "[vg][edit]") { )"; // Define a graph - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); const string path_json = R"( { @@ -1051,7 +1042,7 @@ TEST_CASE("edit() should not get confused even under very confusing circumstance )"; // Define a graph - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); // And a path that doubles back on itself through an edge that isn't in the graph yet const string path_json = R"( @@ -1310,7 +1301,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // One of the two alternative Ts should have been eliminated @@ -1341,7 +1332,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts should be eliminated @@ -1375,7 +1366,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those 
duplicate Ts and Gs should be eliminated @@ -1409,7 +1400,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts and Gs should be eliminated @@ -1447,7 +1438,7 @@ TEST_CASE("normalize() can join nodes and merge siblings when nodes are backward )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts (actually As) should be eliminated @@ -1486,7 +1477,7 @@ TEST_CASE("normalize() can join nodes and merge siblings when nodes are backward )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts (actually As) and Gs (actually Cs) should be eliminated diff --git a/src/unittest/vg_algorithms.cpp b/src/unittest/vg_algorithms.cpp index b4fc736734..8e713f87f7 100644 --- a/src/unittest/vg_algorithms.cpp +++ b/src/unittest/vg_algorithms.cpp @@ -27,7 +27,7 @@ #include "../vg.hpp" #include "../xg.hpp" #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" using namespace google::protobuf; @@ -1092,11 +1092,8 @@ TEST_CASE( "Connecting graph extraction works on a cool loop without leaving ext {"edge": [{"from": "185927720", "to": "185927722"}, {"from": "185927721", "from_start": true, "to": "185927722"}, {"from": "185927722", "to": "186681786", "to_end": true}, {"from": "185927722", "to": "185927723"}, {"from": "186681786", "to": "186683083"}, {"from": "186681786", "from_start": true, "to": "186681787", "to_end": true}, {"from": "186681787", "to": "186683069", "to_end": true}, {"from": "186681787", "from_start": true, "to": "186681789"}, {"from": "186681787", "from_start": true, "to": "186681788", "to_end": true}, {"from": "186681788", "from_start": true, "to": "186681790", "to_end": true}, {"from": 
"186681789", "to": "186681790", "to_end": true}, {"from": "186681790", "from_start": true, "to": "186681792", "to_end": true}, {"from": "186683069", "from_start": true, "to": "186683079", "to_end": true}, {"from": "186683079", "from_start": true, "to": "186683080", "to_end": true}, {"from": "186683080", "from_start": true, "to": "186683081", "to_end": true}, {"from": "186683081", "from_start": true, "to": "186683083", "to_end": true}], "node": [{"id": "185927720", "sequence": "G"}, {"id": "185927721", "sequence": "A"}, {"id": "185927722", "sequence": "ACCGGG"}, {"id": "185927723", "sequence": "AGTGGGGG"}, {"id": "186681786", "sequence": "C"}, {"id": "186681787", "sequence": "TGGGAGTCTAAGTCTCTTTTGATCACACTTTAAAGACCAAAAGGTAGAAGCGCAAAGACGTTATCTGTCCAATATTACAAACCTAGTAAGTGGTGGAATTTGGCCTTGAACCCAGATCTGTAACTCCAGAGCCGAAGTGCTTCACCCACCTCCCTGTGGTG"}, {"id": "186681788", "sequence": "G"}, {"id": "186681789", "sequence": "T"}, {"id": "186681790", "sequence": "TAT"}, {"id": "186681792", "sequence": "T"}, {"id": "186683069", "sequence": "G"}, {"id": "186683079", "sequence": "G"}, {"id": "186683080", "sequence": "TACCCCGGAATCCCTGCCGCGGCCCCTCGGGCCTGTCCACATCCCTCTGCCCCTCCCAGACCTCTGTCCTTCCACCAATCGCCTCCCGCAGCCCCGAGCCGCCACTCCCAGTCCCCCGAGTCCCTGCCGCGCGCCCTCGCGCCTGTCCACATCCCTCTGCCCATCCGAGACCTCTGTCCTTACACCACTAGCCACCCCACGTGGGACTTCCATGGCTTCTGAGTACAAGGCCAGCCCCCCGGCCCACCAGCTTTCGGAATGCCTGCTTACCTCTTTTTCTGTAGA"}, {"id": "186683081", "sequence": "CCGG"}, {"id": "186683083", "sequence": "C"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG vg; - vg.extend(source); + vg::io::json2graph(graph_json, &vg); bdsg::HashGraph extractor; @@ -1688,11 +1685,8 @@ TEST_CASE( "Connecting graph extraction works on a particular case without leavi )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG vg; - vg.extend(source); + vg::io::json2graph(graph_json, &vg); VG extractor; @@ -2583,13 +2577,9 @@ TEST_CASE( "Topological sort works on a more complex 
graph", {"node": [{"id": 1, "sequence": "GTATTTTTAGTA"}, {"id": 2, "sequence": "G"}, {"id": 3, "sequence": "GAGACGGGGTTTCACCATGTT"}, {"id": 4, "sequence": "T"}, {"id": 5, "sequence": "CTAATTTTT"}, {"id": 6, "sequence": "CA"}, {"id": 7, "sequence": "GG"}, {"id": 8, "sequence": "ACGCCC"}, {"id": 9, "sequence": "C"}, {"id": 10, "sequence": "T"}, {"id": 11, "sequence": "C"}, {"id": 12, "sequence": "GCCA"}, {"id": 13, "sequence": "A"}, {"id": 14, "sequence": "GGGATTACAGGCGCACACC"}, {"id": 15, "sequence": "CCACACC"}, {"id": 16, "sequence": "AT"}, {"id": 17, "sequence": "CC"}, {"id": 18, "sequence": "GGTCAGGCTGGTCTCGACTCC"}, {"id": 19, "sequence": "TGACCTCCTGATCTGCCCCCC"}, {"id": 20, "sequence": "A"}, {"id": 21, "sequence": "G"}, {"id": 22, "sequence": "TATTTTTAGTA"}, {"id": 23, "sequence": "A"}, {"id": 24, "sequence": "G"}, {"id": 25, "sequence": "GA"}], "edge": [{"from": 4, "to": 1}, {"from": 5, "to": 1}, {"from": 1, "to": 2}, {"from": 1, "to": 3}, {"from": 22, "to": 2}, {"from": 2, "to": 20}, {"from": 2, "to": 21}, {"from": 3, "to": 18}, {"from": 5, "to": 4}, {"from": 6, "to": 5}, {"from": 7, "to": 5}, {"from": 8, "to": 6}, {"from": 8, "to": 7}, {"from": 9, "to": 8}, {"from": 10, "to": 8}, {"from": 11, "to": 9}, {"from": 11, "to": 10}, {"from": 12, "to": 11}, {"from": 13, "to": 11}, {"from": 16, "to": 12}, {"from": 17, "to": 12}, {"from": 12, "to": 15}, {"from": 14, "to": 13}, {"from": 18, "to": 19}, {"from": 20, "to": 25}, {"from": 21, "to": 25}, {"from": 23, "to": 22}, {"from": 24, "to": 22}]} )"; - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG + // Load the JSON into a VG VG vg; - vg.extend(proto_graph); + vg::io::json2graph(graph_json, &vg); SECTION( "handlealgs::topological_order produces a consistent total ordering and orientation" ) { auto handle_sort = handlealgs::topological_order(&vg); @@ -5385,11 +5375,8 @@ TEST_CASE("simplify_siblings() works on a graph with a reversing self 
loop", "[a {"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "2"}, {"from": "2", "to": "2", "to_end": true}], "node": [{"id": "1", "sequence": "T"}, {"id": "2", "sequence": "A"}, {"id": "3", "sequence": "ACA"}], "path": [{"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "2"}, "rank": "2"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"is_reverse": true, "node_id": "2"}, "rank": "3"}], "name": "x"}, {"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 3, "to_length": 3}], "position": {"node_id": "3"}, "rank": "2"}], "name": "y"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG graph; - graph.extend(source); + vg::io::json2graph(graph_json, &graph); @@ -5405,11 +5392,8 @@ TEST_CASE("simplify_siblings() works on a smaller graph with a reversing self lo {"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "2"}, {"from": "2", "to": "2", "to_end": true}], "node": [{"id": "1", "sequence": "T"}, {"id": "2", "sequence": "A"}, {"id": "3", "sequence": "A"}], "path": [{"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "2"}, "rank": "2"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"is_reverse": true, "node_id": "2"}, "rank": "3"}], "name": "x"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG graph; - graph.extend(source); + vg::io::json2graph(graph_json, &graph); @@ -5425,11 +5409,8 @@ TEST_CASE("normalize() works on a graph with a reversing self loop", "[algorithm {"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "2"}, {"from": "2", "to": "2", "to_end": true}], "node": [{"id": "1", "sequence": "T"}, {"id": "2", "sequence": "A"}, {"id": 
"3", "sequence": "ACA"}], "path": [{"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "2"}, "rank": "2"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"is_reverse": true, "node_id": "2"}, "rank": "3"}], "name": "x"}, {"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 3, "to_length": 3}], "position": {"node_id": "3"}, "rank": "2"}], "name": "y"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG graph; - graph.extend(source); + vg::io::json2graph(graph_json, &graph); diff --git a/src/unittest/vpkg.cpp b/src/unittest/vpkg.cpp index 51a849c446..977814ff9c 100644 --- a/src/unittest/vpkg.cpp +++ b/src/unittest/vpkg.cpp @@ -13,7 +13,7 @@ #include "xg.hpp" #include "../vg.hpp" #include "../snarl_seed_clusterer.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include #include #include @@ -50,12 +50,12 @@ TEST_CASE("We can read and write XG", "[vpkg][handlegraph][xg]") { )"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph hash_graph; + vg::io::json2graph(graph_json, &hash_graph); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(hash_graph); stringstream ss; @@ -148,13 +148,10 @@ TEST_CASE("We can read VG from a VPKG-wrapped stream as a VG", "[vpkg][handlegra {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; @@ -179,13 +176,10 @@ TEST_CASE("We can read VG from a 
VPKG-wrapped stream as a HandleGraph which is a {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; @@ -210,13 +204,10 @@ TEST_CASE("We can read VG from a VPKG-wrapped stream as a HandleGraph which is a TEST_CASE("We can read an empty VG as a HandleGraph", "[vpkg][handlegraph][vg][empty]") { string graph_json = "{}"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; @@ -240,13 +231,10 @@ TEST_CASE("We prefer to read a graph as the first provided type that matches", " {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; diff --git a/src/unittest/xdrop_aligner.cpp b/src/unittest/xdrop_aligner.cpp index f745b8f66a..07577e4479 100644 --- a/src/unittest/xdrop_aligner.cpp +++ b/src/unittest/xdrop_aligner.cpp @@ -5,7 +5,7 @@ #include #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../alignment.hpp" #include "../vg.hpp" #include @@ -764,12 +764,9 @@ TEST_CASE("QualAdjXdropAligner will not penalize a low quality mismatch", "[xdro TEST_CASE("XdropAligner doesn't crash on a case where it is hard to find a seed", "[xdrop][alignment][mapping]") { string graph_json = R"({"edge": [{"from": "92345167", "to": "92345168"}, {"from": 
"92345182", "to": "92345183"}, {"from": "92345165", "to": "92345166"}, {"from": "92345177", "to": "92345178"}, {"from": "92345171", "to": "92345172"}, {"from": "92345161", "to": "92345162"}, {"from": "92345183", "to": "92345184"}, {"from": "92345181", "to": "92345182"}, {"from": "92345178", "to": "92345179"}, {"from": "92345166", "to": "92345167"}, {"from": "92345179", "to": "92345180"}, {"from": "92345173", "to": "92345174"}, {"from": "92345184", "to": "92345185"}, {"from": "92345169", "to": "92345170"}, {"from": "92345185", "to": "92345186"}, {"from": "92345160", "to": "92345161"}, {"from": "92345174", "to": "92345175"}, {"from": "92345162", "to": "92345163"}, {"from": "92345175", "to": "92345176"}, {"from": "92345168", "to": "92345169"}, {"from": "92345163", "to": "92345164"}, {"from": "92345172", "to": "92345173"}, {"from": "92345180", "to": "92345181"}, {"from": "92345176", "to": "92345177"}, {"from": "92345170", "to": "92345171"}, {"from": "92345164", "to": "92345165"}], "node": [{"id": "92345167", "sequence": "TTTATATATATATATTTATATATATATATTTA"}, {"id": "92345182", "sequence": "TATATATATTTATATATATATTTATATATATA"}, {"id": "92345165", "sequence": "ATATATATATATTTATATATATTTATATATTA"}, {"id": "92345177", "sequence": "TTTATATATATATTTATATATATATATTATAT"}, {"id": "92345171", "sequence": "TTATATATATATTTATATATATATTTATATAT"}, {"id": "92345161", "sequence": "ATATATTTATATATTTTTATATATTATATATT"}, {"id": "92345183", "sequence": "TTTATATATATTTATATATATATTTATATATA"}, {"id": "92345181", "sequence": "ATATATTATATATATATTTATATATATATTTA"}, {"id": "92345178", "sequence": "ATATATTTATATATATATTTATATATATATTT"}, {"id": "92345166", "sequence": "TTTATATATATTTATATATATATTTATATATA"}, {"id": "92345179", "sequence": "ATATATATATTTATATATATATTTATATATAT"}, {"id": "92345173", "sequence": "ATATTTATATATATATATTTATATATATATTT"}, {"id": "92345184", "sequence": "TATTTATATATATATTTATATATATTTATATA"}, {"id": "92345169", "sequence": "TTTATATATATATTTATATATATATTTATATA"}, {"id": "92345185", "sequence": 
"TATATTTATATATATATATATATATTTATATA"}, {"id": "92345160", "sequence": "ATTTATATATATATTTATATATATATTTATAT"}, {"id": "92345174", "sequence": "ATATATATATTTATATATATATTATTTATATA"}, {"id": "92345162", "sequence": "TATATATATATTTATATATTATATATATATTT"}, {"id": "92345175", "sequence": "TATATTTATATATATATTATATATATATTTAT"}, {"id": "92345168", "sequence": "TATATATATTTATATATATATTTATATATATA"}, {"id": "92345163", "sequence": "ATATATTTATATATATATTTATATATATTTAT"}, {"id": "92345172", "sequence": "ATATATATATATTTATATATATATTTATATAT"}, {"id": "92345180", "sequence": "ATTTATATATATATTTATATATATATTTATAT"}, {"id": "92345176", "sequence": "ATATATATATTATATATATATTTATATATATA"}, {"id": "92345170", "sequence": "TATATTTATATATATATATTATATATATATAT"}, {"id": "92345164", "sequence": "ATATATATTTATATATATTTATATATATATTT"}, {"id": "92345186", "sequence": "TATATTTATATATATTTATATATATATTTATA"}]})"; - - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - - VG graph; - graph.extend(source); + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); Alignment aln; aln.set_sequence("CAGCACTTTGGGAGGCCAAGGTGGGTGGATCATCTGAGGTCAGGAGTTTGAGACCAGCCTGACCAACATGGTGAAATCCTGTCTCTACTGAAAATACTAAAATTAGCCAGGCGTGGCGGCCAGTGCCTGTAATCCCGGCTACTGGGGAGG"); diff --git a/src/unittest/xg.cpp b/src/unittest/xg.cpp index d74db5d0b0..dfa913b8eb 100644 --- a/src/unittest/xg.cpp +++ b/src/unittest/xg.cpp @@ -8,7 +8,9 @@ #include "vg.hpp" #include "xg.hpp" #include "graph.hpp" +#include "../io/json2graph.hpp" #include "algorithms/subgraph.hpp" +#include "bdsg/hash_graph.hpp" #include namespace vg { @@ -22,19 +24,18 @@ TEST_CASE("We can build an xg index on a nice graph", "[xg]") { {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + 
xg_index.from_path_handle_graph(source); VG vg_graph; algorithms::extract_context(xg_index, vg_graph, xg_index.get_handle(1), 0, 100); Graph& graph = vg_graph.graph; - sort_by_id_dedup_and_clean(graph); REQUIRE(graph.node_size() == 2); REQUIRE(graph.edge_size() == 1); @@ -49,19 +50,18 @@ TEST_CASE("We can build an xg index on a nasty graph", "[xg]") { {"id":9999,"sequence":"AAA"}], "edge":[{"to":2,"from":1}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); VG vg_graph; algorithms::extract_context(xg_index, vg_graph, xg_index.get_handle(1), 0, 100); Graph& graph = vg_graph.graph; - sort_by_id_dedup_and_clean(graph); REQUIRE(graph.node_size() == 2); REQUIRE(graph.edge_size() == 1); @@ -161,15 +161,14 @@ TEST_CASE("We can build an xg index on a very nasty graph", "[xg]") { {"position":{"node_id":1444},"rank":1059}, {"position":{"node_id":1445},"rank":1060}]}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + VG source; + vg::io::json2graph(graph_json, &source); - sort_by_id_dedup_and_clean(proto_graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); SECTION("Context extraction gets something") { VG graph; @@ -182,7 +181,7 @@ TEST_CASE("We can build an xg index on a very nasty graph", "[xg]") { SECTION("We can extract within a single node") { algorithms::extract_path_range(xg_index, xg_index.get_path_handle("17"), 5, 15, graph); - + // We should just get node 1416 REQUIRE(graph.graph.node_size() == 1); REQUIRE(graph.graph.node(0).id() == 1416); @@ -265,14 +264,14 @@ TEST_CASE("We can build and scan an XG index for a problematic graph", "[xg]") { ]} ]} )"; - + 
// Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); // Build the xg index (without any sorting) xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); REQUIRE(xg_index.get_node_count() == 5); @@ -300,18 +299,16 @@ TEST_CASE("We can build the xg index on a small graph with discontinuous node id )"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + VG source; + vg::io::json2graph(graph_json, &source); - sort_by_id_dedup_and_clean(proto_graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); VG vg_graph; algorithms::extract_context(xg_index, vg_graph, xg_index.get_handle(10), 0, 100); Graph& graph = vg_graph.graph; - sort_by_id_dedup_and_clean(graph); REQUIRE(graph.node_size() == 2); REQUIRE(graph.edge_size() == 1); @@ -326,14 +323,14 @@ TEST_CASE("Looping over XG handles in parallel works", "[xg]") { {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); size_t count = 0; @@ -341,7 +338,7 @@ TEST_CASE("Looping over XG handles in parallel works", "[xg]") { #pragma omp critical count++; }, true); - + REQUIRE(count == 2); } @@ -400,14 +397,14 @@ TEST_CASE("Vectorization of xg works correctly", "[xg]") { {"edit": [{"from_length": 11, "to_length": 11}], "position": {"node_id": "15"}, "rank": "10"} ], "name": "x"}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + bdsg::HashGraph source; + 
vg::io::json2graph(graph_json, &source); // Build the xg index (without any sorting) xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); REQUIRE(xg_index.get_node_count() == 15); diff --git a/src/unittest/zip_code.cpp b/src/unittest/zip_code.cpp index dc3255e984..1d0a2c39c7 100644 --- a/src/unittest/zip_code.cpp +++ b/src/unittest/zip_code.cpp @@ -117,6 +117,10 @@ using namespace std; bool chain_is_reversed = distance_index.is_reversed_in_parent( distance_index.get_node_net_handle(n1->id())); + // Node 4 is in snarl 3 to 6 which should be regular. + // The zip codes are going to encode this so it had better be true. + REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n4->id()))))); + SECTION ("zip code for node on top-level chain") { ZipCode zipcode; zipcode.fill_in_zipcode_from_pos(distance_index, make_pos_t(n1->id(), 0, false)); diff --git a/src/zip_code.cpp b/src/zip_code.cpp index 4699a24494..051602443f 100644 --- a/src/zip_code.cpp +++ b/src/zip_code.cpp @@ -1,3 +1,5 @@ +#include "crash.hpp" + #include "zip_code.hpp" //#define DEBUG_ZIPCODE @@ -16,10 +18,11 @@ void ZipCode::fill_in_zipcode_from_pos(const SnarlDistanceIndex& distance_index, //Put all ancestors of the node in a vector, starting from the node, and not including the root while (!distance_index.is_root(current_handle)) { ancestors.emplace_back(distance_index.start_end_traversal_of(current_handle)); - current_handle = distance_index.get_parent(current_handle); + net_handle_t parent_handle = distance_index.get_parent(current_handle); + crash_unless(parent_handle != current_handle); + current_handle = parent_handle; } - //Now add the root-level snarl or chain if (distance_index.is_root_snarl(current_handle)) { //First thing is a snarl, so add the snarl's connected component number @@ -121,7 +124,7 @@ void ZipCode::fill_in_zipcode_from_pos(const SnarlDistanceIndex& 
distance_index, } return; } - } else if (distance_index.is_regular_snarl(current_ancestor, false, graph_ptr)) { + } else if (distance_index.is_regular_snarl(current_ancestor)) { snarl_code_t snarl_code = get_regular_snarl_code(current_ancestor, ancestors[i-1], distance_index); zipcode.add_value(snarl_code.get_raw_code_type()); zipcode.add_value(snarl_code.get_raw_prefix_sum_or_identifier()); @@ -1065,11 +1068,7 @@ ZipCode::snarl_code_t ZipCode::get_regular_snarl_code(const net_handle_t& snarl, snarl_code.set_code_type(1); //The number of children - size_t child_count = 0; - distance_index.for_each_child(snarl, [&] (const net_handle_t& child) { - child_count++; - }); - snarl_code.set_child_count(child_count); + snarl_code.set_child_count(distance_index.get_snarl_child_count(snarl)); //Chain prefix sum value for the start of the snarl, which is the prefix sum of the start node + length of the start node net_handle_t start_node = distance_index.get_node_from_sentinel(distance_index.get_bound(snarl, false, false)); @@ -1100,11 +1099,7 @@ ZipCode::snarl_code_t ZipCode::get_irregular_snarl_code(const net_handle_t& snar snarl_code.set_code_type(distance_index.is_dag(snarl) ? 0 : 2); //The number of children - size_t child_count = 0; - distance_index.for_each_child(snarl, [&] (const net_handle_t& child) { - child_count++; - }); - snarl_code.set_child_count(child_count); + snarl_code.set_child_count(distance_index.get_snarl_child_count(snarl)); //Chain prefix sum value for the start of the snarl, which is the prefix sum of the start node + length of the start node net_handle_t start_node = distance_index.get_node_from_sentinel(distance_index.get_bound(snarl, false, false));