diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index d85a334493..2937a2d93f 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -442,6 +442,41 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: # (per-layer ghost-feature MPI exchange via deepmd_export::border_op). # The C++ DeepPotPTExpt / DeepSpinPTExpt loaders branch on this flag. meta["has_comm_artifact"] = _needs_with_comm_artifact(model) + + # Whether the model's regular .pt2 graph consumes the ``mapping`` + # tensor to gather per-layer ghost-atom features from local atoms. + # Mirrors the descriptor's ``has_message_passing()`` API: True for + # any message-passing descriptor (DPA2, DPA3, hybrids over those); + # False for non-message-passing descriptors (se_e2_a, DPA1, etc.). + # The C++ side gates its fail-fast on this — an absent mapping is + # fatal only for models that would silently corrupt ghost features + # otherwise. + # + # Lookup order: model -> atomic_model -> descriptor. Going through + # ``atomic_model.has_message_passing()`` is important for composite + # atomic models (e.g. ``LinearAtomicModel`` in DP-ZBL) which don't + # expose a single ``.descriptor`` but do aggregate the flag across + # their sub-models. ``descriptor.has_message_passing()`` is the + # fallback for any future wrapper that lacks the higher-level + # methods. 
+ def _probe_has_message_passing(obj: object) -> bool | None: + if obj is None or not hasattr(obj, "has_message_passing"): + return None + try: + return bool(obj.has_message_passing()) + except (AttributeError, NotImplementedError): + return None + + result: bool | None = None + for obj in ( + model, + getattr(model, "atomic_model", None), + getattr(getattr(model, "atomic_model", None), "descriptor", None), + ): + result = _probe_has_message_passing(obj) + if result is not None: + break + meta["has_message_passing"] = result if result is not None else False return meta diff --git a/source/api_c/include/c_api.h b/source/api_c/include/c_api.h index 534ae94403..358480b0ad 100644 --- a/source/api_c/include/c_api.h +++ b/source/api_c/include/c_api.h @@ -52,6 +52,9 @@ extern DP_Nlist* DP_NewNlist(int inum_, * each swap. * @param[in] world Pointer to the MPI communicator or similar communication * world used for the operation. + * @param[in] nprocs Number of MPI ranks (1 = single-rank). Used by + * ``DeepPotPTExpt`` / ``DeepSpinPTExpt`` to choose between the regular + * and with-comm artifacts. Required by this C signature (C has no + * default arguments); only the C++ ``InputNlist`` constructors default + * it to 1. * @returns A pointer to the initialized neighbor list with communication * capabilities. */ @@ -66,7 +69,8 @@ extern DP_Nlist* DP_NewNlist_comm(int inum_, int** sendlist, int* sendproc, int* recvproc, - void* world); + void* world, + int nprocs); /** * @brief Set mask for a neighbor list. diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp index 2f120ee86c..c3ca40b75f 100644 --- a/source/api_c/include/deepmd.hpp +++ b/source/api_c/include/deepmd.hpp @@ -831,7 +831,8 @@ struct InputNlist { int** sendlist, int* sendproc, int* recvproc, - void* world) + void* world, + int nprocs = 1) : inum(inum_), ilist(ilist_), numneigh(numneigh_), @@ -847,7 +848,8 @@ struct InputNlist { sendlist, sendproc, recvproc, - world)) {}; + world, + nprocs)) {}; ~InputNlist() { DP_DeleteNlist(nl); }; /// @brief C API neighbor list. 
DP_Nlist* nl; diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc index 3646eb33c5..b0e789648e 100644 --- a/source/api_c/src/c_api.cc +++ b/source/api_c/src/c_api.cc @@ -35,10 +35,11 @@ DP_Nlist* DP_NewNlist_comm(int inum_, int** sendlist, int* sendproc, int* recvproc, - void* world) { + void* world, + int nprocs) { deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_, nswap, sendnum, - recvnum, firstrecv, sendlist, sendproc, recvproc, - world); + recvnum, firstrecv, sendlist, sendproc, recvproc, world, + nprocs); DP_Nlist* new_nl = new DP_Nlist(nl); return new_nl; } diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index 3559702f6a..f56ade376c 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -226,6 +226,15 @@ class DeepPotPTExpt : public DeepPotBackend { // passing. ``with_comm_tempfile_`` owns the extracted nested .pt2 // for the lifetime of ``with_comm_loader``. bool has_comm_artifact_ = false; + // Whether the regular .pt2 graph consumes the mapping tensor for + // ghost-feature gather (true for any message-passing descriptor: + // DPA2/DPA3/hybrids; false for se_e2_a/DPA1/etc.). Mirrors the + // descriptor's ``has_message_passing()`` API; read from the + // ``has_message_passing`` metadata field. Defaults to false for + // pre-PR .pt2 archives that lack the field so non-GNN archives + // continue to work; GNN archives must be regenerated to opt into + // the fail-fast guard against the silent-corruption bug. 
+ bool has_message_passing_ = false; std::unique_ptr with_comm_tempfile_; std::unique_ptr with_comm_loader; diff --git a/source/api_cc/include/DeepSpinPTExpt.h b/source/api_cc/include/DeepSpinPTExpt.h index cc1304c69e..5cace36ad1 100644 --- a/source/api_cc/include/DeepSpinPTExpt.h +++ b/source/api_cc/include/DeepSpinPTExpt.h @@ -196,6 +196,9 @@ class DeepSpinPTExpt : public DeepSpinBackend { std::unique_ptr loader; // Optional with-comm artifact for multi-rank GNN spin inference. bool has_comm_artifact_ = false; + // Mirrors descriptor's has_message_passing(). See DeepPotPTExpt.h + // for the full rationale and gating role. + bool has_message_passing_ = false; std::unique_ptr with_comm_tempfile_; std::unique_ptr with_comm_loader; diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 910c2f6f7a..880eaabeb0 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -172,6 +172,18 @@ void DeepPotPTExpt::init(const std::string& model, // exchange and producing wrong results. has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && metadata["has_comm_artifact"].as_bool(); + // Whether the regular .pt2 graph consumes ``mapping`` for ghost-atom + // feature gather. Mirrors the descriptor's ``has_message_passing()`` + // API: true for message-passing descriptors (DPA2, DPA3, hybrids + // over those), false for non-message-passing descriptors (se_e2_a, + // DPA1, etc.). Pre-PR .pt2 archives lack this field; default to + // false so they retain their previous behaviour (non-GNN archives + // continue to work; GNN archives that had the original + // silent-corruption bug must be regenerated to opt into the fail- + // fast guard). All in-tree fixtures are regenerated by the gen + // scripts and carry the explicit value. 
+ has_message_passing_ = metadata.obj_val.count("has_message_passing") && + metadata["has_message_passing"].as_bool(); if (has_comm_artifact_) { try { // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a @@ -353,6 +365,51 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, .clone() .to(device); + // Dispatch decision: use the with-comm artifact when LAMMPS is running + // multi-rank. ``lmp_list.nprocs > 1`` is the direct predicate; + // LAMMPS pair styles populate it by passing ``comm->nprocs`` to the + // ``InputNlist`` constructor. Earlier drafts used ``nswap > 0`` as a + // proxy, but that breaks for ``atom_style spin`` (which emits + // nswap > 0 even in single-rank to propagate PBC ghost spins). + // ``nprocs`` is unambiguous. + // + // The regular artifact uses ``mapping`` to gather ghost-atom features + // from local-atom embeddings (``index_select(node_ebd[1, nloc, dim], + // mapping)``). Identity-mapping for ghost slots is silently wrong, + // so fail-fast when the regular path would be taken without a real + // mapping — applies uniformly to every caller (LAMMPS pair, ctest + // fixtures, direct C++ API users). Callers that want the regular + // path must populate ``lmp_list.mapping``. + bool multi_rank = (lmp_list.nprocs > 1); + bool atom_map_present = (lmp_list.mapping != nullptr); + bool use_with_comm = has_comm_artifact_ && multi_rank; + // Decision matrix (see PR #5450 description): + // non-GNN model (has_message_passing_ == false): regular path is + // always safe. + // nghost == 0 (NoPbc, isolated cluster): always safe. 
+ // GNN model, multi-rank: requires has_comm_artifact_ (cell C-mr / D-mr) + // else fail-fast (cell B-mr) + // GNN model, single-rank: requires atom_map_present (cell A / C) + // else fail-fast (cell B / D) + if (has_message_passing_ && nghost > 0) { + if (multi_rank && !has_comm_artifact_) { + throw deepmd::deepmd_exception( + "Multi-rank LAMMPS .pt2 inference requires the model to be " + "exported with `use_loc_mapping=False`, which compiles a " + "with-comm artifact for cross-rank ghost-feature exchange. " + "Re-export the model with use_loc_mapping=False and try again."); + } + if (!multi_rank && !atom_map_present) { + throw deepmd::deepmd_exception( + "Single-rank LAMMPS .pt2 inference requires `atom_modify map " + "yes` in the LAMMPS input (so InputNlist.mapping is populated " + "from the LAMMPS atom-map). The model gathers ghost-atom " + "features via this mapping; without it the C++ side has no " + "safe way to resolve ghost indices to local owners. C++ API " + "callers must set inlist.mapping explicitly before compute()."); + } + } + // LAMMPS sets ago=0 on every nlist rebuild (neighbor rebuild, re-partition, // atom exchange between subdomains), so `ago > 0` implies the cached // mapping and nlist tensors are still valid. Rebuild only on ago==0. @@ -372,7 +429,15 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, .clone() .to(device); } else { - // Default identity mapping for local atoms + // Identity fallback. The fail-fast above guarantees we only + // reach this branch when one of these is true: + // - The model is non-message-passing (mapping is unused). + // - ``nghost == 0`` (no ghosts to gather, identity is trivially + // correct). + // - ``use_with_comm`` is true (the with-comm graph fills ghost + // features via border_op and ignores this tensor for ghost + // gather — see deepmd/pt_expt/descriptor/ + // repflows.py::_exchange_ghosts). 
std::vector mapping(nall_real); for (int ii = 0; ii < nall_real; ii++) { mapping[ii] = ii; @@ -428,14 +493,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, aparam_tensor = torch::zeros({0}, options).to(device); } - // Phase 4 dispatch: use the with-comm artifact when LAMMPS is - // running multi-rank. ``lmp_list.nswap > 0`` is the proxy for - // "multi-rank with cross-domain communication"; in single-rank - // mode LAMMPS sets nswap=0. Falling back to the regular artifact - // for nswap=0 is correct because that artifact uses the mapping - // tensor to gather ghost embeddings from local atoms. + // ``use_with_comm`` was computed earlier alongside the fail-fast + // dispatch check. Use the with-comm artifact for the multi-rank case + // (the regular artifact uses the mapping tensor to gather ghost + // embeddings, which only works in single-rank). std::vector flat_outputs; - bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; if (use_with_comm && !with_comm_loader) { throw deepmd::deepmd_exception( "Multi-rank LAMMPS requires the with-comm artifact, but it failed " diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index 2ac4369f5f..dac87369d9 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -179,6 +179,10 @@ void DeepSpinPTExpt::init(const std::string& model, // dropping the MPI exchange. has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && metadata["has_comm_artifact"].as_bool(); + // See DeepPotPTExpt::init for rationale. Defaults to false for + // pre-PR archives so they retain their previous behaviour. + has_message_passing_ = metadata.obj_val.count("has_message_passing") && + metadata["has_message_passing"].as_bool(); if (has_comm_artifact_) { try { with_comm_tempfile_ = std::make_unique( @@ -372,6 +376,46 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, .clone() .to(device); + // Dispatch decision: see DeepPotPTExpt.cc for the full rationale. 
+ // Single-rank without atom-map cannot drive the regular path (no safe + // ghost→local mapping); multi-rank without a with-comm artifact cannot + // drive border_op (no inter-rank exchange tensor). Both unsupported + // combinations fail-fast for every caller. + // ``nprocs > 1`` is the direct multi-rank predicate (LAMMPS pair + // styles set it by passing ``comm->nprocs`` to the ``InputNlist`` + // constructor). Earlier drafts used ``nswap > 0`` as a proxy, but + // atom_style spin emits nswap > 0 even in single-rank, so the proxy + // is unsound. + bool multi_rank = (lmp_list.nprocs > 1); + bool atom_map_present = (lmp_list.mapping != nullptr); + bool use_with_comm = has_comm_artifact_ && multi_rank; + // Decision matrix (see PR #5450 description): + // non-GNN model (has_message_passing_ == false): regular path is + // always safe. + // nghost == 0 (NoPbc, isolated cluster): always safe. + // GNN model, multi-rank: requires has_comm_artifact_ (cell C-mr / D-mr) + // else fail-fast (cell B-mr) + // GNN model, single-rank: requires atom_map_present (cell A / C) + // else fail-fast (cell B / D) + if (has_message_passing_ && nghost > 0) { + if (multi_rank && !has_comm_artifact_) { + throw deepmd::deepmd_exception( + "Multi-rank LAMMPS .pt2 inference requires the model to be " + "exported with `use_loc_mapping=False`, which compiles a " + "with-comm artifact for cross-rank ghost-feature exchange. " + "Re-export the model with use_loc_mapping=False and try again."); + } + if (!multi_rank && !atom_map_present) { + throw deepmd::deepmd_exception( + "Single-rank LAMMPS .pt2 inference requires `atom_modify map " + "yes` in the LAMMPS input (so InputNlist.mapping is populated " + "from the LAMMPS atom-map). The model gathers ghost-atom " + "features via this mapping; without it the C++ side has no " + "safe way to resolve ghost indices to local owners. 
C++ API " + "callers must set inlist.mapping explicitly before compute()."); + } + } + // LAMMPS sets ago=0 on every nlist rebuild, so ago>0 implies the cached // mapping and nlist tensors are still valid — see DeepPotPTExpt.cc for // the same rationale. @@ -391,6 +435,11 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, .clone() .to(device); } else { + // Identity fallback. See DeepPotPTExpt::compute_inner for the + // invariant rationale: this branch is only reached when the + // model is non-message-passing, nghost==0, or use_with_comm is + // true (border_op fills ghosts); other configurations were + // rejected by the fail-fast above. std::vector mapping(nall_real); for (int ii = 0; ii < nall_real; ii++) { mapping[ii] = ii; @@ -452,8 +501,10 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, // _with_comm), so C++ supplies the same 8 comm tensors as the // non-spin path. ``nlocal``/``nghost`` carry the real-atom counts // (pre atom-doubling); the spin override halves them internally. + // + // ``use_with_comm`` was computed earlier alongside the fail-fast + // dispatch check. std::vector flat_outputs; - bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; if (use_with_comm && !with_comm_loader) { throw deepmd::deepmd_exception( "Multi-rank LAMMPS requires the with-comm artifact, but it failed " diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index 5b39ea7454..39682bcd9a 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -46,6 +46,15 @@ struct InputNlist { int mask = 0xFFFFFFFF; /// mapping from all atoms to real atoms, in the size of nall int* mapping = nullptr; + /// number of MPI ranks (1 = single-rank). Settable only via the + /// trailing ``nprocs_`` argument of the comm-aware constructor (LAMMPS + /// pair styles pass ``comm->nprocs``). 
The lightweight constructors + /// leave it at 1 by construction — they carry no comm metadata + /// (``world``, ``sendlist``, ...), so they cannot drive the with-comm + /// dispatch path even if a non-1 value were forced here. Use this — + /// NOT ``nswap > 0`` — as the "is multi-rank?" predicate: ``atom_style + /// spin`` populates ``nswap`` even in single-rank. + int nprocs = 1; InputNlist() : inum(0), ilist(NULL), @@ -83,7 +92,8 @@ struct InputNlist { int** sendlist, int* sendproc, int* recvproc, - void* world) + void* world, + int nprocs_ = 1) : inum(inum_), ilist(ilist_), numneigh(numneigh_), @@ -95,7 +105,8 @@ struct InputNlist { sendlist(sendlist), sendproc(sendproc), recvproc(recvproc), - world(world) {}; + world(world), + nprocs(nprocs_) {}; ~InputNlist() {}; /** * @brief Set mask for this neighbor list. diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp index 9ea2021570..12b2b5a538 100644 --- a/source/lmp/pair_deepmd.cpp +++ b/source/lmp/pair_deepmd.cpp @@ -237,7 +237,7 @@ void PairDeepMD::compute(int eflag, int vflag) { list->inum, list->ilist, list->numneigh, list->firstneigh, commdata_->nswap, commdata_->sendnum, commdata_->recvnum, commdata_->firstrecv, commdata_->sendlist, commdata_->sendproc, - commdata_->recvproc, &world); + commdata_->recvproc, &world, comm->nprocs); lmp_list.set_mask(NEIGHMASK); if (comm->nprocs == 1 && atom->map_style != Atom::MAP_NONE) { lmp_list.set_mapping(mapping_vec.data()); diff --git a/source/lmp/pair_deepspin.cpp b/source/lmp/pair_deepspin.cpp index eddcb2eef4..30ca48576f 100644 --- a/source/lmp/pair_deepspin.cpp +++ b/source/lmp/pair_deepspin.cpp @@ -201,6 +201,15 @@ void PairDeepSpin::compute(int eflag, int vflag) { } } + // mapping (for DPA-2/3 .pt2 GNN models that gather ghost features via + // the LAMMPS atom-map; harmless for other models). 
+ std::vector mapping_vec(nall, -1); + if (comm->nprocs == 1 && atom->map_style != Atom::MAP_NONE) { + for (size_t ii = 0; ii < nall; ++ii) { + mapping_vec[ii] = atom->map(atom->tag[ii]); + } + } + if (do_compute_aparam) { make_aparam_from_compute(daparam); } else if (aparam.size() > 0) { @@ -242,8 +251,11 @@ void PairDeepSpin::compute(int eflag, int vflag) { list->inum, list->ilist, list->numneigh, list->firstneigh, commdata_->nswap, commdata_->sendnum, commdata_->recvnum, commdata_->firstrecv, commdata_->sendlist, commdata_->sendproc, - commdata_->recvproc, &world); + commdata_->recvproc, &world, comm->nprocs); lmp_list.set_mask(NEIGHMASK); + if (comm->nprocs == 1 && atom->map_style != Atom::MAP_NONE) { + lmp_list.set_mapping(mapping_vec.data()); + } if (single_model || multi_models_no_mod_devi) { // cvflag_atom is the right flag for the cvatom matrix if (!(eflag_atom || cvflag_atom)) { diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index 042f47c56c..f5ccb4a377 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -77,6 +77,15 @@ "trigger nlist rebuilds on every step (and run a small ``--nsteps`` " "to keep wall time low while still exercising the rebuild path).", ) +parser.add_argument( + "--no-atom-map", + action="store_true", + help="When set, omit ``atom_modify map yes`` from the LAMMPS input. 
" + "Used by the no-atom-map fail-fast / with-comm fallback tests; " + "with this flag the C++ DeepPotPTExpt sees inlist.mapping == " + "nullptr and either fails fast (no with-comm artifact) or routes " + "to with-comm (multi-rank, with-comm artifact present).", +) parser.add_argument( "--null-vx", type=float, @@ -124,8 +133,13 @@ # ``atom_modify map yes`` is required when single-rank dispatch goes # through the regular artifact of a use_loc_mapping=False .pt2: the # C++ side needs the LAMMPS global-id->local-index map to build the -# ``mapping`` tensor. It is harmless under multi-rank. -lammps.atom_modify("map yes") +# ``mapping`` tensor. It is harmless under multi-rank. The +# ``--no-atom-map`` flag omits this line so the no-atom-map fallback +# (multi-rank with-comm path) and fail-fast (no with-comm artifact) +# branches can be exercised — LAMMPS rejects ``atom_modify map no``, +# so omitting the command is the only way to leave the map disabled. +if not args.no_atom_map: + lammps.atom_modify("map yes") lammps.neighbor("2.0 bin") lammps.neigh_modify(f"every {args.neigh_every} delay 0 check no") lammps.read_data(args.DATAFILE) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py index 3637238968..5befcf0f79 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py @@ -76,6 +76,14 @@ help="Optional mass for LAMMPS atom type 3 (and any higher types). " "Used by the NULL-type fixture; ignored when only 2 types exist.", ) +parser.add_argument( + "--no-atom-map", + action="store_true", + help="Omit ``atom_modify map yes`` from the LAMMPS input. 
Used by " + "the spin no-atom-map fail-fast / with-comm-fallback tests; LAMMPS " + "rejects ``atom_modify map no`` so omitting the command is the only " + "way to leave the map disabled.", +) args = parser.parse_args() lammps = PyLammps() @@ -83,7 +91,8 @@ lammps.units("metal") lammps.boundary("p p p") lammps.atom_style("spin") -lammps.atom_modify("map yes") +if not args.no_atom_map: + lammps.atom_modify("map yes") lammps.neighbor("2.0 bin") lammps.neigh_modify("every 10 delay 0 check no") lammps.read_data(args.DATAFILE) diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index ecabe25a28..a55d80561c 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -180,12 +180,16 @@ def teardown_module() -> None: os.remove(f) -def _lammps(data_file, units="metal") -> PyLammps: +def _lammps(data_file, units="metal", atom_map: str = "yes") -> PyLammps: lammps = PyLammps() lammps.units(units) lammps.boundary("p p p") lammps.atom_style("atomic") - lammps.atom_modify("map yes") + # LAMMPS rejects ``atom_modify map no``; the supported way to leave + # the atom-map disabled is to simply omit the command (default for + # ``atom_style atomic``). + if atom_map != "no": + lammps.atom_modify(f"map {atom_map}") if units == "metal" or units == "real": lammps.neighbor("2.0 bin") elif units == "si": @@ -242,7 +246,22 @@ def lammps_si(): lmp.close() +@pytest.fixture +def lammps_no_atom_map(): + # Same as the default ``lammps`` fixture but with the LAMMPS atom-map + # left disabled (``atom_map="no"`` makes ``_lammps`` skip the + # ``atom_modify`` command entirely). Exercises the C++ fail-fast + # branch in DeepPotPTExpt::compute_inner — single-rank .pt2 GNN + # inference without atom-map cannot resolve ghost-to-local mapping, + # so the regular path throws with an actionable error message. 
+ lmp = _lammps(data_file=data_file, atom_map="no") + yield lmp + lmp.close() + + def test_pair_deepmd(lammps) -> None: + # Cell A: use_loc_mapping=True (deeppot_dpa3.pt2, no with-comm artifact), + # atom_modify map yes, single-rank. Regular path uses the correct + # mapping built from LAMMPS atom-map; ghost-feature gather works. lammps.pair_style(f"deepmd {pb_file.resolve()}") lammps.pair_coeff("* *") lammps.run(0) @@ -254,6 +273,46 @@ def test_pair_deepmd(lammps) -> None: lammps.run(1) +def test_pair_deepmd_no_atom_map_fails_fast(lammps_no_atom_map) -> None: + # Cell B: use_loc_mapping=True (no with-comm artifact), atom-map + # disabled (``atom_modify`` omitted), single-rank. Regular path needs a correct mapping for + # ghost-feature gather but atom-map is absent and we deliberately do + # not build one ourselves. Must fail fast with an actionable message. + lammps_no_atom_map.pair_style(f"deepmd {pb_file.resolve()}") + lammps_no_atom_map.pair_coeff("* *") + with pytest.raises(Exception, match=r"atom_modify map yes"): + lammps_no_atom_map.run(0) + + +def test_pair_deepmd_with_comm(lammps) -> None: + # Cell C single-rank: use_loc_mapping=False (deeppot_dpa3_mpi.pt2, + # has with-comm artifact), atom_modify map yes, single-rank. + # Dispatch picks the regular path because nprocs==1; the regular + # artifact uses the correct mapping built from LAMMPS atom-map. + # Forces must match the same baseline as the use_loc_mapping=True + # variant (gen_dpa3.py exports both .pt2s from identical weights). + lammps.pair_style(f"deepmd {pb_file_mpi.resolve()}") + lammps.pair_coeff("* *") + lammps.run(0) + assert lammps.eval("pe") == pytest.approx(expected_e) + for ii in range(6): + assert lammps.atoms[ii].force == pytest.approx( + expected_f[lammps.atoms[ii].id - 1] + ) + + +def test_pair_deepmd_with_comm_no_atom_map_fails_fast(lammps_no_atom_map) -> None: + # Cell D: use_loc_mapping=False (with-comm artifact available), + # atom-map disabled (``atom_modify`` omitted), single-rank. 
Despite the with-comm artifact + # being available, single-rank PBC has empty CommBrick sendlist + # (nswap==0), so border_op cannot fill ghost features. Must fail + # fast with the same single-rank message as cell B. + lammps_no_atom_map.pair_style(f"deepmd {pb_file_mpi.resolve()}") + lammps_no_atom_map.pair_coeff("* *") + with pytest.raises(Exception, match=r"atom_modify map yes"): + lammps_no_atom_map.run(0) + + def test_pair_deepmd_virial(lammps) -> None: lammps.pair_style(f"deepmd {pb_file.resolve()}") lammps.pair_coeff("* *") @@ -361,6 +420,8 @@ def _run_mpi_subprocess( data_path: Path | None = None, processors: str | None = None, runner_args: list[str] | None = None, + pb_path: Path | None = None, + capture: bool = False, ) -> dict: """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under ``mpirun -n `` and return @@ -380,6 +441,8 @@ def _run_mpi_subprocess( """ if data_path is None: data_path = data_file + if pb_path is None: + pb_path = pb_file_mpi with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: out_path = f.name try: @@ -390,7 +453,7 @@ def _run_mpi_subprocess( sys.executable, str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), str(data_path.resolve()), - str(pb_file_mpi.resolve()), + str(pb_path.resolve()), out_path, ] if processors is not None: @@ -401,6 +464,15 @@ def _run_mpi_subprocess( argv.extend(extra_args) if runner_args: argv.extend(runner_args) + if capture: + # Return raw process info instead of parsing output — used by + # tests that expect the subprocess to fail (the fail-fast cases). 
+ proc = sp.run(argv, capture_output=True, text=True) + return { + "returncode": proc.returncode, + "stdout": proc.stdout, + "stderr": proc.stderr, + } sp.check_call(argv) with open(out_path) as fh: lines = fh.read().strip().splitlines() @@ -468,6 +540,56 @@ def test_pair_deepmd_mpi_dpa3() -> None: ) +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_no_with_comm_fails_fast() -> None: + """Cell B-mr: use_loc_mapping=True (no with-comm artifact) under + ``mpirun -n 2``. Multi-rank dispatch cannot use the regular path + (mapping is unreliable across ranks) and there is no with-comm + artifact to fall back to. Must fail-fast with a message naming + ``use_loc_mapping=False`` as the user-facing fix. + """ + out = _run_mpi_subprocess(pb_path=pb_file, capture=True) + assert out["returncode"] != 0, ( + "Expected subprocess to fail-fast for " + "use_loc_mapping=True .pt2 + multi-rank, but it exited 0." + ) + combined = out["stdout"] + out["stderr"] + assert "use_loc_mapping=False" in combined, ( + "Expected error message mentioning use_loc_mapping=False, got:\n" + + combined[-500:] + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_no_atom_map() -> None: + """Cell D-mr: use_loc_mapping=False (with-comm artifact present) + under ``mpirun -n 2`` WITHOUT ``atom_modify map yes``. Multi-rank + dispatch routes to the with-comm artifact whose graph fills ghost + features via ``border_op`` and does not consume the mapping tensor + — so atom-map is not required. Forces must match the same baseline + as the with-atom-map multi-rank run (cell C-mr). 
+ """ + out = _run_mpi_subprocess(runner_args=["--no-atom-map"]) + assert out["pe"] == pytest.approx(expected_e) + for ii in range(6): + np.testing.assert_allclose( + out["forces"][ii], + expected_f[ii], + atol=1e-8, + rtol=0, + ) + + @pytest.mark.skipif( shutil.which("mpirun") is None, reason="MPI is not installed on this system" ) diff --git a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py index 7c7c5787a7..5429fbb516 100644 --- a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py @@ -56,6 +56,12 @@ / "infer" / "deeppot_dpa3_spin_mpi.pt2" ) +# Single-artifact DPA3 spin fixture (use_loc_mapping=True; no with-comm +# artifact). Counterpart to ``pb_file_mpi`` for the spin fail-fast +# cells where the C++ side has no fallback to border_op. +pb_file_single = ( + Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa3_spin.pt2" +) data_file = Path(__file__).parent / "data_dpa3_spin_pt2.lmp" # Elongated-box fixture for the spin empty-subdomain MPI test: x is # extended to 30 A while atoms remain in x in [3, 13]. 
Combined with @@ -140,6 +146,8 @@ def _run_mpi_subprocess( processors: str | None = None, data_path: Path | None = None, runner_args: list[str] | None = None, + pb_path: Path | None = None, + capture: bool = False, ) -> dict: """Run ``run_mpi_pair_deepmd_spin_dpa3_pt2.py`` under ``mpirun -n `` and return @@ -152,6 +160,8 @@ def _run_mpi_subprocess( """ if data_path is None: data_path = data_file + if pb_path is None: + pb_path = pb_file_mpi with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: out_path = f.name try: @@ -162,7 +172,7 @@ def _run_mpi_subprocess( sys.executable, str(Path(__file__).parent / "run_mpi_pair_deepmd_spin_dpa3_pt2.py"), str(data_path.resolve()), - str(pb_file_mpi.resolve()), + str(pb_path.resolve()), out_path, ] if processors is not None: @@ -173,6 +183,15 @@ def _run_mpi_subprocess( argv.extend(extra_args) if runner_args: argv.extend(runner_args) + if capture: + # Used by fail-fast tests: return raw subprocess info instead of + # parsing output (the subprocess is expected to exit non-zero). + proc = sp.run(argv, capture_output=True, text=True) + return { + "returncode": proc.returncode, + "stdout": proc.stdout, + "stderr": proc.stderr, + } sp.check_call(argv) with open(out_path) as fh: lines = fh.read().strip().splitlines() @@ -302,3 +321,159 @@ def test_pair_deepmd_mpi_dpa3_spin_null_type() -> None: # real Ni-O atoms). np.testing.assert_array_equal(out_mpi["forces"][4:], np.zeros((2, 3))) np.testing.assert_array_equal(out_mpi["force_mag"][4:], np.zeros((2, 3))) + + +# --------------------------------------------------------------------------- +# Four-cell coverage matrix for the spin path — mirrors the non-spin matrix +# in ``test_lammps_dpa3_pt2.py``. Verifies the fail-fast in +# ``DeepSpinPTExpt::compute_inner`` against the silent-corruption bug. 
+# +# Cell use_loc_mapping atom-map nprocs Outcome +# ---- --------------- -------- ------ ------------------------------- +# A True yes 1 succeeds (regular w/ map) +# B True no 1 fail-fast (single-rank msg) +# B-mr True any >1 fail-fast (multi-rank msg) +# C False yes 1 succeeds (regular w/ map; nprocs=1) +# D False no 1 fail-fast (single-rank msg) +# D-mr False no >1 succeeds (with-comm; border_op) +# +# Cell C-mr is already covered by ``test_pair_deepmd_mpi_dpa3_spin`` above. +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepspin_single_artifact_with_atom_map() -> None: + """Cell A (spin): use_loc_mapping=True, single-rank, atom-map yes + -> regular path with correct mapping, runs cleanly. + """ + out = _run_mpi_subprocess(nprocs=1, pb_path=pb_file_single) + # No hardcoded reference here — we only assert the run completes + # and produces finite numbers. Numerical correctness of the + # single-artifact spin GNN is validated by the eager-parity test + # in source/tests/pt_expt/model/. + assert np.isfinite(out["pe"]) + assert np.all(np.isfinite(out["forces"])) + assert np.all(np.isfinite(out["force_mag"])) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepspin_no_atom_map_fails_fast() -> None: + """Cell B (spin): use_loc_mapping=True, single-rank, no atom-map + -> fail-fast with the single-rank message (no with-comm fallback). 
+ """ + out = _run_mpi_subprocess( + nprocs=1, pb_path=pb_file_single, runner_args=["--no-atom-map"], capture=True + ) + assert out["returncode"] != 0, ( + "Expected subprocess to fail-fast for single-rank spin GNN " + "without atom-map, but it exited 0." + ) + combined = out["stdout"] + out["stderr"] + assert "atom_modify map yes" in combined, ( + "Expected single-rank fail-fast message, got:\n" + combined[-500:] + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepspin_mpi_no_with_comm_fails_fast() -> None: + """Cell B-mr (spin): use_loc_mapping=True (no with-comm artifact), + multi-rank -> fail-fast with the multi-rank message. The atom-map + setting is irrelevant: ``pair_deepspin`` never propagates atom-map + for multi-rank, and the new predicate fails fast regardless of + atom_map_present when multi_rank && !has_comm_artifact. + """ + out = _run_mpi_subprocess(nprocs=2, pb_path=pb_file_single, capture=True) + assert out["returncode"] != 0, ( + "Expected subprocess to fail-fast for multi-rank spin GNN " + "without with-comm artifact, but it exited 0." + ) + combined = out["stdout"] + out["stderr"] + assert "use_loc_mapping=False" in combined, ( + "Expected multi-rank fail-fast message, got:\n" + combined[-500:] + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepspin_with_comm_single_atom_map() -> None: + """Cell C (spin): use_loc_mapping=False (has with-comm artifact), + single-rank, atom-map yes -> regular path takes the artifact's + non-comm trace (nswap=0) and uses the LAMMPS atom-map.
+ """ + out = _run_mpi_subprocess(nprocs=1, pb_path=pb_file_mpi) + assert np.isfinite(out["pe"]) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepspin_with_comm_no_atom_map_fails_fast() -> None: + """Cell D (spin): use_loc_mapping=False (with-comm artifact), + single-rank, no atom-map -> fail-fast. Even though with-comm is + available, single-rank LAMMPS has nswap=0, so border_op cannot + drive the per-layer ghost exchange; the regular path needs the + mapping but atom-map is absent. + """ + out = _run_mpi_subprocess( + nprocs=1, pb_path=pb_file_mpi, runner_args=["--no-atom-map"], capture=True + ) + assert out["returncode"] != 0, ( + "Expected subprocess to fail-fast for single-rank spin GNN + " + "with-comm artifact + no atom-map, but it exited 0." + ) + combined = out["stdout"] + out["stderr"] + assert "atom_modify map yes" in combined, ( + "Expected single-rank fail-fast message, got:\n" + combined[-500:] + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepspin_mpi_no_atom_map() -> None: + """Cell D-mr (spin): use_loc_mapping=False (with-comm artifact), + multi-rank, no atom-map -> succeeds. The with-comm path drives + ghost-feature exchange via ``deepmd_export::border_op`` and does + NOT consume the mapping tensor for ghost gather, so atom-map is + unnecessary. Energy, forces, force_mag and virials must match the + same-archive atom-map-yes multi-rank baseline.
+ """ + out_no_map = _run_mpi_subprocess( + nprocs=2, pb_path=pb_file_mpi, runner_args=["--no-atom-map"] + ) + out_baseline = _run_mpi_subprocess(nprocs=2, pb_path=pb_file_mpi) + assert out_no_map["pe"] == pytest.approx(out_baseline["pe"], rel=1e-10, abs=1e-12) + np.testing.assert_allclose( + out_no_map["forces"], out_baseline["forces"], atol=1e-8, rtol=0 + ) + np.testing.assert_allclose( + out_no_map["force_mag"], out_baseline["force_mag"], atol=1e-8, rtol=0 + ) + np.testing.assert_allclose( + out_no_map["virials"], out_baseline["virials"], atol=1e-8, rtol=0 + ) diff --git a/source/tests/infer/gen_spin.py b/source/tests/infer/gen_spin.py index c17546504b..b08d45060d 100644 --- a/source/tests/infer/gen_spin.py +++ b/source/tests/infer/gen_spin.py @@ -144,6 +144,68 @@ def _build_dpa3_mpi_yaml(yaml_path: str) -> None: save_dp_model(yaml_path, data) +def _build_dpa3_single_yaml(yaml_path: str) -> None: + """Build a DPA3 spin model with ``use_loc_mapping=True`` — the + single-artifact GNN counterpart to ``_build_dpa3_mpi_yaml``. + + ``use_loc_mapping=True`` keeps per-layer messaging local to each + rank, so no with-comm AOTI artifact is needed for single-rank + inference. But the regular path still consumes ``mapping`` to + gather ghost features, so this fixture is the canonical test + case for the cells where the C++ ``DeepSpinPTExpt`` fail-fast + must fire (single-rank without atom-map, or multi-rank without + a with-comm artifact). 
+ """ + from deepmd.dpmodel.model.model import ( + get_model, + ) + from deepmd.dpmodel.utils.serialization import ( + save_dp_model, + ) + + config = { + "type_map": ["Ni", "O"], + "descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": {"neuron": [5, 5, 5], "resnet_dt": True, "seed": 1}, + "spin": {"use_spin": [True, False], "virtual_scale": [0.3140, 0.0]}, + } + + model = get_model(copy.deepcopy(config)) + model_dict = model.serialize() + + data = { + "model": model_dict, + "model_def_script": config, + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + + print( # noqa: T201 + f"Building single-artifact DPA3 spin dpmodel and saving to {yaml_path} ..." + ) + save_dp_model(yaml_path, data) + + def main(): from deepmd.entrypoints.convert_backend import ( convert_backend, @@ -162,6 +224,12 @@ def main(): yaml_dpa3_path = os.path.join(base_dir, "deeppot_dpa3_spin_mpi.yaml") pt2_dpa3_path = os.path.join(base_dir, "deeppot_dpa3_spin_mpi.pt2") + # Single-artifact GNN spin variant (DPA3 + use_loc_mapping=True). + # No with-comm artifact; needed by tests covering the spin fail-fast + # cells in test_lammps_spin_dpa3_pt2.py. + yaml_dpa3_single_path = os.path.join(base_dir, "deeppot_dpa3_spin.yaml") + pt2_dpa3_single_path = os.path.join(base_dir, "deeppot_dpa3_spin.pt2") + # ---- 1. Build .yamls if they don't exist ---- if not os.path.exists(yaml_path): _build_yaml(yaml_path) @@ -173,6 +241,11 @@ def main(): else: print(f"Using existing {yaml_dpa3_path}") # noqa: T201 + if not os.path.exists(yaml_dpa3_single_path): + _build_dpa3_single_yaml(yaml_dpa3_single_path) + else: + print(f"Using existing {yaml_dpa3_single_path}") # noqa: T201 + # ---- 2. 
Convert .yaml -> .pth and .yaml -> .pt2 ---- # Import deepmd.pt to register the backend (needed for convert_backend) import deepmd.pt # noqa: F401 @@ -185,6 +258,13 @@ def main(): print(f"Converting to {pt2_path} ...") # noqa: T201 convert_backend(INPUT=yaml_path, OUTPUT=pt2_path, atomic_virial=True) + print(f"Converting to {pt2_dpa3_single_path} ...") # noqa: T201 + convert_backend( + INPUT=yaml_dpa3_single_path, + OUTPUT=pt2_dpa3_single_path, + atomic_virial=True, + ) + print(f"Converting to {pt2_dpa3_path} ...") # noqa: T201 convert_backend(INPUT=yaml_dpa3_path, OUTPUT=pt2_dpa3_path, atomic_virial=True)