From 2190c7015607eceddf7cb289ba45f543fbf2f66d Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 23 Apr 2026 11:11:50 +0200 Subject: [PATCH 01/23] Removed unuseful checks --- .../join/DiversifyingNearestChildrenKnnCollector.java | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index 59cb58d22888..034a77dd31de 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -93,9 +93,6 @@ public String toString() { @Override public TopDocs topDocs() { assert heap.size() <= k() : "Tried to collect more results than the maximum number allowed"; - while (heap.size() > k()) { - heap.popToDrain(); - } ScoreDoc[] scoreDocs = new ScoreDoc[heap.size()]; for (int i = 1; i <= scoreDocs.length; i++) { scoreDocs[scoreDocs.length - i] = new ScoreDoc(heap.topNode(), heap.topScore()); @@ -184,12 +181,7 @@ private void updateElement(int heapIndex, int nodeId, int parentId, float score) float oldScore = scores[heapIndex]; childNodes[heapIndex] = nodeId; scores[heapIndex] = score; - // Since we are a min heap, if the new value is less, we need to make sure to bubble it up - if (score < oldScore) { - upHeap(heapIndex); - } else { - downHeap(heapIndex); - } + downHeap(heapIndex); } /** From 66390c8a8dd7e25529c1b04e4ec5e12a49b39fd6 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 23 Apr 2026 11:12:48 +0200 Subject: [PATCH 02/23] Removed unused variable --- .../search/join/DiversifyingNearestChildrenKnnCollector.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index 034a77dd31de..de5b225e109d 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -178,7 +178,6 @@ private void pushIn(int nodeId, int parentId, float score) { private void updateElement(int heapIndex, int nodeId, int parentId, float score) { assert parentNodes[heapIndex] == parentId : "attempted to update heap element value but with a different parent id"; - float oldScore = scores[heapIndex]; childNodes[heapIndex] = nodeId; scores[heapIndex] = score; downHeap(heapIndex); From da4ec3d9a60675d46c187523f62b2aa763bd7869 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 23 Apr 2026 11:14:01 +0200 Subject: [PATCH 03/23] Moved downHeap from int to void since the returned value is never used --- .../search/join/DiversifyingNearestChildrenKnnCollector.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index de5b225e109d..41b12308043c 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -275,7 +275,7 @@ private void upHeap(int origPos) { scores[i] = savedScore; } - private int downHeap(int i) { + private void downHeap(int i) { int savedChild = childNodes[i]; int savedParent = parentNodes[i]; float savedScore = scores[i]; @@ -300,7 +300,6 @@ private int downHeap(int i) { childNodes[i] = savedChild; parentNodes[i] = savedParent; scores[i] = savedScore; - return i; } // Used only during popToDrain: the index map is never read again after closed=true, From 2f33476373e3163f2ffeba2d6732ec885b1c6dd0 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Mon, 4 May 2026 12:14:03 +0200 Subject: [PATCH 04/23] Implemented siblings exploration --- .../util/hnsw/AbstractHnswGraphSearcher.java | 58 +++ .../util/hnsw/ChildrenSiblingExpansion.java | 51 ++ .../lucene/util/hnsw/DocSiblingExpansion.java | 50 ++ .../lucene/util/hnsw/HnswGraphSearcher.java | 26 + .../hnsw/OrdinalTranslatedKnnCollector.java | 35 +- ...iversifyingChildrenByteKnnVectorQuery.java | 2 +- ...versifyingChildrenFloatKnnVectorQuery.java | 2 +- ...versifyingNearestChildrenKnnCollector.java | 76 ++- ...ingNearestChildrenKnnCollectorManager.java | 46 +- ...ersifyingChildrenKnnCollectorTestCase.java | 71 +++ ...versifyingChildrenKnnSiblingExpansion.java | 468 ++++++++++++++++++ ...earestChildrenKnnCollectorPerformance.java | 21 +- .../join/TestToParentJoinKnnResults.java | 8 +- 13 files changed, 871 insertions(+), 43 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java create mode 100644 lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java create mode 100644 lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index 2f668d966d41..03836fa8d006 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -96,14 +96,72 @@ protected static void scoreEntryPoints( assert scores != null && scores.length >= eps.length; scorer.bulkScore(eps, scores, eps.length); results.incVisitedCount(eps.length); + float[] siblingScores = null; for (int i = 0; i < eps.length; i++) { float score = scores[i]; int ep = eps[i]; visited.set(ep); candidates.add(ep, score); if (acceptOrds == null || acceptOrds.get(ep)) { + // Fetch siblings BEFORE collect() so the parent is not yet in the heap + int[] siblings = null; + int numSiblingsToVisit = 0; + if (results instanceof ChildrenSiblingExpansion expander) { + siblings = expander.pendingSiblingOrdinals(ep, visited); + if (siblings != null) { + // how many siblings are actually scored to avoid exceeding the visit budget. + numSiblingsToVisit = + (int) Math.min(siblings.length, results.visitLimit() - results.visitedCount()); + // Only mark as visited the siblings we will actually score; the rest remain + // reachable via normal graph traversal so a better child can still be found + for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + } + } + // Collect the ep node here so after we have a correctly updated minCompetitiveSimilarity results.collect(ep, score); + if (numSiblingsToVisit > 0) { + siblingScores = + scoreSiblings( + results, scorer, candidates, acceptOrds, siblings, numSiblingsToVisit, siblingScores); + } } } } + + /** + * Scores and collects siblings, adding competitive ones to the candidate queue. Reuses and + * returns the siblingScores buffer, reallocating only if too small. + */ + protected static float[] scoreSiblings( + KnnCollector results, + RandomVectorScorer scorer, + NeighborQueue candidates, + Bits acceptOrds, + int[] siblings, + int numSiblings, + float[] siblingScores) + throws IOException { + // If siblingScores not defined yet or too small to collect scores a new one is created + // Otherwise we reuse the old one that will be overridden in bulkScore with new scores + if (siblingScores == null || siblingScores.length < numSiblings) { + siblingScores = new float[numSiblings]; + } + float maxScore = scorer.bulkScore(siblings, siblingScores, numSiblings); + results.incVisitedCount(numSiblings); + if (maxScore > results.minCompetitiveSimilarity()) { + float minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); + for (int j = 0; j < numSiblings; j++) { + float sibScore = siblingScores[j]; + // We avoid adding to candidates a sibling with a bad score + if (sibScore >= minSimilarity) { + candidates.add(siblings[j], sibScore); + if (acceptOrds == null || acceptOrds.get(siblings[j])) { + results.collect(siblings[j], sibScore); + minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); + } + } + } + } + return siblingScores; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java new file mode 100644 index 000000000000..c176c166ceab --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +import org.apache.lucene.util.BitSet; + +/** + * Implemented by {@link org.apache.lucene.search.KnnCollector} instances that support dynamic + * sibling expansion during HNSW graph search. When the graph searcher collects a child node + * belonging to a newly-discovered parent, all siblings of that parent are immediately scored and + * collected without requiring full graph traversal. + * + *

The graph searcher calls {@link #pendingSiblingOrdinals} before invoking {@link + * org.apache.lucene.search.KnnCollector#collect} for the triggering node, so that the collector + * can safely inspect its internal state before the parent is registered. + * + * @lucene.experimental + */ +public interface ChildrenSiblingExpansion { + + /** + * Called by the HNSW graph searcher before {@link + * org.apache.lucene.search.KnnCollector#collect} is invoked for {@code collectedOrdinal}. The + * implementation inspects the node's parent and returns the ordinals of unvisited siblings that + * should be scored immediately. + * + *

The caller is responsible for marking the returned ordinals as visited in {@code + * visitedOrds} and for scoring and collecting them. + * + * @param collectedOrdinal the vector ordinal about to be collected + * @param visitedOrds the graph searcher's current visited bitset; returned siblings are not yet + * marked here — the caller does that + * @return ordinals of sibling nodes to score next, or {@code null} if none + */ + int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds); +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java new file mode 100644 index 000000000000..4882ade8c2ee --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +/** + * Doc-ID-level companion to {@link ChildrenSiblingExpansion}. Implemented by collectors that + * understand parent-child document relationships and can enumerate sibling document ids for a given + * child document id, as well as translate document ids back to vector ordinals. + * + *

This interface is used internally by {@link OrdinalTranslatedKnnCollector} to bridge between + * the ordinal space of the HNSW graph and the document-id space of the collector. + * + * @lucene.experimental + */ +public interface DocSiblingExpansion { + + /** + * Returns the doc ids of all siblings of {@code childDocId} whose parent has not yet been added + * to the result heap, or {@code null} if the parent was already processed or there are no other + * siblings. + * + * @param childDocId the document id of the child that is about to be collected + * @return sibling doc ids, or {@code null} + */ + int[] findSiblingDocIds(int childDocId); + + /** + * Translates a document id to its vector ordinal, or returns {@code -1} if the document has no + * vector in this field. + * + * @param docId the document id + * @return the vector ordinal, or {@code -1} + */ + int docIdToOrdinal(int docId); +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index d739915ca078..2bac2e046788 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -44,6 +44,7 @@ public class HnswGraphSearcher extends AbstractHnswGraphSearcher { protected int[] bulkNodes = null; protected float[] bulkScores = null; + protected float[] siblingScores = null; /** * HNSW search is roughly logarithmic. This doesn't take maxConn into account, but it is a pretty @@ -347,6 +348,20 @@ void searchLevel( if (score >= minAcceptedSimilarity) { candidates.add(node, score); if (acceptOrds == null || acceptOrds.get(node)) { + // Fetch siblings BEFORE collect() so the parent is not yet in the heap + int[] siblings = null; + int numSiblingsToVisit = 0; + if (results instanceof ChildrenSiblingExpansion expander) { + siblings = expander.pendingSiblingOrdinals(node, visited); + if (siblings != null) { + numSiblingsToVisit = + (int) + Math.min(siblings.length, results.visitLimit() - results.visitedCount()); + // Only mark as visited the siblings we will actually score; the rest remain + // reachable via normal graph traversal so a better child can still be found + for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + } + } if (results.collect(node, score)) { float oldMinAcceptedSimilarity = minAcceptedSimilarity; minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); @@ -356,6 +371,17 @@ void searchLevel( shouldExploreMinSim = true; } } + // Score and collect all siblings of the newly-discovered parent + if (numSiblingsToVisit > 0) { + float prevMinSim = results.minCompetitiveSimilarity(); + siblingScores = + scoreSiblings( + results, scorer, candidates, acceptOrds, siblings, numSiblingsToVisit, siblingScores); + if (results.minCompetitiveSimilarity() > prevMinSim) { + minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); + shouldExploreMinSim = true; + } + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 5225fe700ab9..35f15e81193c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -17,14 +17,19 @@ package org.apache.lucene.util.hnsw; +import java.util.Arrays; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.util.BitSet; /** - * Wraps a provided KnnCollector object, translating the provided vectorId ordinal to a documentId + * Wraps a provided KnnCollector object, translating the provided vectorId ordinal to a documentId. + * This wrapper implements {@link ChildrenSiblingExpansion}; sibling expansion is active only + * when the wrapped collector also implements {@link DocSiblingExpansion}. */ -public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator { +public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator + implements ChildrenSiblingExpansion { private final IntToIntFunction vectorOrdinalToDocId; @@ -50,4 +55,30 @@ public TopDocs topDocs() { : TotalHits.Relation.EQUAL_TO), td.scoreDocs); } + + @Override + public int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds) { + if (!(collector instanceof DocSiblingExpansion docExpander)) { + return null; + } + int docId = vectorOrdinalToDocId.apply(collectedOrdinal); + int[] siblingDocIds = docExpander.findSiblingDocIds(docId); + if (siblingDocIds == null) { + return null; + } + int[] siblingOrdinals = new int[siblingDocIds.length]; + // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 so this variable is necessary. + int count = 0; + for (int sibDocId : siblingDocIds) { + int sibOrd = docExpander.docIdToOrdinal(sibDocId); + // sibOrd = -1 when a document has no vector for this field. + // Such a doc has no node in the HNSW graph and can't be scored, so it must be skipped. + // If a sibling was reached via normal graph traversal before sibling expansion triggered, re-adding it would + // cause it to be scored twice. !visitedOrds.get(sibOrd) filters those out. + if (sibOrd >= 0 && !visitedOrds.get(sibOrd)) { + siblingOrdinals[count++] = sibOrd; + } + } + return count > 0 ? Arrays.copyOf(siblingOrdinals, count) : null; + } } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java index 877d004d5c78..77dde715426e 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java @@ -156,7 +156,7 @@ protected TopDocs exactSearch( @Override protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { - return new DiversifyingNearestChildrenKnnCollectorManager(k, parentsFilter, searcher); + return new DiversifyingNearestChildrenKnnCollectorManager(k, parentsFilter, searcher, field); } @Override diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java index e2d6179ec426..c5f2f95ddabf 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java @@ -155,7 +155,7 @@ protected TopDocs exactSearch( @Override protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { - return new DiversifyingNearestChildrenKnnCollectorManager(k, parentsFilter, searcher); + return new DiversifyingNearestChildrenKnnCollectorManager(k, parentsFilter, searcher, field); } @Override diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index 41b12308043c..e2ca8fbdb10e 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -25,40 +25,42 @@ import org.apache.lucene.search.knn.KnnSearchStrategy; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.hnsw.DocSiblingExpansion; /** * This collects the nearest children vectors. Diversifying the results over the provided parent * filter. This means the nearest children vectors are returned, but only one per parent */ -class DiversifyingNearestChildrenKnnCollector extends AbstractKnnCollector { +class DiversifyingNearestChildrenKnnCollector extends AbstractKnnCollector + implements DocSiblingExpansion { private final BitSet parentBitSet; private final NodeIdCachingHeap heap; + // docId → vector ordinal mapping; -1 means the doc has no vector. Null when not needed. + private final int[] docToOrd; /** - * Create a new object for joining nearest child kNN documents with a parent bitset - * - * @param k The number of joined parent documents to collect - * @param visitLimit how many child vectors can be visited - * @param parentBitSet The leaf parent bitset - */ - public DiversifyingNearestChildrenKnnCollector(int k, int visitLimit, BitSet parentBitSet) { - this(k, visitLimit, null, parentBitSet); - } - - /** - * Create a new object for joining nearest child kNN documents with a parent bitset + * Create a new object for joining nearest child kNN documents with a parent bitset and a + * precomputed docId-to-ordinal mapping that enables dynamic sibling expansion during HNSW graph + * search. * * @param k The number of joined parent documents to collect * @param visitLimit how many child vectors can be visited * @param searchStrategy The search strategy to use * @param parentBitSet The leaf parent bitset + * @param docToOrd precomputed array mapping docId to vector ordinal; {@code -1} entries mean the + * document has no vector. May be {@code null} to disable sibling expansion. */ public DiversifyingNearestChildrenKnnCollector( - int k, int visitLimit, KnnSearchStrategy searchStrategy, BitSet parentBitSet) { + int k, + int visitLimit, + KnnSearchStrategy searchStrategy, + BitSet parentBitSet, + int[] docToOrd) { super(k, visitLimit, searchStrategy); this.parentBitSet = parentBitSet; this.heap = new NodeIdCachingHeap(k); + this.docToOrd = docToOrd; } /** @@ -111,6 +113,47 @@ public int numCollected() { return heap.size(); } + @Override + public int[] findSiblingDocIds(int childDocId) { + int parent = parentBitSet.nextSetBit(childDocId); + // Skip if this parent's entry is already in the heap + if (heap.containsParent(parent)) { + return null; + } + // Find siblings range(prevParent, parent). If parent is 0 there are not prevParents so -1 + int prevParent = parent > 0 ? parentBitSet.prevSetBit(parent - 1) : -1; + int from = prevParent + 1; + // Children of parent are all docIds in [from, parent); exclude childDocId itself + int capacity = parent - from - 1; + // One child case + if (capacity == 0) { + return null; + } + int[] siblings = new int[capacity]; + int idx = 0; + for (int docId = from; docId < parent; docId++) { + if (docId != childDocId) { + siblings[idx++] = docId; + } + } + return idx > 0 ? siblings : null; + } + + @Override + public int docIdToOrdinal(int docId) { + // Conditions explanation: + // docToOrd == null — buildDocToOrd returns null when the segment has no vector values at all for the field (line 113). Without this guard you'd get a NullPointerException. + // docId >= docToOrd.length — In production, docToOrd is sized exactly maxDoc for that segment (line 116), so every valid docId in that segment fits. This check is therefore a pure defensive guard — it can't be triggered by + // correct production code, but it protects against: + // - bugs in how the array is built or passed + // - future refactoring that changes the array size + // - misuse when constructing the collector manually (as in tests, e.g. the docIdToOrdinal(9999) test case) + if (docToOrd == null || docId >= docToOrd.length) { + return -1; + } + return docToOrd[docId]; + } + /** * This is a minimum binary heap, inspired by {@link org.apache.lucene.util.LongHeap}. But instead * of encoding and using `long` values. Node ids and scores are kept separate. Additionally, this @@ -245,6 +288,11 @@ public final int size() { return size; } + /** Returns {@code true} if a child of the given parent is already in the heap. */ + public boolean containsParent(int parentDocId) { + return nodeIdHeapIndex.containsKey(parentDocId); + } + /** * Returns true if the element at heap position {@code k} should be evicted before the element * at position {@code j} (i.e. a is "less than" b in the min-heap ordering). Lower score loses diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java index 7ebacdd8194d..13071bc84fac 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java @@ -18,7 +18,10 @@ package org.apache.lucene.search.join; import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.knn.KnnCollectorManager; @@ -35,17 +38,21 @@ public class DiversifyingNearestChildrenKnnCollectorManager implements KnnCollec private final int k; // filter identifying the parent documents. private final BitSetProducer parentsFilter; + // vector field name; used to build the docId-to-ordinal mapping for sibling expansion + private final String field; /** * Constructor * * @param k - the number of top k vectors to collect * @param parentsFilter Filter identifying the parent documents. + * @param field the vector field name */ public DiversifyingNearestChildrenKnnCollectorManager( - int k, BitSetProducer parentsFilter, IndexSearcher indexSearcher) { + int k, BitSetProducer parentsFilter, IndexSearcher indexSearcher, String field) { this.k = k; this.parentsFilter = parentsFilter; + this.field = field; } /** @@ -62,8 +69,9 @@ public KnnCollector newCollector( if (parentBitSet == null) { return null; } + int[] docToOrd = buildDocToOrd(context); return new DiversifyingNearestChildrenKnnCollector( - k, visitedLimit, searchStrategy, parentBitSet); + k, visitedLimit, searchStrategy, parentBitSet, docToOrd); } @Override @@ -74,12 +82,44 @@ public KnnCollector newOptimisticCollector( if (parentBitSet == null) { return null; } + int[] docToOrd = buildDocToOrd(context); return new DiversifyingNearestChildrenKnnCollector( - k, visitedLimit, searchStrategy, parentBitSet); + k, visitedLimit, searchStrategy, parentBitSet, docToOrd); } @Override public boolean isOptimistic() { return true; } + + /** + * Builds a docId-to-ordinal array for the given leaf, mapping each docId to its vector ordinal. + * + *

Returns {@code null} if the field has no vector values in this segment at all — sibling + * expansion will be disabled for this leaf. + * + *

Otherwise returns an array of size {@code maxDoc} where each entry is the vector ordinal for + * that docId, or {@code -1} if that specific document has no vector (sparse indexing). + */ + private int[] buildDocToOrd(LeafReaderContext context) throws IOException { + FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); + // fi = null if the field doesn't exist in this segment at all. + // fi.getVectorDimension() = 0 if the field exist in the segment but was not indexed as a vector field. + if (fi == null || fi.getVectorDimension() == 0) { + return null; + } + DocIdSetIterator iter = + switch (fi.getVectorEncoding()) { + case FLOAT32 -> context.reader().getFloatVectorValues(field).iterator(); + case BYTE -> context.reader().getByteVectorValues(field).iterator(); + }; + int maxDoc = context.reader().maxDoc(); + int[] docToOrd = new int[maxDoc]; + Arrays.fill(docToOrd, -1); + int ord = 0; + while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + docToOrd[iter.docID()] = ord++; + } + return docToOrd; + } } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java b/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java new file mode 100644 index 000000000000..3d7fdecf31ca --- /dev/null +++ b/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.join; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.search.knn.KnnSearchStrategy; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BitSet; + +/** + * Base class for {@link DiversifyingNearestChildrenKnnCollector} tests. Provides shared helpers + * for building the block-join parent {@link BitSet} used across test subclasses. + */ +abstract class DiversifyingChildrenKnnCollectorTestCase extends LuceneTestCase { + + /** + * Builds a {@link BitSet} whose set bits are the parent doc ids in a contiguous block-join + * layout: {@code [child_0 … child_{C-1}, parent_C]}, repeated {@code numParents} times. + * + *

Example with {@code childrenPerParent=3}: parent doc ids are 3, 7, 11, … + */ + static BitSet parentBitSet(int numParents, int childrenPerParent) throws IOException { + int[] parentDocIds = new int[numParents]; + for (int p = 1; p <= numParents; p++) { + parentDocIds[p - 1] = p * (childrenPerParent + 1) - 1; + } + int totalDocs = numParents * (childrenPerParent + 1); + return BitSet.of( + new TestToParentJoinKnnResults.IntArrayDocIdSetIterator(parentDocIds, numParents), + totalDocs + 1); + } + + /** + * Builds a docId-to-ordinal array for the block-join layout. Parent docs get -1 (no vector); + * child docs get consecutive ordinals starting from 0. + */ + static int[] buildDocToOrd(int numParents, int childrenPerParent) { + int totalDocs = numParents * (childrenPerParent + 1); + int[] docToOrd = new int[totalDocs]; + Arrays.fill(docToOrd, -1); + int ord = 0; + for (int d = 0; d < totalDocs; d++) { + if ((d + 1) % (childrenPerParent + 1) != 0) { + docToOrd[d] = ord++; + } + } + return docToOrd; + } + + static DiversifyingNearestChildrenKnnCollector makeCollector( + int k, BitSet parents, int[] docToOrd) { + return new DiversifyingNearestChildrenKnnCollector( + k, Integer.MAX_VALUE, (KnnSearchStrategy) null, parents, docToOrd); + } +} diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java new file mode 100644 index 000000000000..bb170ec3d79e --- /dev/null +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.join; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector; + +/** + * Tests for sibling expansion in {@link DiversifyingNearestChildrenKnnCollector} ({@link + * org.apache.lucene.util.hnsw.DocSiblingExpansion} contract) and end-to-end correctness via + * {@link DiversifyingChildrenFloatKnnVectorQuery}. + */ +public class TestDiversifyingChildrenKnnSiblingExpansion + extends DiversifyingChildrenKnnCollectorTestCase { + + // --------------------------------------------------------------------------- + // Unit tests: DocSiblingExpansion — findSiblingDocIds + // --------------------------------------------------------------------------- + + public void testFindSiblingDocIds_returnsAllSiblings() throws IOException { + // 2 parents, 2 children each → blocks: [0,1|2], [3,4|5] + int numParents = 2, childrenPerParent = 2; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); + + // child 0: sibling is 1 + int[] s0 = c.findSiblingDocIds(0); + assertNotNull(s0); + assertArrayEquals(new int[] {1}, s0); + + // child 1: sibling is 0 + int[] s1 = c.findSiblingDocIds(1); + assertNotNull(s1); + assertArrayEquals(new int[] {0}, s1); + + // child 3 (second block): sibling is 4 + int[] s3 = c.findSiblingDocIds(3); + assertNotNull(s3); + assertArrayEquals(new int[] {4}, s3); + + // child 4 (second block): sibling is 3 + int[] s4 = c.findSiblingDocIds(4); + assertNotNull(s4); + assertArrayEquals(new int[] {3}, s4); + } + + public void testFindSiblingDocIds_parentAlreadyInHeap_returnsNull() throws IOException { + // 1 parents, 2 children → block: [0,1|2] + int numParents = 1, childrenPerParent = 2; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); + + // Collect child 0 → parent 2 enters the heap + c.collect(0, 0.8f); + + // Now asking for siblings of child 1 (same parent 2) must return null + assertNull(c.findSiblingDocIds(1)); + } + + public void testFindSiblingDocIds_singleChildParent_returnsNull() throws IOException { + // 1 parents, 1 child each → blocks: [0|1] + int numParents = 1, childrenPerParent = 1; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); + + assertNull("sole child has no siblings", c.findSiblingDocIds(0)); + } + + /** + * The trigger child must not appear in its own sibling list. Only the *other* children of the + * same parent are returned. + */ + public void testFindSiblingDocIds_excludesTriggerChild() throws IOException { + // 1 parent, 3 children: [C0, C1, C2 | P0=3] + int numParents = 1, childrenPerParent = 3; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); + + // Trigger is C1 (docId=1); expected siblings: C0 and C2 only + int[] siblings = c.findSiblingDocIds(1); + assertNotNull(siblings); + assertEquals(2, siblings.length); + for (int s : siblings) { + assertNotEquals("trigger child must not appear in sibling list", 1, s); + } + } + + // --------------------------------------------------------------------------- + // Unit tests: DocSiblingExpansion — docIdToOrdinal + // --------------------------------------------------------------------------- + + public void testDocIdToOrdinal_correctMapping() throws IOException { + // 2 parents, 2 children: docToOrd = [0, 1, -1, 2, 3, -1] + int numParents = 2, childrenPerParent = 2; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); + + assertEquals(0, c.docIdToOrdinal(0)); + assertEquals(1, c.docIdToOrdinal(1)); + assertEquals(-1, c.docIdToOrdinal(2)); // parent → no vector + assertEquals(2, c.docIdToOrdinal(3)); + assertEquals(3, c.docIdToOrdinal(4)); + assertEquals(-1, c.docIdToOrdinal(5)); // parent → no vector + assertEquals(-1, c.docIdToOrdinal(9999)); // beyond array bounds + } + + public void testDocIdToOrdinal_nullMapping_alwaysMinusOne() throws IOException { + // Collector created without a docToOrd array (sibling expansion disabled) + BitSet parents = parentBitSet(2, 2); + DiversifyingNearestChildrenKnnCollector c = + new DiversifyingNearestChildrenKnnCollector(5, Integer.MAX_VALUE, null, parents, null); + + assertEquals(-1, c.docIdToOrdinal(0)); + assertEquals(-1, c.docIdToOrdinal(1)); + } + + // --------------------------------------------------------------------------- + // Unit tests: heap replacement behaviour + // --------------------------------------------------------------------------- + + /** + * C3 is the entry point (score 0.85) but its siblings C4 (0.95) + * and C5 (0.90) are expanded immediately. The parent must be represented by C4, not C3. + */ + public void testBestSiblingReplacesFirstFoundChild() throws IOException { + // 1 parent, 3 children: [C0=0, C1=1, C2=2 | P0=3] + int numParents = 1, childrenPerParent = 3; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(1, parents, docToOrd); + + // C0 found first as entry point (like C3=0.85 in the example) + c.collect(0, 0.85f); + // Sibling expansion scores C1 (like C4=0.95) and C2 (like C5=0.90) + c.collect(1, 0.95f); + c.collect(2, 0.90f); + + TopDocs td = c.topDocs(); + assertEquals(1, td.scoreDocs.length); + assertEquals("best sibling must win over first-found child", 0.95f, td.scoreDocs[0].score, 1e-5f); + assertEquals("best child doc id must be C1", 1, td.scoreDocs[0].doc); + } + + /** + * When two parents are found and the heap is full (k=2), a sibling of the second parent that + * scores better than the trigger child replaces it. + */ + public void testBestSiblingReplacesWorseChildWhenHeapFull() throws IOException { + // 2 parents, 2 children each: [C0, C1 | P0=2], [C2, C3 | P1=5] + int numParents = 2, childrenPerParent = 2; + BitSet parents = parentBitSet(numParents, childrenPerParent); + int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); + DiversifyingNearestChildrenKnnCollector c = makeCollector(2, parents, docToOrd); + + // P0: C0 found first (0.85), C1 is better sibling (0.95) → P0 represented by C1 + c.collect(0, 0.85f); + c.collect(1, 0.95f); + + // P1: C2 found first (0.60), C3 is better sibling (0.65) + c.collect(3, 0.60f); // trigger child → P1 enters heap, now FULL (size=2=k) + c.collect(4, 0.65f); // better sibling → heap full but P1 already present, updates entry + + TopDocs td = c.topDocs(); + assertEquals(2, td.scoreDocs.length); + assertEquals("P0 best child score", 0.95f, td.scoreDocs[0].score, 1e-5f); + assertEquals("P1 best child score", 0.65f, td.scoreDocs[1].score, 1e-5f); + assertEquals("P0 best child doc", 1, td.scoreDocs[0].doc); + assertEquals("P1 best child doc", 4, td.scoreDocs[1].doc); + } + + // --------------------------------------------------------------------------- + // Unit tests: OrdinalTranslatedKnnCollector — pendingSiblingOrdinals + // --------------------------------------------------------------------------- + + /** + * Siblings whose ordinal is already in visitedOrds must be filtered out to prevent + * double-scoring when the sibling was independently discovered via normal graph traversal. + */ + public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOException { + // 1 parent, 3 children: [C0=doc0, C1=doc1, C2=doc2 | P0=doc3] + // docToOrd=[0,1,2,-1]; ordToDoc=[0,1,2] + BitSet parents = parentBitSet(1, 3); + int[] docToOrd = buildDocToOrd(1, 3); + int[] ordToDoc = {0, 1, 2}; + OrdinalTranslatedKnnCollector collector = + new OrdinalTranslatedKnnCollector(makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); + + // Mark C1 (ordinal 1) as already visited by normal graph traversal + FixedBitSet visited = new FixedBitSet(4); + visited.set(1); + + // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 and C2 + int[] result = collector.pendingSiblingOrdinals(0, visited); + assertNotNull(result); + assertArrayEquals("C1 must be filtered (visited); only C2 remains", new int[]{2}, result); + } + + /** + * Siblings with no vector in this field (docIdToOrdinal returns -1) must be skipped because + * they have no node in the HNSW graph and cannot be scored. + */ + public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOException { + // 1 parent, 3 children but C1 has no vector: + // [C0=doc0, C1(sparse)=doc1, C2=doc2 | P0=doc3] + // docToOrd=[0,-1,1,-1]; ordToDoc=[0,2] + BitSet parents = parentBitSet(1, 3); + int[] docToOrd = {0, -1, 1, -1}; + int[] ordToDoc = {0, 2}; + OrdinalTranslatedKnnCollector collector = + new OrdinalTranslatedKnnCollector(makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); + + FixedBitSet visited = new FixedBitSet(4); + + // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 (sparse) and C2 + int[] result = collector.pendingSiblingOrdinals(0, visited); + assertNotNull(result); + assertArrayEquals("C1 (no vector) must be filtered; only C2 remains", new int[]{1}, result); + } + + // --------------------------------------------------------------------------- + // Index fixtures (used by integration tests only) + // --------------------------------------------------------------------------- + + private static final String FIELD = "vec"; + private static final int DIM = 4; + + /** + * Builds a block-join float-vector index. Parent p has numChildren children; child c of parent p + * gets vector: v[i] = (p * numChildren + c + i) * 0.1, then normalised for COSINE/DOT_PRODUCT. + */ + private Directory buildIndex( + int numParents, int numChildren, VectorSimilarityFunction sim) throws IOException { + Directory dir = newDirectory(); + try (IndexWriter w = + new IndexWriter( + dir, new IndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { + for (int p = 0; p < numParents; p++) { + List block = new ArrayList<>(); + for (int c = 0; c < numChildren; c++) { + float[] vec = childVector(p, c, numChildren); + if (sim == VectorSimilarityFunction.COSINE || sim == VectorSimilarityFunction.DOT_PRODUCT) { + normalise(vec); + } + Document child = new Document(); + child.add(new KnnFloatVectorField(FIELD, vec, sim)); + child.add(new StoredField("parent", p)); + block.add(child); + } + Document parent = new Document(); + parent.add(new StringField("docType", "_parent", Field.Store.NO)); + parent.add(new StoredField("parent", p)); + block.add(parent); + w.addDocuments(block); + } + } + return dir; + } + + private static float[] childVector(int parent, int child, int numChildren) { + float[] vec = new float[DIM]; + for (int i = 0; i < DIM; i++) { + vec[i] = (parent * numChildren + child + i + 1) * 0.1f; + } + return vec; + } + + private static void normalise(float[] vec) { + float norm = 0; + for (float v : vec) norm += v * v; + norm = (float) Math.sqrt(norm); + if (norm > 0) { + for (int i = 0; i < DIM; i++) vec[i] /= norm; + } + } + + /** Computes brute-force top-k scores: max similarity per parent, sorted descending. */ + private static float[] bruteForceTopK( + float[] query, + int numParents, + int numChildren, + VectorSimilarityFunction sim, + int k) { + float[] parentBest = new float[numParents]; + Arrays.fill(parentBest, Float.NEGATIVE_INFINITY); + for (int p = 0; p < numParents; p++) { + for (int c = 0; c < numChildren; c++) { + float[] vec = childVector(p, c, numChildren); + if (sim == VectorSimilarityFunction.COSINE || sim == VectorSimilarityFunction.DOT_PRODUCT) { + normalise(vec); + } + float score = sim.compare(query, vec); + if (score > parentBest[p]) parentBest[p] = score; + } + } + float[] sorted = parentBest.clone(); + Arrays.sort(sorted); + int n = Math.min(k, numParents); + float[] top = new float[n]; + for (int i = 0; i < n; i++) top[i] = sorted[numParents - 1 - i]; + return top; + } + + // --------------------------------------------------------------------------- + // Integration tests: end-to-end correctness via DiversifyingChildrenFloatKnnVectorQuery + // --------------------------------------------------------------------------- + + /** + * Each result doc must belong to a distinct parent. Sibling expansion must not cause the same + * parent to appear multiple times. + */ + public void testSiblingExpansionNoDuplicateParents() throws Exception { + int numParents = 15, numChildren = 4, k = 8; + VectorSimilarityFunction sim = VectorSimilarityFunction.EUCLIDEAN; + float[] query = {1f, 0f, 0f, 0f}; + + try (Directory dir = buildIndex(numParents, numChildren, sim); + IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + BitSetProducer parentFilter = + new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + + Query knnQuery = + new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter); + TopDocs results = searcher.search(knnQuery, k); + + Set seenParents = new HashSet<>(); + for (ScoreDoc sd : results.scoreDocs) { + int parentId = + reader.storedFields().document(sd.doc).getField("parent").numericValue().intValue(); + assertTrue("parent " + parentId + " appeared more than once", seenParents.add(parentId)); + } + } + } + + /** + * When every parent has exactly one child there are no siblings to expand. Results must still be + * correct. + */ + public void testSiblingExpansionSingleChildParents() throws Exception { + int numParents = 12, numChildren = 1, k = 5; + VectorSimilarityFunction sim = VectorSimilarityFunction.EUCLIDEAN; + float[] query = {0.2f, 0.3f, 0.4f, 0.5f}; + + try (Directory dir = buildIndex(numParents, numChildren, sim); + IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + BitSetProducer parentFilter = + new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + + Query knnQuery = + new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter); + TopDocs results = searcher.search(knnQuery, k); + + float[] expected = bruteForceTopK(query, numParents, numChildren, sim, k); + assertEquals(Math.min(k, numParents), results.scoreDocs.length); + for (int i = 0; i < results.scoreDocs.length; i++) { + assertEquals("score at rank " + i, expected[i], results.scoreDocs[i].score, 1e-4f); + } + } + } + + /** + * The query is close to one parent's children; sibling expansion must find the best child of each discovered parent + * rather than whichever child the graph traversal happens to reach first. + */ + public void testSiblingExpansion_bestChildPerParentFound() throws Exception { + int numParents = 3, numChildren = 3, k = 2; + VectorSimilarityFunction sim = VectorSimilarityFunction.EUCLIDEAN; + float[] query = {0.9f, 0.9f, 0.9f, 0.9f}; + + try (Directory dir = buildIndex(numParents, numChildren, sim); + IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + BitSetProducer parentFilter = + new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + CheckJoinIndex.check(reader, parentFilter); + + Query knnQuery = + new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter); + TopDocs results = searcher.search(knnQuery, k); + + assertEquals(k, results.scoreDocs.length); + for (ScoreDoc sd : results.scoreDocs) { + int parentIdx = reader.storedFields().document(sd.doc) + .getField("parent").numericValue().intValue(); + // verify no other child of the same parent scores higher than the returned one + for (int c = 0; c < numChildren; c++) { + float[] vec = childVector(parentIdx, c, numChildren); + float cScore = sim.compare(query, vec); + assertTrue( + "parent " + parentIdx + " has a better child (score " + cScore + + ") than returned doc " + sd.doc + " (score " + sd.score + ")", + cScore <= sd.score + 1e-4f); + } + } + } + } + + /** + * With a single parent and many children, sibling expansion must score all children so the best + * one is returned. Without expansion the graph might stop early and miss the best child. + */ + public void testSiblingExpansion_singleParentManyChildren() throws Exception { + int numParents = 1, numChildren = 8, k = 1; + VectorSimilarityFunction sim = VectorSimilarityFunction.EUCLIDEAN; + float[] query = {0.9f, 0.8f, 0.7f, 0.6f}; + + try (Directory dir = buildIndex(numParents, numChildren, sim); + IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + BitSetProducer parentFilter = + new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + CheckJoinIndex.check(reader, parentFilter); + + Query knnQuery = + new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter); + TopDocs results = searcher.search(knnQuery, k); + + float[] expected = bruteForceTopK(query, numParents, numChildren, sim, k); + assertEquals(1, results.scoreDocs.length); + assertEquals("best child of single parent", expected[0], results.scoreDocs[0].score, 1e-4f); + } + } +} diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingNearestChildrenKnnCollectorPerformance.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingNearestChildrenKnnCollectorPerformance.java index 99a1384144b1..14addde94549 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingNearestChildrenKnnCollectorPerformance.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingNearestChildrenKnnCollectorPerformance.java @@ -19,7 +19,6 @@ import java.io.IOException; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BitSet; /** @@ -28,27 +27,13 @@ *

Correctness tests verify behaviour of the collector in various scenarios, including edge * cases. */ -public class TestDiversifyingNearestChildrenKnnCollectorPerformance extends LuceneTestCase { - - /** Builds a BitSet whose set bits are the parent doc ids in a contiguous block-join layout. */ - private static BitSet parentBitSet(int numParents, int childrenPerParent) throws IOException { - int[] parentDocIds = new int[numParents]; - for (int p = 1; p <= numParents; p++) { - // layout: [child_0 … child_{C-1}, parent_C], repeated - // e.g. with 3 children per parent: [0,1,2,3, 4,5,6,7, 8,9,10,11, ...] → parent doc ids are - // 3,7,11,... - parentDocIds[p - 1] = p * (childrenPerParent + 1) - 1; - } - int totalDocs = numParents * (childrenPerParent + 1); // children + 1 parent per block - return BitSet.of( - new TestToParentJoinKnnResults.IntArrayDocIdSetIterator(parentDocIds, numParents), - totalDocs + 1); - } +public class TestDiversifyingNearestChildrenKnnCollectorPerformance + extends DiversifyingChildrenKnnCollectorTestCase { /** Collects all children in order and returns topDocs. */ private static TopDocs collectAll(int k, BitSet parents, int[] childIds, float[] scores) { DiversifyingNearestChildrenKnnCollector collector = - new DiversifyingNearestChildrenKnnCollector(k, Integer.MAX_VALUE, parents); + new DiversifyingNearestChildrenKnnCollector(k, Integer.MAX_VALUE, null, parents, null); for (int i = 0; i < childIds.length; i++) { collector.collect(childIds[i], scores[i]); } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java b/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java index 175e1f264400..212990bfac88 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java @@ -33,7 +33,7 @@ public void testNeighborsProduct() throws IOException { // make sure we have the sign correct BitSet parentBitSet = BitSet.of(new IntArrayDocIdSetIterator(new int[] {1, 3, 5}, 3), 6); DiversifyingNearestChildrenKnnCollector nn = - new DiversifyingNearestChildrenKnnCollector(2, Integer.MAX_VALUE, parentBitSet); + new DiversifyingNearestChildrenKnnCollector(2, Integer.MAX_VALUE, null, parentBitSet, null); assertTrue(nn.collect(2, 0.5f)); assertTrue(nn.collect(0, 0.2f)); assertTrue(nn.collect(4, 1f)); @@ -48,7 +48,7 @@ public void testInsertions() throws IOException { float[] scores = new float[] {1f, 0.5f, 0.6f, 2f, 2f, 1.2f, 4f}; BitSet parentBitSet = BitSet.of(new IntArrayDocIdSetIterator(new int[] {3, 6, 9, 12}, 4), 13); DiversifyingNearestChildrenKnnCollector results = - new DiversifyingNearestChildrenKnnCollector(7, Integer.MAX_VALUE, parentBitSet); + new DiversifyingNearestChildrenKnnCollector(7, Integer.MAX_VALUE, null, parentBitSet, null); for (int i = 0; i < nodes.length; i++) { results.collect(nodes[i], scores[i]); } @@ -69,7 +69,7 @@ public void testInsertionWithOverflow() throws IOException { BitSet parentBitSet = BitSet.of(new IntArrayDocIdSetIterator(new int[] {3, 6, 9, 11, 13, 15}, 6), 16); DiversifyingNearestChildrenKnnCollector results = - new DiversifyingNearestChildrenKnnCollector(5, Integer.MAX_VALUE, parentBitSet); + new DiversifyingNearestChildrenKnnCollector(5, Integer.MAX_VALUE, null, parentBitSet, null); for (int i = 0; i < nodes.length - 1; i++) { results.collect(nodes[i], scores[i]); } @@ -104,7 +104,7 @@ public void testRandomInsertionsWithOverflow() throws IOException { BitSet parentBitSet = BitSet.of(new IntArrayDocIdSetIterator(parents, parents.length), nextParent + 1); DiversifyingNearestChildrenKnnCollector results = - new DiversifyingNearestChildrenKnnCollector(20, Integer.MAX_VALUE, parentBitSet); + new DiversifyingNearestChildrenKnnCollector(20, Integer.MAX_VALUE, null, parentBitSet, null); for (int i = 0; i < children.size(); i++) { results.collect(children.get(i), childrenScores.get(i)); } From b1f6b868a0e0a5a8334138b733cd7f4089dceb53 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Mon, 4 May 2026 15:52:00 +0200 Subject: [PATCH 05/23] Added benchmark for measuring speed --- lucene/benchmark-jmh/build.gradle | 1 + .../benchmark-jmh/src/java/module-info.java | 1 + ...DiversifyingChildrenKnnQueryBenchmark.java | 163 ++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle index 6f874e410b9b..6d64bb7a4f72 100644 --- a/lucene/benchmark-jmh/build.gradle +++ b/lucene/benchmark-jmh/build.gradle @@ -20,6 +20,7 @@ description = 'Lucene JMH micro-benchmarking module' dependencies { moduleImplementation project(':lucene:core') moduleImplementation project(':lucene:expressions') + moduleImplementation project(':lucene:join') moduleImplementation project(':lucene:sandbox') moduleTestImplementation project(':lucene:test-framework') diff --git a/lucene/benchmark-jmh/src/java/module-info.java b/lucene/benchmark-jmh/src/java/module-info.java index bb6b9d516bb0..8090c7554739 100644 --- a/lucene/benchmark-jmh/src/java/module-info.java +++ b/lucene/benchmark-jmh/src/java/module-info.java @@ -24,6 +24,7 @@ requires jdk.unsupported; requires org.apache.lucene.core; requires org.apache.lucene.expressions; + requires org.apache.lucene.join; requires org.apache.lucene.sandbox; requires commons.math3; diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java new file mode 100644 index 000000000000..7940b1fd2bc6 --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.util.IOUtils; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Benchmarks end-to-end latency of {@link DiversifyingChildrenFloatKnnVectorQuery} with sibling + * expansion enabled. Run with: + * + *

+ *   ./gradlew -p lucene/benchmark-jmh jmh --args="DiversifyingChildrenKnnQueryBenchmark"
+ * 
+ */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 4, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork( + value = 3, + jvmArgsAppend = {"-Xmx4g", "-Xms4g", "-XX:+AlwaysPreTouch"}) +public class DiversifyingChildrenKnnQueryBenchmark { + + private static final String FIELD = "vec"; + private static final String PARENT_FIELD = "docType"; + private static final String PARENT_VALUE = "_parent"; + private static final int NUM_QUERY_VECTORS = 256; + + @Param({"5000"}) + public int numParents; + + @Param({"4", "8"}) + public int childrenPerParent; + + @Param({"10"}) + public int k; + + @Param({"128"}) + public int dim; + + private Path tmpDir; + private Directory dir; + private IndexReader reader; + private IndexSearcher searcher; + private QueryBitSetProducer parentFilter; + private float[][] queryVectors; + private int queryIdx; + + @Setup(Level.Trial) + public void setup() throws IOException { + tmpDir = Files.createTempDirectory("DiversifyingChildrenKnnQueryBenchmark"); + dir = MMapDirectory.open(tmpDir); + + Random rnd = new Random(42); + try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { + for (int p = 0; p < numParents; p++) { + List block = new ArrayList<>(); + for (int c = 0; c < childrenPerParent; c++) { + Document child = new Document(); + child.add( + new KnnFloatVectorField(FIELD, randomUnitVector(dim, rnd), VectorSimilarityFunction.DOT_PRODUCT)); + block.add(child); + } + Document parent = new Document(); + parent.add(new StringField(PARENT_FIELD, PARENT_VALUE, Field.Store.NO)); + block.add(parent); + w.addDocuments(block); + } + w.forceMerge(1); + } + + reader = DirectoryReader.open(dir); + searcher = new IndexSearcher(reader); + parentFilter = new QueryBitSetProducer(new TermQuery(new Term(PARENT_FIELD, PARENT_VALUE))); + + Random qrnd = new Random(123); + queryVectors = new float[NUM_QUERY_VECTORS][]; + for (int i = 0; i < NUM_QUERY_VECTORS; i++) { + queryVectors[i] = randomUnitVector(dim, qrnd); + } + } + + @TearDown(Level.Trial) + public void teardown() throws IOException { + IOUtils.close(reader, dir); + IOUtils.rm(tmpDir); + } + + @Benchmark + public TopDocs search() throws IOException { + float[] query = queryVectors[queryIdx++ & (NUM_QUERY_VECTORS - 1)]; + Query knnQuery = + new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter); + return searcher.search(knnQuery, k); + } + + private static float[] randomUnitVector(int dim, Random rnd) { + float[] v = new float[dim]; + float norm = 0; + for (int i = 0; i < dim; i++) { + v[i] = rnd.nextFloat() * 2 - 1; + norm += v[i] * v[i]; + } + norm = (float) Math.sqrt(norm); + for (int i = 0; i < dim; i++) { + v[i] /= norm; + } + return v; + } +} \ No newline at end of file From 9ab84ad1334cfc1cc99d86035f13fe120636d0c7 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Mon, 4 May 2026 16:23:46 +0200 Subject: [PATCH 06/23] Added cache and update benchmark for testing different scenarios --- ...DiversifyingChildrenKnnQueryBenchmark.java | 63 +++++++++++++++---- ...ingNearestChildrenKnnCollectorManager.java | 43 ++++++++++++- 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java index 7940b1fd2bc6..f31e74de75a1 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java @@ -58,10 +58,21 @@ /** * Benchmarks end-to-end latency of {@link DiversifyingChildrenFloatKnnVectorQuery} with sibling - * expansion enabled. Run with: + * expansion enabled across three sibling-correlation scenarios: + * + *
    + *
  • best — siblings are nearly identical (small noise around a parent centroid). + * Expansion finds the best sibling immediately; HNSW terminates early. + *
  • standard — siblings have moderate correlation (realistic use case). + *
  • worst — siblings are fully independent random vectors. Expansion fires but adds no + * recall benefit; measures pure overhead. + *
+ * + * Run with: * *
- *   ./gradlew -p lucene/benchmark-jmh jmh --args="DiversifyingChildrenKnnQueryBenchmark"
+ *   ./gradlew -p lucene/benchmark-jmh assemble
+ *   java -jar lucene/benchmark-jmh/build/benchmarks/lucene-benchmark-jmh-*.jar DiversifyingChildrenKnnQueryBenchmark
  * 
*/ @BenchmarkMode(Mode.AverageTime) @@ -79,6 +90,17 @@ public class DiversifyingChildrenKnnQueryBenchmark { private static final String PARENT_VALUE = "_parent"; private static final int NUM_QUERY_VECTORS = 256; + /** + * Sibling correlation scenario: + *
    + *
  • {@code best} — siblings nearly identical (noise = 0.05); best case for expansion. + *
  • {@code standard} — siblings moderately correlated (noise = 0.3); realistic case. + *
  • {@code worst} — siblings fully random; pure overhead, no recall benefit. + *
+ */ + @Param({"best", "standard", "worst"}) + public String siblingCorrelation; + @Param({"5000"}) public int numParents; @@ -104,14 +126,24 @@ public void setup() throws IOException { tmpDir = Files.createTempDirectory("DiversifyingChildrenKnnQueryBenchmark"); dir = MMapDirectory.open(tmpDir); + float noiseLevel = + switch (siblingCorrelation) { + case "best" -> 0.05f; + case "standard" -> 0.30f; + default -> Float.NaN; // worst: fully random, no centroid + }; + Random rnd = new Random(42); try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { for (int p = 0; p < numParents; p++) { + float[] centroid = Float.isNaN(noiseLevel) ? null : randomUnitVector(dim, rnd); List block = new ArrayList<>(); for (int c = 0; c < childrenPerParent; c++) { + float[] vec = centroid == null + ? randomUnitVector(dim, rnd) + : perturbedUnitVector(centroid, noiseLevel, rnd); Document child = new Document(); - child.add( - new KnnFloatVectorField(FIELD, randomUnitVector(dim, rnd), VectorSimilarityFunction.DOT_PRODUCT)); + child.add(new KnnFloatVectorField(FIELD, vec, VectorSimilarityFunction.DOT_PRODUCT)); block.add(child); } Document parent = new Document(); @@ -149,15 +181,24 @@ public TopDocs search() throws IOException { private static float[] randomUnitVector(int dim, Random rnd) { float[] v = new float[dim]; - float norm = 0; - for (int i = 0; i < dim; i++) { - v[i] = rnd.nextFloat() * 2 - 1; - norm += v[i] * v[i]; + for (int i = 0; i < dim; i++) v[i] = rnd.nextFloat() * 2 - 1; + return normalise(v); + } + + /** Returns a unit vector near {@code centroid} with per-dimension noise scaled by noiseLevel. */ + private static float[] perturbedUnitVector(float[] centroid, float noiseLevel, Random rnd) { + float[] v = new float[centroid.length]; + for (int i = 0; i < centroid.length; i++) { + v[i] = centroid[i] + noiseLevel * (rnd.nextFloat() * 2 - 1); } + return normalise(v); + } + + private static float[] normalise(float[] v) { + float norm = 0; + for (float x : v) norm += x * x; norm = (float) Math.sqrt(norm); - for (int i = 0; i < dim; i++) { - v[i] /= norm; - } + for (int i = 0; i < v.length; i++) v[i] /= norm; return v; } } \ No newline at end of file diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java index 13071bc84fac..03e1267b8c47 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java @@ -19,7 +19,9 @@ import java.io.IOException; import java.util.Arrays; +import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; @@ -34,6 +36,15 @@ */ public class DiversifyingNearestChildrenKnnCollectorManager implements KnnCollectorManager { + // Sentinel stored in the cache to represent "this segment has no vectors for the field", + // because ConcurrentHashMap does not allow null values. + private static final int[] NO_VECTORS = new int[0]; + + // Cache keyed by (segment core cache key → (field name → docToOrd array)). + // Entries are evicted automatically when the segment is closed via addClosedListener. + private static final ConcurrentHashMap> + DOC_TO_ORD_CACHE = new ConcurrentHashMap<>(); + // the number of docs to collect private final int k; // filter identifying the parent documents. @@ -69,7 +80,7 @@ public KnnCollector newCollector( if (parentBitSet == null) { return null; } - int[] docToOrd = buildDocToOrd(context); + int[] docToOrd = getCachedDocToOrd(context); return new DiversifyingNearestChildrenKnnCollector( k, visitedLimit, searchStrategy, parentBitSet, docToOrd); } @@ -82,7 +93,7 @@ public KnnCollector newOptimisticCollector( if (parentBitSet == null) { return null; } - int[] docToOrd = buildDocToOrd(context); + int[] docToOrd = getCachedDocToOrd(context); return new DiversifyingNearestChildrenKnnCollector( k, visitedLimit, searchStrategy, parentBitSet, docToOrd); } @@ -92,6 +103,34 @@ public boolean isOptimistic() { return true; } + /** + * Returns the docId-to-ordinal array for the given leaf, building and caching it on first access. + * The cached array is evicted automatically when the segment closes. + */ + private int[] getCachedDocToOrd(LeafReaderContext context) throws IOException { + IndexReader.CacheHelper cacheHelper = context.reader().getCoreCacheHelper(); + if (cacheHelper == null) { + return buildDocToOrd(context); + } + IndexReader.CacheKey cacheKey = cacheHelper.getKey(); + ConcurrentHashMap fieldMap = new ConcurrentHashMap<>(); + ConcurrentHashMap existing = DOC_TO_ORD_CACHE.putIfAbsent(cacheKey, fieldMap); + if (existing == null) { + // We inserted the new entry — register cleanup when the segment closes + cacheHelper.addClosedListener(DOC_TO_ORD_CACHE::remove); + } else { + fieldMap = existing; + } + int[] cached = fieldMap.get(field); + if (cached != null) { + return cached == NO_VECTORS ? null : cached; + } + int[] built = buildDocToOrd(context); + int[] stored = built != null ? built : NO_VECTORS; + int[] race = fieldMap.putIfAbsent(field, stored); + return race != null ? (race == NO_VECTORS ? null : race) : built; + } + /** * Builds a docId-to-ordinal array for the given leaf, mapping each docId to its vector ordinal. * From 07421fb2148f2c6bd3d3af0684e5f12f8420ded0 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Mon, 4 May 2026 17:22:55 +0200 Subject: [PATCH 07/23] Updated tests for triggering early termination --- .../benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java index f31e74de75a1..649c58e9fb56 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java @@ -104,10 +104,10 @@ public class DiversifyingChildrenKnnQueryBenchmark { @Param({"5000"}) public int numParents; - @Param({"4", "8"}) + @Param({"4", "8", "16"}) public int childrenPerParent; - @Param({"10"}) + @Param({"10", "100"}) public int k; @Param({"128"}) From 8fae7fbd1cfe1c8fd56340e0d16b29e6aef2e1af Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Tue, 5 May 2026 14:50:14 +0200 Subject: [PATCH 08/23] Added comments to tests --- ...DiversifyingChildrenKnnQueryBenchmark.java | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java index 649c58e9fb56..d99424466304 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java @@ -78,8 +78,11 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) @State(Scope.Benchmark) +// 4 iterations 1 second each - results discarded @Warmup(iterations = 4, time = 1) +// 5 iterations 1 second each - results recorded (how many calls we can do in 1 sec) @Measurement(iterations = 5, time = 1) +// 3 separate JVM processes @Fork( value = 3, jvmArgsAppend = {"-Xmx4g", "-Xms4g", "-XX:+AlwaysPreTouch"}) @@ -126,41 +129,55 @@ public void setup() throws IOException { tmpDir = Files.createTempDirectory("DiversifyingChildrenKnnQueryBenchmark"); dir = MMapDirectory.open(tmpDir); + // How much siblings are near to each other float noiseLevel = switch (siblingCorrelation) { - case "best" -> 0.05f; - case "standard" -> 0.30f; + case "best" -> 0.05f; // nearly identical + case "standard" -> 0.30f; // moderately correlated default -> Float.NaN; // worst: fully random, no centroid }; Random rnd = new Random(42); + // index creation try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { + // 5000 parents for (int p = 0; p < numParents; p++) { + // vector of 128-dim float[] centroid = Float.isNaN(noiseLevel) ? null : randomUnitVector(dim, rnd); List block = new ArrayList<>(); + // 4 - 8 - 16 children per parent for (int c = 0; c < childrenPerParent; c++) { float[] vec = centroid == null ? randomUnitVector(dim, rnd) : perturbedUnitVector(centroid, noiseLevel, rnd); + // create child doc Document child = new Document(); child.add(new KnnFloatVectorField(FIELD, vec, VectorSimilarityFunction.DOT_PRODUCT)); + // add to the index block block.add(child); } + // create parent document Document parent = new Document(); + // docType = _parent parent.add(new StringField(PARENT_FIELD, PARENT_VALUE, Field.Store.NO)); + // add to the index block block.add(parent); + // add to the index writer w.addDocuments(block); } + // compress to one segment w.forceMerge(1); } reader = DirectoryReader.open(dir); searcher = new IndexSearcher(reader); + // parent filter docType = _parent parentFilter = new QueryBitSetProducer(new TermQuery(new Term(PARENT_FIELD, PARENT_VALUE))); Random qrnd = new Random(123); queryVectors = new float[NUM_QUERY_VECTORS][]; for (int i = 0; i < NUM_QUERY_VECTORS; i++) { + // random query vectors queryVectors[i] = randomUnitVector(dim, qrnd); } } @@ -173,6 +190,8 @@ public void teardown() throws IOException { @Benchmark public TopDocs search() throws IOException { + // benchmarked part - search + // iterates on all the queries in a round-robin float[] query = queryVectors[queryIdx++ & (NUM_QUERY_VECTORS - 1)]; Query knnQuery = new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter); @@ -194,6 +213,7 @@ private static float[] perturbedUnitVector(float[] centroid, float noiseLevel, R return normalise(v); } + // Since we use DOT PRODUCT private static float[] normalise(float[] v) { float norm = 0; for (float x : v) norm += x * x; @@ -201,4 +221,4 @@ private static float[] normalise(float[] v) { for (int i = 0; i < v.length; i++) v[i] /= norm; return v; } -} \ No newline at end of file +} From 32d406fe0616360039054328f7a924fd6c8d3369 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Tue, 5 May 2026 15:57:37 +0200 Subject: [PATCH 09/23] Gradlew tidy --- ...DiversifyingChildrenKnnQueryBenchmark.java | 12 ++-- .../util/hnsw/AbstractHnswGraphSearcher.java | 8 ++- .../util/hnsw/ChildrenSiblingExpansion.java | 13 ++-- .../lucene/util/hnsw/DocSiblingExpansion.java | 2 +- .../lucene/util/hnsw/HnswGraphSearcher.java | 8 ++- .../hnsw/OrdinalTranslatedKnnCollector.java | 10 +-- ...versifyingNearestChildrenKnnCollector.java | 10 ++- ...ingNearestChildrenKnnCollectorManager.java | 3 +- ...ersifyingChildrenKnnCollectorTestCase.java | 4 +- ...versifyingChildrenKnnSiblingExpansion.java | 69 ++++++++++--------- .../join/TestToParentJoinKnnResults.java | 3 +- 11 files changed, 85 insertions(+), 57 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java index d99424466304..f6f0f10358c8 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DiversifyingChildrenKnnQueryBenchmark.java @@ -95,6 +95,7 @@ public class DiversifyingChildrenKnnQueryBenchmark { /** * Sibling correlation scenario: + * *
    *
  • {@code best} — siblings nearly identical (noise = 0.05); best case for expansion. *
  • {@code standard} — siblings moderately correlated (noise = 0.3); realistic case. @@ -132,9 +133,9 @@ public void setup() throws IOException { // How much siblings are near to each other float noiseLevel = switch (siblingCorrelation) { - case "best" -> 0.05f; // nearly identical + case "best" -> 0.05f; // nearly identical case "standard" -> 0.30f; // moderately correlated - default -> Float.NaN; // worst: fully random, no centroid + default -> Float.NaN; // worst: fully random, no centroid }; Random rnd = new Random(42); @@ -147,9 +148,10 @@ public void setup() throws IOException { List block = new ArrayList<>(); // 4 - 8 - 16 children per parent for (int c = 0; c < childrenPerParent; c++) { - float[] vec = centroid == null - ? randomUnitVector(dim, rnd) - : perturbedUnitVector(centroid, noiseLevel, rnd); + float[] vec = + centroid == null + ? randomUnitVector(dim, rnd) + : perturbedUnitVector(centroid, noiseLevel, rnd); // create child doc Document child = new Document(); child.add(new KnnFloatVectorField(FIELD, vec, VectorSimilarityFunction.DOT_PRODUCT)); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index 03836fa8d006..dc6ae9560706 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -122,7 +122,13 @@ protected static void scoreEntryPoints( if (numSiblingsToVisit > 0) { siblingScores = scoreSiblings( - results, scorer, candidates, acceptOrds, siblings, numSiblingsToVisit, siblingScores); + results, + scorer, + candidates, + acceptOrds, + siblings, + numSiblingsToVisit, + siblingScores); } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java index c176c166ceab..ee9823cee925 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java @@ -26,18 +26,17 @@ * collected without requiring full graph traversal. * *

    The graph searcher calls {@link #pendingSiblingOrdinals} before invoking {@link - * org.apache.lucene.search.KnnCollector#collect} for the triggering node, so that the collector - * can safely inspect its internal state before the parent is registered. + * org.apache.lucene.search.KnnCollector#collect} for the triggering node, so that the collector can + * safely inspect its internal state before the parent is registered. * * @lucene.experimental */ public interface ChildrenSiblingExpansion { /** - * Called by the HNSW graph searcher before {@link - * org.apache.lucene.search.KnnCollector#collect} is invoked for {@code collectedOrdinal}. The - * implementation inspects the node's parent and returns the ordinals of unvisited siblings that - * should be scored immediately. + * Called by the HNSW graph searcher before {@link org.apache.lucene.search.KnnCollector#collect} + * is invoked for {@code collectedOrdinal}. The implementation inspects the node's parent and + * returns the ordinals of unvisited siblings that should be scored immediately. * *

    The caller is responsible for marking the returned ordinals as visited in {@code * visitedOrds} and for scoring and collecting them. @@ -48,4 +47,4 @@ public interface ChildrenSiblingExpansion { * @return ordinals of sibling nodes to score next, or {@code null} if none */ int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds); -} \ No newline at end of file +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java index 4882ade8c2ee..6832948f7386 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -47,4 +47,4 @@ public interface DocSiblingExpansion { * @return the vector ordinal, or {@code -1} */ int docIdToOrdinal(int docId); -} \ No newline at end of file +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 2bac2e046788..e64cbed10a0c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -376,7 +376,13 @@ void searchLevel( float prevMinSim = results.minCompetitiveSimilarity(); siblingScores = scoreSiblings( - results, scorer, candidates, acceptOrds, siblings, numSiblingsToVisit, siblingScores); + results, + scorer, + candidates, + acceptOrds, + siblings, + numSiblingsToVisit, + siblingScores); if (results.minCompetitiveSimilarity() > prevMinSim) { minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); shouldExploreMinSim = true; diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 35f15e81193c..4814e799826b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -25,8 +25,8 @@ /** * Wraps a provided KnnCollector object, translating the provided vectorId ordinal to a documentId. - * This wrapper implements {@link ChildrenSiblingExpansion}; sibling expansion is active only - * when the wrapped collector also implements {@link DocSiblingExpansion}. + * This wrapper implements {@link ChildrenSiblingExpansion}; sibling expansion is active only when + * the wrapped collector also implements {@link DocSiblingExpansion}. */ public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator implements ChildrenSiblingExpansion { @@ -67,13 +67,15 @@ public int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds) { return null; } int[] siblingOrdinals = new int[siblingDocIds.length]; - // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 so this variable is necessary. + // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 + // so this variable is necessary. int count = 0; for (int sibDocId : siblingDocIds) { int sibOrd = docExpander.docIdToOrdinal(sibDocId); // sibOrd = -1 when a document has no vector for this field. // Such a doc has no node in the HNSW graph and can't be scored, so it must be skipped. - // If a sibling was reached via normal graph traversal before sibling expansion triggered, re-adding it would + // If a sibling was reached via normal graph traversal before sibling expansion triggered, + // re-adding it would // cause it to be scored twice. !visitedOrds.get(sibOrd) filters those out. if (sibOrd >= 0 && !visitedOrds.get(sibOrd)) { siblingOrdinals[count++] = sibOrd; diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index e2ca8fbdb10e..1cd2d8f32ea7 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -142,12 +142,16 @@ public int[] findSiblingDocIds(int childDocId) { @Override public int docIdToOrdinal(int docId) { // Conditions explanation: - // docToOrd == null — buildDocToOrd returns null when the segment has no vector values at all for the field (line 113). Without this guard you'd get a NullPointerException. - // docId >= docToOrd.length — In production, docToOrd is sized exactly maxDoc for that segment (line 116), so every valid docId in that segment fits. This check is therefore a pure defensive guard — it can't be triggered by + // docToOrd == null — buildDocToOrd returns null when the segment has no vector values at all + // for the field (line 113). Without this guard you'd get a NullPointerException. + // docId >= docToOrd.length — In production, docToOrd is sized exactly maxDoc for that segment + // (line 116), so every valid docId in that segment fits. This check is therefore a pure + // defensive guard — it can't be triggered by // correct production code, but it protects against: // - bugs in how the array is built or passed // - future refactoring that changes the array size - // - misuse when constructing the collector manually (as in tests, e.g. the docIdToOrdinal(9999) test case) + // - misuse when constructing the collector manually (as in tests, e.g. the + // docIdToOrdinal(9999) test case) if (docToOrd == null || docId >= docToOrd.length) { return -1; } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java index 03e1267b8c47..9914b285e0ac 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java @@ -143,7 +143,8 @@ private int[] getCachedDocToOrd(LeafReaderContext context) throws IOException { private int[] buildDocToOrd(LeafReaderContext context) throws IOException { FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); // fi = null if the field doesn't exist in this segment at all. - // fi.getVectorDimension() = 0 if the field exist in the segment but was not indexed as a vector field. + // fi.getVectorDimension() = 0 if the field exist in the segment but was not indexed as a vector + // field. if (fi == null || fi.getVectorDimension() == 0) { return null; } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java b/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java index 3d7fdecf31ca..82ce76196a73 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/DiversifyingChildrenKnnCollectorTestCase.java @@ -24,8 +24,8 @@ import org.apache.lucene.util.BitSet; /** - * Base class for {@link DiversifyingNearestChildrenKnnCollector} tests. Provides shared helpers - * for building the block-join parent {@link BitSet} used across test subclasses. + * Base class for {@link DiversifyingNearestChildrenKnnCollector} tests. Provides shared helpers for + * building the block-join parent {@link BitSet} used across test subclasses. */ abstract class DiversifyingChildrenKnnCollectorTestCase extends LuceneTestCase { diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index bb170ec3d79e..934ff8f593de 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -46,8 +46,8 @@ /** * Tests for sibling expansion in {@link DiversifyingNearestChildrenKnnCollector} ({@link - * org.apache.lucene.util.hnsw.DocSiblingExpansion} contract) and end-to-end correctness via - * {@link DiversifyingChildrenFloatKnnVectorQuery}. + * org.apache.lucene.util.hnsw.DocSiblingExpansion} contract) and end-to-end correctness via {@link + * DiversifyingChildrenFloatKnnVectorQuery}. */ public class TestDiversifyingChildrenKnnSiblingExpansion extends DiversifyingChildrenKnnCollectorTestCase { @@ -141,10 +141,10 @@ public void testDocIdToOrdinal_correctMapping() throws IOException { assertEquals(0, c.docIdToOrdinal(0)); assertEquals(1, c.docIdToOrdinal(1)); - assertEquals(-1, c.docIdToOrdinal(2)); // parent → no vector + assertEquals(-1, c.docIdToOrdinal(2)); // parent → no vector assertEquals(2, c.docIdToOrdinal(3)); assertEquals(3, c.docIdToOrdinal(4)); - assertEquals(-1, c.docIdToOrdinal(5)); // parent → no vector + assertEquals(-1, c.docIdToOrdinal(5)); // parent → no vector assertEquals(-1, c.docIdToOrdinal(9999)); // beyond array bounds } @@ -163,8 +163,8 @@ public void testDocIdToOrdinal_nullMapping_alwaysMinusOne() throws IOException { // --------------------------------------------------------------------------- /** - * C3 is the entry point (score 0.85) but its siblings C4 (0.95) - * and C5 (0.90) are expanded immediately. The parent must be represented by C4, not C3. + * C3 is the entry point (score 0.85) but its siblings C4 (0.95) and C5 (0.90) are expanded + * immediately. The parent must be represented by C4, not C3. */ public void testBestSiblingReplacesFirstFoundChild() throws IOException { // 1 parent, 3 children: [C0=0, C1=1, C2=2 | P0=3] @@ -181,7 +181,8 @@ public void testBestSiblingReplacesFirstFoundChild() throws IOException { TopDocs td = c.topDocs(); assertEquals(1, td.scoreDocs.length); - assertEquals("best sibling must win over first-found child", 0.95f, td.scoreDocs[0].score, 1e-5f); + assertEquals( + "best sibling must win over first-found child", 0.95f, td.scoreDocs[0].score, 1e-5f); assertEquals("best child doc id must be C1", 1, td.scoreDocs[0].doc); } @@ -201,8 +202,8 @@ public void testBestSiblingReplacesWorseChildWhenHeapFull() throws IOException { c.collect(1, 0.95f); // P1: C2 found first (0.60), C3 is better sibling (0.65) - c.collect(3, 0.60f); // trigger child → P1 enters heap, now FULL (size=2=k) - c.collect(4, 0.65f); // better sibling → heap full but P1 already present, updates entry + c.collect(3, 0.60f); // trigger child → P1 enters heap, now FULL (size=2=k) + c.collect(4, 0.65f); // better sibling → heap full but P1 already present, updates entry TopDocs td = c.topDocs(); assertEquals(2, td.scoreDocs.length); @@ -217,8 +218,8 @@ public void testBestSiblingReplacesWorseChildWhenHeapFull() throws IOException { // --------------------------------------------------------------------------- /** - * Siblings whose ordinal is already in visitedOrds must be filtered out to prevent - * double-scoring when the sibling was independently discovered via normal graph traversal. + * Siblings whose ordinal is already in visitedOrds must be filtered out to prevent double-scoring + * when the sibling was independently discovered via normal graph traversal. */ public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOException { // 1 parent, 3 children: [C0=doc0, C1=doc1, C2=doc2 | P0=doc3] @@ -227,7 +228,8 @@ public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOExceptio int[] docToOrd = buildDocToOrd(1, 3); int[] ordToDoc = {0, 1, 2}; OrdinalTranslatedKnnCollector collector = - new OrdinalTranslatedKnnCollector(makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); + new OrdinalTranslatedKnnCollector( + makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); // Mark C1 (ordinal 1) as already visited by normal graph traversal FixedBitSet visited = new FixedBitSet(4); @@ -236,12 +238,12 @@ public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOExceptio // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 and C2 int[] result = collector.pendingSiblingOrdinals(0, visited); assertNotNull(result); - assertArrayEquals("C1 must be filtered (visited); only C2 remains", new int[]{2}, result); + assertArrayEquals("C1 must be filtered (visited); only C2 remains", new int[] {2}, result); } /** - * Siblings with no vector in this field (docIdToOrdinal returns -1) must be skipped because - * they have no node in the HNSW graph and cannot be scored. + * Siblings with no vector in this field (docIdToOrdinal returns -1) must be skipped because they + * have no node in the HNSW graph and cannot be scored. */ public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOException { // 1 parent, 3 children but C1 has no vector: @@ -251,14 +253,15 @@ public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOExceptio int[] docToOrd = {0, -1, 1, -1}; int[] ordToDoc = {0, 2}; OrdinalTranslatedKnnCollector collector = - new OrdinalTranslatedKnnCollector(makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); + new OrdinalTranslatedKnnCollector( + makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); FixedBitSet visited = new FixedBitSet(4); // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 (sparse) and C2 int[] result = collector.pendingSiblingOrdinals(0, visited); assertNotNull(result); - assertArrayEquals("C1 (no vector) must be filtered; only C2 remains", new int[]{1}, result); + assertArrayEquals("C1 (no vector) must be filtered; only C2 remains", new int[] {1}, result); } // --------------------------------------------------------------------------- @@ -272,8 +275,8 @@ public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOExceptio * Builds a block-join float-vector index. Parent p has numChildren children; child c of parent p * gets vector: v[i] = (p * numChildren + c + i) * 0.1, then normalised for COSINE/DOT_PRODUCT. */ - private Directory buildIndex( - int numParents, int numChildren, VectorSimilarityFunction sim) throws IOException { + private Directory buildIndex(int numParents, int numChildren, VectorSimilarityFunction sim) + throws IOException { Directory dir = newDirectory(); try (IndexWriter w = new IndexWriter( @@ -282,7 +285,8 @@ dir, new IndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { List block = new ArrayList<>(); for (int c = 0; c < numChildren; c++) { float[] vec = childVector(p, c, numChildren); - if (sim == VectorSimilarityFunction.COSINE || sim == VectorSimilarityFunction.DOT_PRODUCT) { + if (sim == VectorSimilarityFunction.COSINE + || sim == VectorSimilarityFunction.DOT_PRODUCT) { normalise(vec); } Document child = new Document(); @@ -319,11 +323,7 @@ private static void normalise(float[] vec) { /** Computes brute-force top-k scores: max similarity per parent, sorted descending. */ private static float[] bruteForceTopK( - float[] query, - int numParents, - int numChildren, - VectorSimilarityFunction sim, - int k) { + float[] query, int numParents, int numChildren, VectorSimilarityFunction sim, int k) { float[] parentBest = new float[numParents]; Arrays.fill(parentBest, Float.NEGATIVE_INFINITY); for (int p = 0; p < numParents; p++) { @@ -404,8 +404,8 @@ public void testSiblingExpansionSingleChildParents() throws Exception { } /** - * The query is close to one parent's children; sibling expansion must find the best child of each discovered parent - * rather than whichever child the graph traversal happens to reach first. + * The query is close to one parent's children; sibling expansion must find the best child of each + * discovered parent rather than whichever child the graph traversal happens to reach first. */ public void testSiblingExpansion_bestChildPerParentFound() throws Exception { int numParents = 3, numChildren = 3, k = 2; @@ -425,15 +425,22 @@ public void testSiblingExpansion_bestChildPerParentFound() throws Exception { assertEquals(k, results.scoreDocs.length); for (ScoreDoc sd : results.scoreDocs) { - int parentIdx = reader.storedFields().document(sd.doc) - .getField("parent").numericValue().intValue(); + int parentIdx = + reader.storedFields().document(sd.doc).getField("parent").numericValue().intValue(); // verify no other child of the same parent scores higher than the returned one for (int c = 0; c < numChildren; c++) { float[] vec = childVector(parentIdx, c, numChildren); float cScore = sim.compare(query, vec); assertTrue( - "parent " + parentIdx + " has a better child (score " + cScore - + ") than returned doc " + sd.doc + " (score " + sd.score + ")", + "parent " + + parentIdx + + " has a better child (score " + + cScore + + ") than returned doc " + + sd.doc + + " (score " + + sd.score + + ")", cScore <= sd.score + 1e-4f); } } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java b/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java index 212990bfac88..fe1b008b5f4d 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestToParentJoinKnnResults.java @@ -104,7 +104,8 @@ public void testRandomInsertionsWithOverflow() throws IOException { BitSet parentBitSet = BitSet.of(new IntArrayDocIdSetIterator(parents, parents.length), nextParent + 1); DiversifyingNearestChildrenKnnCollector results = - new DiversifyingNearestChildrenKnnCollector(20, Integer.MAX_VALUE, null, parentBitSet, null); + new DiversifyingNearestChildrenKnnCollector( + 20, Integer.MAX_VALUE, null, parentBitSet, null); for (int i = 0; i < children.size(); i++) { results.collect(children.get(i), childrenScores.get(i)); } From 5dbf0f7dbe836f6a37424240c276281484032199 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Tue, 5 May 2026 16:02:57 +0200 Subject: [PATCH 10/23] Gradlew check --- .../lucene/util/hnsw/OrdinalTranslatedKnnCollector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 4814e799826b..308a3b9a0ded 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -17,10 +17,10 @@ package org.apache.lucene.util.hnsw; -import java.util.Arrays; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSet; /** @@ -81,6 +81,6 @@ public int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds) { siblingOrdinals[count++] = sibOrd; } } - return count > 0 ? Arrays.copyOf(siblingOrdinals, count) : null; + return count > 0 ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) : null; } } From 0448dcf94941930c457135e1ac4eacb04531776e Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Wed, 6 May 2026 15:42:37 +0200 Subject: [PATCH 11/23] Small names refactoring --- .../util/hnsw/AbstractHnswGraphSearcher.java | 34 ++++++++++++------- .../util/hnsw/ChildrenSiblingExpansion.java | 4 +-- .../lucene/util/hnsw/HnswGraphSearcher.java | 5 +-- .../hnsw/OrdinalTranslatedKnnCollector.java | 8 ++--- ...versifyingNearestChildrenKnnCollector.java | 9 +++-- ...versifyingChildrenKnnSiblingExpansion.java | 10 +++--- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index dc6ae9560706..6ab42b9808fe 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -106,10 +106,18 @@ protected static void scoreEntryPoints( // Fetch siblings BEFORE collect() so the parent is not yet in the heap int[] siblings = null; int numSiblingsToVisit = 0; + // This check is needed since this method is also called by the GraphBuilderKnnCollector if (results instanceof ChildrenSiblingExpansion expander) { - siblings = expander.pendingSiblingOrdinals(ep, visited); + siblings = expander.getSiblingOrdinals(ep, visited); if (siblings != null) { // how many siblings are actually scored to avoid exceeding the visit budget. + // controls the early termination condition. early terminates the search if we reach + // visitLimit nodes + // if this visit limit is high we just navigate the graph until we do not have any node + // with a score higher than the ones already collected + // Current values it could assume: + // - No filter → Integer.MAX_VALUE (no constraint tighter than the full segment) + // - With filter → cardinality (no constraint tighter than the full accepted set) numSiblingsToVisit = (int) Math.min(siblings.length, results.visitLimit() - results.visitedCount()); // Only mark as visited the siblings we will actually score; the rest remain @@ -121,7 +129,7 @@ protected static void scoreEntryPoints( results.collect(ep, score); if (numSiblingsToVisit > 0) { siblingScores = - scoreSiblings( + scoreHnswNodes( results, scorer, candidates, @@ -138,36 +146,36 @@ protected static void scoreEntryPoints( * Scores and collects siblings, adding competitive ones to the candidate queue. Reuses and * returns the siblingScores buffer, reallocating only if too small. */ - protected static float[] scoreSiblings( + protected static float[] scoreHnswNodes( KnnCollector results, RandomVectorScorer scorer, NeighborQueue candidates, Bits acceptOrds, - int[] siblings, + int[] hnswNodes, int numSiblings, - float[] siblingScores) + float[] scores) throws IOException { // If siblingScores not defined yet or too small to collect scores a new one is created // Otherwise we reuse the old one that will be overridden in bulkScore with new scores - if (siblingScores == null || siblingScores.length < numSiblings) { - siblingScores = new float[numSiblings]; + if (scores == null || scores.length < numSiblings) { + scores = new float[numSiblings]; } - float maxScore = scorer.bulkScore(siblings, siblingScores, numSiblings); + float maxScore = scorer.bulkScore(hnswNodes, scores, numSiblings); results.incVisitedCount(numSiblings); if (maxScore > results.minCompetitiveSimilarity()) { float minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); for (int j = 0; j < numSiblings; j++) { - float sibScore = siblingScores[j]; + float sibScore = scores[j]; // We avoid adding to candidates a sibling with a bad score if (sibScore >= minSimilarity) { - candidates.add(siblings[j], sibScore); - if (acceptOrds == null || acceptOrds.get(siblings[j])) { - results.collect(siblings[j], sibScore); + candidates.add(hnswNodes[j], sibScore); + if (acceptOrds == null || acceptOrds.get(hnswNodes[j])) { + results.collect(hnswNodes[j], sibScore); minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); } } } } - return siblingScores; + return scores; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java index ee9823cee925..763e3503a5d2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java @@ -25,7 +25,7 @@ * belonging to a newly-discovered parent, all siblings of that parent are immediately scored and * collected without requiring full graph traversal. * - *

    The graph searcher calls {@link #pendingSiblingOrdinals} before invoking {@link + *

    The graph searcher calls {@link #getSiblingOrdinals} before invoking {@link * org.apache.lucene.search.KnnCollector#collect} for the triggering node, so that the collector can * safely inspect its internal state before the parent is registered. * @@ -46,5 +46,5 @@ public interface ChildrenSiblingExpansion { * marked here — the caller does that * @return ordinals of sibling nodes to score next, or {@code null} if none */ - int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds); + int[] getSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds); } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index e64cbed10a0c..1686b581a1bb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -351,8 +351,9 @@ void searchLevel( // Fetch siblings BEFORE collect() so the parent is not yet in the heap int[] siblings = null; int numSiblingsToVisit = 0; + // This check is needed since this method is also called by the GraphBuilderKnnCollector if (results instanceof ChildrenSiblingExpansion expander) { - siblings = expander.pendingSiblingOrdinals(node, visited); + siblings = expander.getSiblingOrdinals(node, visited); if (siblings != null) { numSiblingsToVisit = (int) @@ -375,7 +376,7 @@ void searchLevel( if (numSiblingsToVisit > 0) { float prevMinSim = results.minCompetitiveSimilarity(); siblingScores = - scoreSiblings( + scoreHnswNodes( results, scorer, candidates, diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 308a3b9a0ded..def69b981df4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -57,11 +57,11 @@ public TopDocs topDocs() { } @Override - public int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds) { + public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { if (!(collector instanceof DocSiblingExpansion docExpander)) { return null; } - int docId = vectorOrdinalToDocId.apply(collectedOrdinal); + int docId = vectorOrdinalToDocId.apply(hnswNode); int[] siblingDocIds = docExpander.findSiblingDocIds(docId); if (siblingDocIds == null) { return null; @@ -76,8 +76,8 @@ public int[] pendingSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds) { // Such a doc has no node in the HNSW graph and can't be scored, so it must be skipped. // If a sibling was reached via normal graph traversal before sibling expansion triggered, // re-adding it would - // cause it to be scored twice. !visitedOrds.get(sibOrd) filters those out. - if (sibOrd >= 0 && !visitedOrds.get(sibOrd)) { + // cause it to be scored twice. !visitedHnswNodes.get(sibOrd) filters those out. + if (sibOrd >= 0 && !visitedHnswNodes.get(sibOrd)) { siblingOrdinals[count++] = sibOrd; } } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index 1cd2d8f32ea7..eba2181676ad 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -113,7 +113,7 @@ public int numCollected() { return heap.size(); } - @Override + // SOLO PER VISITARE LA FAMIGLIA ALTRIMENTI NON LO CHIAMIAMO PROPRIO public int[] findSiblingDocIds(int childDocId) { int parent = parentBitSet.nextSetBit(childDocId); // Skip if this parent's entry is already in the heap @@ -124,12 +124,12 @@ public int[] findSiblingDocIds(int childDocId) { int prevParent = parent > 0 ? parentBitSet.prevSetBit(parent - 1) : -1; int from = prevParent + 1; // Children of parent are all docIds in [from, parent); exclude childDocId itself - int capacity = parent - from - 1; + int siblingsSize = parent - from - 1; // One child case - if (capacity == 0) { + if (siblingsSize == 0) { return null; } - int[] siblings = new int[capacity]; + int[] siblings = new int[siblingsSize]; int idx = 0; for (int docId = from; docId < parent; docId++) { if (docId != childDocId) { @@ -139,7 +139,6 @@ public int[] findSiblingDocIds(int childDocId) { return idx > 0 ? siblings : null; } - @Override public int docIdToOrdinal(int docId) { // Conditions explanation: // docToOrd == null — buildDocToOrd returns null when the segment has no vector values at all diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index 934ff8f593de..94e2f085503d 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -53,7 +53,7 @@ public class TestDiversifyingChildrenKnnSiblingExpansion extends DiversifyingChildrenKnnCollectorTestCase { // --------------------------------------------------------------------------- - // Unit tests: DocSiblingExpansion — findSiblingDocIds + // Unit tests: DiversifyingNearestChildrenKnnCollector — findSiblingDocIds // --------------------------------------------------------------------------- public void testFindSiblingDocIds_returnsAllSiblings() throws IOException { @@ -129,7 +129,7 @@ public void testFindSiblingDocIds_excludesTriggerChild() throws IOException { } // --------------------------------------------------------------------------- - // Unit tests: DocSiblingExpansion — docIdToOrdinal + // Unit tests: DiversifyingNearestChildrenKnnCollector — docIdToOrdinal // --------------------------------------------------------------------------- public void testDocIdToOrdinal_correctMapping() throws IOException { @@ -214,7 +214,7 @@ public void testBestSiblingReplacesWorseChildWhenHeapFull() throws IOException { } // --------------------------------------------------------------------------- - // Unit tests: OrdinalTranslatedKnnCollector — pendingSiblingOrdinals + // Unit tests: OrdinalTranslatedKnnCollector — getSiblingOrdinals // --------------------------------------------------------------------------- /** @@ -236,7 +236,7 @@ public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOExceptio visited.set(1); // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 and C2 - int[] result = collector.pendingSiblingOrdinals(0, visited); + int[] result = collector.getSiblingOrdinals(0, visited); assertNotNull(result); assertArrayEquals("C1 must be filtered (visited); only C2 remains", new int[] {2}, result); } @@ -259,7 +259,7 @@ public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOExceptio FixedBitSet visited = new FixedBitSet(4); // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 (sparse) and C2 - int[] result = collector.pendingSiblingOrdinals(0, visited); + int[] result = collector.getSiblingOrdinals(0, visited); assertNotNull(result); assertArrayEquals("C1 (no vector) must be filtered; only C2 remains", new int[] {1}, result); } From ebc30a08ed30e04fc9dd26325bca7530217ef2a3 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Wed, 6 May 2026 15:48:44 +0200 Subject: [PATCH 12/23] Removed ChildrenSiblingExpansion interface --- .../util/hnsw/AbstractHnswGraphSearcher.java | 4 +- .../util/hnsw/ChildrenSiblingExpansion.java | 50 ------------------- .../lucene/util/hnsw/DocSiblingExpansion.java | 2 +- .../lucene/util/hnsw/HnswGraphSearcher.java | 4 +- .../hnsw/OrdinalTranslatedKnnCollector.java | 8 ++- 5 files changed, 8 insertions(+), 60 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index 6ab42b9808fe..e95101efc054 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -107,8 +107,8 @@ protected static void scoreEntryPoints( int[] siblings = null; int numSiblingsToVisit = 0; // This check is needed since this method is also called by the GraphBuilderKnnCollector - if (results instanceof ChildrenSiblingExpansion expander) { - siblings = expander.getSiblingOrdinals(ep, visited); + if (results instanceof OrdinalTranslatedKnnCollector collector) { + siblings = collector.getSiblingOrdinals(ep, visited); if (siblings != null) { // how many siblings are actually scored to avoid exceeding the visit budget. // controls the early termination condition. early terminates the search if we reach diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java deleted file mode 100644 index 763e3503a5d2..000000000000 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/ChildrenSiblingExpansion.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.util.hnsw; - -import org.apache.lucene.util.BitSet; - -/** - * Implemented by {@link org.apache.lucene.search.KnnCollector} instances that support dynamic - * sibling expansion during HNSW graph search. When the graph searcher collects a child node - * belonging to a newly-discovered parent, all siblings of that parent are immediately scored and - * collected without requiring full graph traversal. - * - *

    The graph searcher calls {@link #getSiblingOrdinals} before invoking {@link - * org.apache.lucene.search.KnnCollector#collect} for the triggering node, so that the collector can - * safely inspect its internal state before the parent is registered. - * - * @lucene.experimental - */ -public interface ChildrenSiblingExpansion { - - /** - * Called by the HNSW graph searcher before {@link org.apache.lucene.search.KnnCollector#collect} - * is invoked for {@code collectedOrdinal}. The implementation inspects the node's parent and - * returns the ordinals of unvisited siblings that should be scored immediately. - * - *

    The caller is responsible for marking the returned ordinals as visited in {@code - * visitedOrds} and for scoring and collecting them. - * - * @param collectedOrdinal the vector ordinal about to be collected - * @param visitedOrds the graph searcher's current visited bitset; returned siblings are not yet - * marked here — the caller does that - * @return ordinals of sibling nodes to score next, or {@code null} if none - */ - int[] getSiblingOrdinals(int collectedOrdinal, BitSet visitedOrds); -} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java index 6832948f7386..53b9fe9447b9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -18,7 +18,7 @@ package org.apache.lucene.util.hnsw; /** - * Doc-ID-level companion to {@link ChildrenSiblingExpansion}. Implemented by collectors that + * Implemented by collectors that * understand parent-child document relationships and can enumerate sibling document ids for a given * child document id, as well as translate document ids back to vector ordinals. * diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 1686b581a1bb..a0f5cf8525e8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -352,8 +352,8 @@ void searchLevel( int[] siblings = null; int numSiblingsToVisit = 0; // This check is needed since this method is also called by the GraphBuilderKnnCollector - if (results instanceof ChildrenSiblingExpansion expander) { - siblings = expander.getSiblingOrdinals(node, visited); + if (results instanceof OrdinalTranslatedKnnCollector collector) { + siblings = collector.getSiblingOrdinals(node, visited); if (siblings != null) { numSiblingsToVisit = (int) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index def69b981df4..1ee335dbbd9e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -25,11 +25,10 @@ /** * Wraps a provided KnnCollector object, translating the provided vectorId ordinal to a documentId. - * This wrapper implements {@link ChildrenSiblingExpansion}; sibling expansion is active only when - * the wrapped collector also implements {@link DocSiblingExpansion}. + * Sibling expansion is active only when the wrapped collector also implements {@link + * DocSiblingExpansion}. */ -public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator - implements ChildrenSiblingExpansion { +public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator { private final IntToIntFunction vectorOrdinalToDocId; @@ -56,7 +55,6 @@ public TopDocs topDocs() { td.scoreDocs); } - @Override public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { if (!(collector instanceof DocSiblingExpansion docExpander)) { return null; From ace844f82fdc4bc0cf430e98d0ee79181f414e61 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Wed, 6 May 2026 16:10:35 +0200 Subject: [PATCH 13/23] Analyzed if DocSiblingExpansion could be removed -> NO --- .../lucene/util/hnsw/AbstractHnswGraphSearcher.java | 3 ++- .../apache/lucene/util/hnsw/DocSiblingExpansion.java | 8 ++++++++ .../apache/lucene/util/hnsw/HnswGraphSearcher.java | 6 ++++-- .../util/hnsw/OrdinalTranslatedKnnCollector.java | 12 +++++++----- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index e95101efc054..527929969220 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -107,7 +107,8 @@ protected static void scoreEntryPoints( int[] siblings = null; int numSiblingsToVisit = 0; // This check is needed since this method is also called by the GraphBuilderKnnCollector - if (results instanceof OrdinalTranslatedKnnCollector collector) { + if (results instanceof OrdinalTranslatedKnnCollector collector + && collector.supportsSiblingExpansion()) { siblings = collector.getSiblingOrdinals(ep, visited); if (siblings != null) { // how many siblings are actually scored to avoid exceeding the visit budget. diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java index 53b9fe9447b9..bc4226f21c28 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -27,6 +27,14 @@ * * @lucene.experimental */ +// The interface cannot be removed. It exists for a module-boundary reason. +// DocSiblingExpansion is in lucene/core, while DiversifyingNearestChildrenKnnCollector is in lucene/join. +// The dependency is one-way: join depends on core, never the reverse. So OrdinalTranslatedKnnCollector (in core) +// has no way to reference DiversifyingNearestChildrenKnnCollector directly. +// +// The interface is the bridge — it lets core call findSiblingDocIds and docIdToOrdinal on the collector without +// creating a circular dependency. Removing it would require either moving OrdinalTranslatedKnnCollector into +// join (bigger refactor) or adding a core → join dependency (illegal in this architecture). public interface DocSiblingExpansion { /** diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index a0f5cf8525e8..cb005361397c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -351,8 +351,10 @@ void searchLevel( // Fetch siblings BEFORE collect() so the parent is not yet in the heap int[] siblings = null; int numSiblingsToVisit = 0; - // This check is needed since this method is also called by the GraphBuilderKnnCollector - if (results instanceof OrdinalTranslatedKnnCollector collector) { + // The first check is needed since this method is also called by the GraphBuilderKnnCollector + // The second check is needed since we could have a TopKnnCollector at this point + if (results instanceof OrdinalTranslatedKnnCollector collector + && collector.supportsSiblingExpansion()) { siblings = collector.getSiblingOrdinals(node, visited); if (siblings != null) { numSiblingsToVisit = diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 1ee335dbbd9e..e20bc343046e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -55,12 +55,14 @@ public TopDocs topDocs() { td.scoreDocs); } + public boolean supportsSiblingExpansion() { + return collector instanceof DocSiblingExpansion; + } + public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { - if (!(collector instanceof DocSiblingExpansion docExpander)) { - return null; - } + DocSiblingExpansion docExpanderCollector = (DocSiblingExpansion) collector; int docId = vectorOrdinalToDocId.apply(hnswNode); - int[] siblingDocIds = docExpander.findSiblingDocIds(docId); + int[] siblingDocIds = docExpanderCollector.findSiblingDocIds(docId); if (siblingDocIds == null) { return null; } @@ -69,7 +71,7 @@ public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { // so this variable is necessary. int count = 0; for (int sibDocId : siblingDocIds) { - int sibOrd = docExpander.docIdToOrdinal(sibDocId); + int sibOrd = docExpanderCollector.docIdToOrdinal(sibDocId); // sibOrd = -1 when a document has no vector for this field. // Such a doc has no node in the HNSW graph and can't be scored, so it must be skipped. // If a sibling was reached via normal graph traversal before sibling expansion triggered, From 315462217e48d6e70aad5ef5eaeec5d454cde989 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Wed, 6 May 2026 17:54:24 +0200 Subject: [PATCH 14/23] Removed return null from getSiblingOrdinals. Empty array given. --- .../util/hnsw/AbstractHnswGraphSearcher.java | 37 ++++++++-------- .../lucene/util/hnsw/DocSiblingExpansion.java | 5 +-- .../lucene/util/hnsw/HnswGraphSearcher.java | 24 +++++------ .../hnsw/OrdinalTranslatedKnnCollector.java | 23 ++++++---- ...versifyingNearestChildrenKnnCollector.java | 6 +-- ...versifyingChildrenKnnSiblingExpansion.java | 43 ++++++++++++------- 6 files changed, 76 insertions(+), 62 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index 527929969220..bb8c36a2e3b6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -106,24 +106,25 @@ protected static void scoreEntryPoints( // Fetch siblings BEFORE collect() so the parent is not yet in the heap int[] siblings = null; int numSiblingsToVisit = 0; - // This check is needed since this method is also called by the GraphBuilderKnnCollector - if (results instanceof OrdinalTranslatedKnnCollector collector - && collector.supportsSiblingExpansion()) { - siblings = collector.getSiblingOrdinals(ep, visited); - if (siblings != null) { - // how many siblings are actually scored to avoid exceeding the visit budget. - // controls the early termination condition. early terminates the search if we reach - // visitLimit nodes - // if this visit limit is high we just navigate the graph until we do not have any node - // with a score higher than the ones already collected - // Current values it could assume: - // - No filter → Integer.MAX_VALUE (no constraint tighter than the full segment) - // - With filter → cardinality (no constraint tighter than the full accepted set) - numSiblingsToVisit = - (int) Math.min(siblings.length, results.visitLimit() - results.visitedCount()); - // Only mark as visited the siblings we will actually score; the rest remain - // reachable via normal graph traversal so a better child can still be found - for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector + if (results instanceof OrdinalTranslatedKnnCollector collector) { + if (collector.isSiblingExpansionCollector()) { + siblings = collector.getSiblingOrdinals(ep, visited); + if (siblings.length > 0) { + // how many siblings are actually scored to avoid exceeding the visit budget. + // controls the early termination condition. early terminates the search if we reach + // visitLimit nodes + // if this visit limit is high we just navigate the graph until we do not have any node + // with a score higher than the ones already collected + // Current values it could assume: + // - No filter → Integer.MAX_VALUE (no constraint tighter than the full segment) + // - With filter → cardinality (no constraint tighter than the full accepted set) + numSiblingsToVisit = + (int) Math.min(siblings.length, results.visitLimit() - results.visitedCount()); + // Only mark as visited the siblings we will actually score; the rest remain + // reachable via normal graph traversal so a better child can still be found + for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + } } } // Collect the ep node here so after we have a correctly updated minCompetitiveSimilarity diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java index bc4226f21c28..4942f3a006eb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -38,9 +38,8 @@ public interface DocSiblingExpansion { /** - * Returns the doc ids of all siblings of {@code childDocId} whose parent has not yet been added - * to the result heap, or {@code null} if the parent was already processed or there are no other - * siblings. + * Returns the doc ids of all siblings of {@code childDocId}, or {@code null} if there are no + * other siblings. * * @param childDocId the document id of the child that is about to be collected * @return sibling doc ids, or {@code null} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index cb005361397c..5a516e435930 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -351,18 +351,18 @@ void searchLevel( // Fetch siblings BEFORE collect() so the parent is not yet in the heap int[] siblings = null; int numSiblingsToVisit = 0; - // The first check is needed since this method is also called by the GraphBuilderKnnCollector - // The second check is needed since we could have a TopKnnCollector at this point - if (results instanceof OrdinalTranslatedKnnCollector collector - && collector.supportsSiblingExpansion()) { - siblings = collector.getSiblingOrdinals(node, visited); - if (siblings != null) { - numSiblingsToVisit = - (int) - Math.min(siblings.length, results.visitLimit() - results.visitedCount()); - // Only mark as visited the siblings we will actually score; the rest remain - // reachable via normal graph traversal so a better child can still be found - for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector + if (results instanceof OrdinalTranslatedKnnCollector collector) { + if (collector.isSiblingExpansionCollector()) { + siblings = collector.getSiblingOrdinals(node, visited); + if (siblings.length > 0) { + numSiblingsToVisit = + (int) + Math.min(siblings.length, results.visitLimit() - results.visitedCount()); + // Only mark as visited the siblings we will actually score; the rest remain + // reachable via normal graph traversal so a better child can still be found + for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + } } } if (results.collect(node, score)) { diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index e20bc343046e..18c04a40a210 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -55,32 +55,39 @@ public TopDocs topDocs() { td.scoreDocs); } - public boolean supportsSiblingExpansion() { + // Needed since we could have a TopKnnCollector at this point + public boolean isSiblingExpansionCollector() { return collector instanceof DocSiblingExpansion; } public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { DocSiblingExpansion docExpanderCollector = (DocSiblingExpansion) collector; int docId = vectorOrdinalToDocId.apply(hnswNode); + // We do not check if parent is in heap since if we already seed A + // - A was found and scored (parent added), but we reach the budget limit and were not able to score B, + // we then found B through graph traversal. We want to visit it even if we already visited A. + // We do not visit siblings in score order. int[] siblingDocIds = docExpanderCollector.findSiblingDocIds(docId); if (siblingDocIds == null) { - return null; + return new int[0]; } int[] siblingOrdinals = new int[siblingDocIds.length]; // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 // so this variable is necessary. + // due to visited result we could have a partial array to return int count = 0; for (int sibDocId : siblingDocIds) { int sibOrd = docExpanderCollector.docIdToOrdinal(sibDocId); - // sibOrd = -1 when a document has no vector for this field. - // Such a doc has no node in the HNSW graph and can't be scored, so it must be skipped. - // If a sibling was reached via normal graph traversal before sibling expansion triggered, - // re-adding it would - // cause it to be scored twice. !visitedHnswNodes.get(sibOrd) filters those out. + // sibOrd = -1: sibling has no vector for this field → no HNSW node, cannot be scored. + // + // !visitedHnswNodes: sibling was already reached via normal graph traversal: + // - B was scored via traversal but with score < minAcceptedSimilarity, so collect() + // was never called and the parent is not in the heap. Expansion from a different + // child A finds B already visited. if (sibOrd >= 0 && !visitedHnswNodes.get(sibOrd)) { siblingOrdinals[count++] = sibOrd; } } - return count > 0 ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) : null; + return count > 0 ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) : new int[0]; } } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index eba2181676ad..46fe7b6975ef 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -113,13 +113,9 @@ public int numCollected() { return heap.size(); } - // SOLO PER VISITARE LA FAMIGLIA ALTRIMENTI NON LO CHIAMIAMO PROPRIO + @Override public int[] findSiblingDocIds(int childDocId) { int parent = parentBitSet.nextSetBit(childDocId); - // Skip if this parent's entry is already in the heap - if (heap.containsParent(parent)) { - return null; - } // Find siblings range(prevParent, parent). If parent is 0 there are not prevParents so -1 int prevParent = parent > 0 ? parentBitSet.prevSetBit(parent - 1) : -1; int from = prevParent + 1; diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index 94e2f085503d..5a8afcb4c1a8 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -84,20 +84,6 @@ public void testFindSiblingDocIds_returnsAllSiblings() throws IOException { assertArrayEquals(new int[] {3}, s4); } - public void testFindSiblingDocIds_parentAlreadyInHeap_returnsNull() throws IOException { - // 1 parents, 2 children → block: [0,1|2] - int numParents = 1, childrenPerParent = 2; - BitSet parents = parentBitSet(numParents, childrenPerParent); - int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); - DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); - - // Collect child 0 → parent 2 enters the heap - c.collect(0, 0.8f); - - // Now asking for siblings of child 1 (same parent 2) must return null - assertNull(c.findSiblingDocIds(1)); - } - public void testFindSiblingDocIds_singleChildParent_returnsNull() throws IOException { // 1 parents, 1 child each → blocks: [0|1] int numParents = 1, childrenPerParent = 1; @@ -217,6 +203,31 @@ public void testBestSiblingReplacesWorseChildWhenHeapFull() throws IOException { // Unit tests: OrdinalTranslatedKnnCollector — getSiblingOrdinals // --------------------------------------------------------------------------- + /** + * Even when the parent is already in the heap (from a prior expansion), getSiblingOrdinals must + * still return unvisited siblings. This allows a budget-truncated first expansion to be continued + * when additional children of the same parent are later discovered via normal graph traversal. + */ + public void testGetSiblingOrdinals_parentAlreadyInHeap_returnsUnvisitedSiblings() + throws IOException { + // 1 parent, 2 children: [C0=doc0, C1=doc1 | P0=doc2]; ordToDoc=[0,1] + BitSet parents = parentBitSet(1, 2); + int[] docToOrd = buildDocToOrd(1, 2); + int[] ordToDoc = {0, 1}; + OrdinalTranslatedKnnCollector collector = + new OrdinalTranslatedKnnCollector( + makeCollector(10, parents, docToOrd), ord -> ordToDoc[ord]); + + // Collect C0 → parent 2 enters the heap + collector.collect(0, 0.8f); + + // C1 (ordinal 1) is not yet visited; must still be returned as an expansion candidate + FixedBitSet visited = new FixedBitSet(3); + int[] result = collector.getSiblingOrdinals(0, visited); + assertNotEquals("unvisited sibling must be returned even when parent is already in heap", 0, result.length); + assertArrayEquals(new int[] {1}, result); + } + /** * Siblings whose ordinal is already in visitedOrds must be filtered out to prevent double-scoring * when the sibling was independently discovered via normal graph traversal. @@ -237,7 +248,7 @@ public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOExceptio // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 and C2 int[] result = collector.getSiblingOrdinals(0, visited); - assertNotNull(result); + assertNotEquals(0, result.length); assertArrayEquals("C1 must be filtered (visited); only C2 remains", new int[] {2}, result); } @@ -260,7 +271,7 @@ public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOExceptio // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 (sparse) and C2 int[] result = collector.getSiblingOrdinals(0, visited); - assertNotNull(result); + assertNotEquals(0, result.length); assertArrayEquals("C1 (no vector) must be filtered; only C2 remains", new int[] {1}, result); } From 286d7ab94a2986c12f8209a88723ce0ed66f99d1 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 7 May 2026 14:42:40 +0200 Subject: [PATCH 15/23] Returned empty array instead of null in findiSiblingDocIds --- .../org/apache/lucene/util/hnsw/DocSiblingExpansion.java | 4 ++-- .../lucene/util/hnsw/OrdinalTranslatedKnnCollector.java | 3 --- .../search/join/DiversifyingNearestChildrenKnnCollector.java | 5 +++-- .../join/TestDiversifyingChildrenKnnSiblingExpansion.java | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java index 4942f3a006eb..41112e35ccb1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -38,11 +38,11 @@ public interface DocSiblingExpansion { /** - * Returns the doc ids of all siblings of {@code childDocId}, or {@code null} if there are no + * Returns the doc ids of all siblings of {@code childDocId}, or an empty array if there are no * other siblings. * * @param childDocId the document id of the child that is about to be collected - * @return sibling doc ids, or {@code null} + * @return sibling doc ids, or an empty array */ int[] findSiblingDocIds(int childDocId); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 18c04a40a210..9f3324a567f3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -68,9 +68,6 @@ public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { // we then found B through graph traversal. We want to visit it even if we already visited A. // We do not visit siblings in score order. int[] siblingDocIds = docExpanderCollector.findSiblingDocIds(docId); - if (siblingDocIds == null) { - return new int[0]; - } int[] siblingOrdinals = new int[siblingDocIds.length]; // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 // so this variable is necessary. diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index 46fe7b6975ef..f4e98cea8d8a 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -120,10 +120,11 @@ public int[] findSiblingDocIds(int childDocId) { int prevParent = parent > 0 ? parentBitSet.prevSetBit(parent - 1) : -1; int from = prevParent + 1; // Children of parent are all docIds in [from, parent); exclude childDocId itself + // The childDoc itself is scored in collect() int siblingsSize = parent - from - 1; // One child case if (siblingsSize == 0) { - return null; + return new int[0]; } int[] siblings = new int[siblingsSize]; int idx = 0; @@ -132,7 +133,7 @@ public int[] findSiblingDocIds(int childDocId) { siblings[idx++] = docId; } } - return idx > 0 ? siblings : null; + return idx > 0 ? siblings : new int[0]; } public int docIdToOrdinal(int docId) { diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index 5a8afcb4c1a8..0f9c3467d162 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -91,7 +91,7 @@ public void testFindSiblingDocIds_singleChildParent_returnsNull() throws IOExcep int[] docToOrd = buildDocToOrd(numParents, childrenPerParent); DiversifyingNearestChildrenKnnCollector c = makeCollector(10, parents, docToOrd); - assertNull("sole child has no siblings", c.findSiblingDocIds(0)); + assertArrayEquals("sole child has no siblings", new int[0], c.findSiblingDocIds(0)); } /** From c9481f76cab94bd429d2b3ed05fb2e87ad12ea58 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 7 May 2026 15:26:38 +0200 Subject: [PATCH 16/23] Checked if numHnswNodes needed in scoreHnswNodes and changed some variables names --- .../util/hnsw/AbstractHnswGraphSearcher.java | 39 ++++++++++--------- .../lucene/util/hnsw/HnswGraphSearcher.java | 20 +++++----- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index bb8c36a2e3b6..e9c2896f916a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -103,15 +103,15 @@ protected static void scoreEntryPoints( visited.set(ep); candidates.add(ep, score); if (acceptOrds == null || acceptOrds.get(ep)) { - // Fetch siblings BEFORE collect() so the parent is not yet in the heap - int[] siblings = null; + // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap + int[] siblingsOrd = null; int numSiblingsToVisit = 0; // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { - siblings = collector.getSiblingOrdinals(ep, visited); - if (siblings.length > 0) { - // how many siblings are actually scored to avoid exceeding the visit budget. + siblingsOrd = collector.getSiblingOrdinals(ep, visited); + if (siblingsOrd.length > 0) { + // how many siblingsOrd are actually scored to avoid exceeding the visit budget. // controls the early termination condition. early terminates the search if we reach // visitLimit nodes // if this visit limit is high we just navigate the graph until we do not have any node @@ -120,23 +120,24 @@ protected static void scoreEntryPoints( // - No filter → Integer.MAX_VALUE (no constraint tighter than the full segment) // - With filter → cardinality (no constraint tighter than the full accepted set) numSiblingsToVisit = - (int) Math.min(siblings.length, results.visitLimit() - results.visitedCount()); - // Only mark as visited the siblings we will actually score; the rest remain + (int) Math.min(siblingsOrd.length, results.visitLimit() - results.visitedCount()); + // Only mark as visited the siblingsOrd we will actually score; the rest remain // reachable via normal graph traversal so a better child can still be found - for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblingsOrd[s]); } } } // Collect the ep node here so after we have a correctly updated minCompetitiveSimilarity results.collect(ep, score); if (numSiblingsToVisit > 0) { + // IF NUMSIBLINGSTOVISIT IS NOT LIMITED WE CAN REMOVE THE VARIABLE AND USE SIBLINGS LENGTH siblingScores = scoreHnswNodes( results, scorer, candidates, acceptOrds, - siblings, + siblingsOrd, numSiblingsToVisit, siblingScores); } @@ -153,26 +154,26 @@ protected static float[] scoreHnswNodes( RandomVectorScorer scorer, NeighborQueue candidates, Bits acceptOrds, - int[] hnswNodes, - int numSiblings, + int[] hnswNodesOrd, + int numHnswNodes, float[] scores) throws IOException { // If siblingScores not defined yet or too small to collect scores a new one is created // Otherwise we reuse the old one that will be overridden in bulkScore with new scores - if (scores == null || scores.length < numSiblings) { - scores = new float[numSiblings]; + if (scores == null || scores.length < numHnswNodes) { + scores = new float[numHnswNodes]; } - float maxScore = scorer.bulkScore(hnswNodes, scores, numSiblings); - results.incVisitedCount(numSiblings); + float maxScore = scorer.bulkScore(hnswNodesOrd, scores, numHnswNodes); + results.incVisitedCount(numHnswNodes); if (maxScore > results.minCompetitiveSimilarity()) { float minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); - for (int j = 0; j < numSiblings; j++) { + for (int j = 0; j < numHnswNodes; j++) { float sibScore = scores[j]; // We avoid adding to candidates a sibling with a bad score if (sibScore >= minSimilarity) { - candidates.add(hnswNodes[j], sibScore); - if (acceptOrds == null || acceptOrds.get(hnswNodes[j])) { - results.collect(hnswNodes[j], sibScore); + candidates.add(hnswNodesOrd[j], sibScore); + if (acceptOrds == null || acceptOrds.get(hnswNodesOrd[j])) { + results.collect(hnswNodesOrd[j], sibScore); minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 5a516e435930..57b04e26a2df 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -348,20 +348,21 @@ void searchLevel( if (score >= minAcceptedSimilarity) { candidates.add(node, score); if (acceptOrds == null || acceptOrds.get(node)) { - // Fetch siblings BEFORE collect() so the parent is not yet in the heap - int[] siblings = null; + // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap + int[] siblingsOrd = null; int numSiblingsToVisit = 0; // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { - siblings = collector.getSiblingOrdinals(node, visited); - if (siblings.length > 0) { + siblingsOrd = collector.getSiblingOrdinals(node, visited); + if (siblingsOrd.length > 0) { + // TO DISCUSS IF NECESSARY numSiblingsToVisit = (int) - Math.min(siblings.length, results.visitLimit() - results.visitedCount()); - // Only mark as visited the siblings we will actually score; the rest remain + Math.min(siblingsOrd.length, results.visitLimit() - results.visitedCount()); + // Only mark as visited the siblingsOrd we will actually score; the rest remain // reachable via normal graph traversal so a better child can still be found - for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblings[s]); + for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblingsOrd[s]); } } } @@ -374,16 +375,17 @@ void searchLevel( shouldExploreMinSim = true; } } - // Score and collect all siblings of the newly-discovered parent + // Score and collect all siblingsOrd of the newly-discovered parent if (numSiblingsToVisit > 0) { float prevMinSim = results.minCompetitiveSimilarity(); + // IF NUMSIBLING IS NOT LIMITED WE CAN REMOVE THE VARIABLE AND USE SIBLINGS LENGTH siblingScores = scoreHnswNodes( results, scorer, candidates, acceptOrds, - siblings, + siblingsOrd, numSiblingsToVisit, siblingScores); if (results.minCompetitiveSimilarity() > prevMinSim) { From 70ea159db6cf16282e6a4235c2733c6b7a506e81 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 7 May 2026 16:07:50 +0200 Subject: [PATCH 17/23] Return empty array instead of null in buildDocToOrd --- ...versifyingNearestChildrenKnnCollector.java | 4 +- ...ingNearestChildrenKnnCollectorManager.java | 37 ++++++++++++++----- ...versifyingChildrenKnnSiblingExpansion.java | 6 +-- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index f4e98cea8d8a..50ca21074fa7 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -49,7 +49,7 @@ class DiversifyingNearestChildrenKnnCollector extends AbstractKnnCollector * @param searchStrategy The search strategy to use * @param parentBitSet The leaf parent bitset * @param docToOrd precomputed array mapping docId to vector ordinal; {@code -1} entries mean the - * document has no vector. May be {@code null} to disable sibling expansion. + * document has no vector. An empty array disables sibling expansion. */ public DiversifyingNearestChildrenKnnCollector( int k, @@ -148,7 +148,7 @@ public int docIdToOrdinal(int docId) { // - future refactoring that changes the array size // - misuse when constructing the collector manually (as in tests, e.g. the // docIdToOrdinal(9999) test case) - if (docToOrd == null || docId >= docToOrd.length) { + if (docId >= docToOrd.length) { return -1; } return docToOrd[docId]; diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java index 9914b285e0ac..a908b203663a 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java @@ -36,10 +36,6 @@ */ public class DiversifyingNearestChildrenKnnCollectorManager implements KnnCollectorManager { - // Sentinel stored in the cache to represent "this segment has no vectors for the field", - // because ConcurrentHashMap does not allow null values. - private static final int[] NO_VECTORS = new int[0]; - // Cache keyed by (segment core cache key → (field name → docToOrd array)). // Entries are evicted automatically when the segment is closed via addClosedListener. private static final ConcurrentHashMap> @@ -123,30 +119,51 @@ private int[] getCachedDocToOrd(LeafReaderContext context) throws IOException { } int[] cached = fieldMap.get(field); if (cached != null) { - return cached == NO_VECTORS ? null : cached; + return cached; } int[] built = buildDocToOrd(context); - int[] stored = built != null ? built : NO_VECTORS; - int[] race = fieldMap.putIfAbsent(field, stored); - return race != null ? (race == NO_VECTORS ? null : race) : built; + int[] race = fieldMap.putIfAbsent(field, built); + return race != null ? race : built; } /** * Builds a docId-to-ordinal array for the given leaf, mapping each docId to its vector ordinal. * - *

    Returns {@code null} if the field has no vector values in this segment at all — sibling + *

    Returns an empty array if the field has no vector values in this segment at all — sibling * expansion will be disabled for this leaf. * *

    Otherwise returns an array of size {@code maxDoc} where each entry is the vector ordinal for * that docId, or {@code -1} if that specific document has no vector (sparse indexing). */ + // Step 1 — Index time: ordinals are assigned by insertion order + // In Lucene99FlatVectorsWriter.addValue(), each vector is appended to an ArrayList (vectors.add(copy)) and its docId + // is recorded in docsWithField. Documents are always added in ascending docId order (enforced by assert docID > + // lastDocID). So ordinal 0 = first doc with a vector, ordinal 1 = second, etc. + // + // Step 2 — Index time: ordToDoc mapping is written in the same order + // In OrdToDocDISIReaderConfiguration.writeStoredMeta(), docsWithField.iterator() is iterated in ascending docId + // order, and each docId is written to DirectMonotonicWriter sequentially. The i-th value written becomes ordinal + // i — so the ordToDoc array stored on disk is exactly: ordToDoc[0] = first docId, ordToDoc[1] = second docId, ... + // + // Step 3 — Query time: buildDocToOrd inverts the same ordering + // getFloatVectorValues(field).iterator() also yields docIds in ascending order (same set, same order as + // docsWithField at index time). The loop: + // while (iter.nextDoc() != NO_MORE_DOCS) { + // docToOrd[iter.docID()] = ord++; + // } + // assigns ord = 0 to the first docId, ord = 1 to the second — exactly inverting the ordToDoc array written at step 2. + // + // Step 4 — The HNSW graph uses these same ordinals as node IDs + // HNSW nodes are identified by their ordinal (the position in the flat vector store). So when the searcher returns + // ordinal k as a graph node, docToOrd[docId] = k being correct means docIdToOrdinal will find the right HNSW node + // for any sibling docId. private int[] buildDocToOrd(LeafReaderContext context) throws IOException { FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); // fi = null if the field doesn't exist in this segment at all. // fi.getVectorDimension() = 0 if the field exist in the segment but was not indexed as a vector // field. if (fi == null || fi.getVectorDimension() == 0) { - return null; + return new int[0]; } DocIdSetIterator iter = switch (fi.getVectorEncoding()) { diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index 0f9c3467d162..9eaf98924191 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -134,11 +134,11 @@ public void testDocIdToOrdinal_correctMapping() throws IOException { assertEquals(-1, c.docIdToOrdinal(9999)); // beyond array bounds } - public void testDocIdToOrdinal_nullMapping_alwaysMinusOne() throws IOException { - // Collector created without a docToOrd array (sibling expansion disabled) + public void testDocIdToOrdinal_emptyMapping_alwaysMinusOne() throws IOException { + // Collector created with an empty docToOrd array (sibling expansion disabled) BitSet parents = parentBitSet(2, 2); DiversifyingNearestChildrenKnnCollector c = - new DiversifyingNearestChildrenKnnCollector(5, Integer.MAX_VALUE, null, parents, null); + new DiversifyingNearestChildrenKnnCollector(5, Integer.MAX_VALUE, null, parents, new int[0]); assertEquals(-1, c.docIdToOrdinal(0)); assertEquals(-1, c.docIdToOrdinal(1)); From 0d6154d1dff6dd1dde18c4d1bff098fcd10a34ad Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 7 May 2026 16:11:37 +0200 Subject: [PATCH 18/23] Why check on field info is needed --- .../DiversifyingNearestChildrenKnnCollectorManager.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java index a908b203663a..9b0a19cc13f4 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java @@ -162,6 +162,15 @@ private int[] buildDocToOrd(LeafReaderContext context) throws IOException { // fi = null if the field doesn't exist in this segment at all. // fi.getVectorDimension() = 0 if the field exist in the segment but was not indexed as a vector // field. + // + // 1. approximateSearch calls newCollector before searchNearestVectors + // 2. newCollector calls buildDocToOrd + // 3. Only then searchNearestVectors is called + // So buildDocToOrd is called for every segment before Lucene gets a chance to short-circuit on no vectors. + // The guard in buildDocToOrd is genuinely needed — without it, calling getFloatVectorValues on a segment with + // no vectors would return null (as we saw in CodecReader) and .iterator() would NPE. + // The alternative would be to guard in newCollector itself before calling getCachedDocToOrd, but the current + // placement is fine. if (fi == null || fi.getVectorDimension() == 0) { return new int[0]; } From 97828c6d139b823e5e393a5867f95abf032770ff Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 7 May 2026 16:46:04 +0200 Subject: [PATCH 19/23] Addressed Ben comments about reusing scratch space --- .../lucene/util/hnsw/AbstractHnswGraphSearcher.java | 4 ++-- .../apache/lucene/util/hnsw/HnswGraphSearcher.java | 4 ++-- .../util/hnsw/OrdinalTranslatedKnnCollector.java | 13 +++++++++---- ...TestDiversifyingChildrenKnnSiblingExpansion.java | 6 +++--- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index e9c2896f916a..0823eb43b12d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -97,6 +97,7 @@ protected static void scoreEntryPoints( scorer.bulkScore(eps, scores, eps.length); results.incVisitedCount(eps.length); float[] siblingScores = null; + int[] siblingsOrd = new int[0]; for (int i = 0; i < eps.length; i++) { float score = scores[i]; int ep = eps[i]; @@ -104,12 +105,11 @@ protected static void scoreEntryPoints( candidates.add(ep, score); if (acceptOrds == null || acceptOrds.get(ep)) { // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap - int[] siblingsOrd = null; int numSiblingsToVisit = 0; // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { - siblingsOrd = collector.getSiblingOrdinals(ep, visited); + siblingsOrd = collector.getSiblingOrdinals(ep, visited, siblingsOrd); if (siblingsOrd.length > 0) { // how many siblingsOrd are actually scored to avoid exceeding the visit budget. // controls the early termination condition. early terminates the search if we reach diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 57b04e26a2df..40f005fa1956 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -45,6 +45,7 @@ public class HnswGraphSearcher extends AbstractHnswGraphSearcher { protected int[] bulkNodes = null; protected float[] bulkScores = null; protected float[] siblingScores = null; + protected int[] siblingsOrd = new int[0]; /** * HNSW search is roughly logarithmic. This doesn't take maxConn into account, but it is a pretty @@ -349,12 +350,11 @@ void searchLevel( candidates.add(node, score); if (acceptOrds == null || acceptOrds.get(node)) { // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap - int[] siblingsOrd = null; int numSiblingsToVisit = 0; // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { - siblingsOrd = collector.getSiblingOrdinals(node, visited); + siblingsOrd = collector.getSiblingOrdinals(node, visited, siblingsOrd); if (siblingsOrd.length > 0) { // TO DISCUSS IF NECESSARY numSiblingsToVisit = diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index 9f3324a567f3..a2b70044bc95 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -60,7 +60,7 @@ public boolean isSiblingExpansionCollector() { return collector instanceof DocSiblingExpansion; } - public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { + public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes, int[] siblingOrdinals) { DocSiblingExpansion docExpanderCollector = (DocSiblingExpansion) collector; int docId = vectorOrdinalToDocId.apply(hnswNode); // We do not check if parent is in heap since if we already seed A @@ -68,7 +68,9 @@ public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { // we then found B through graph traversal. We want to visit it even if we already visited A. // We do not visit siblings in score order. int[] siblingDocIds = docExpanderCollector.findSiblingDocIds(docId); - int[] siblingOrdinals = new int[siblingDocIds.length]; + if (siblingOrdinals.length < siblingDocIds.length) { + siblingOrdinals = new int[siblingDocIds.length]; + } // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 // so this variable is necessary. // due to visited result we could have a partial array to return @@ -82,9 +84,12 @@ public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes) { // was never called and the parent is not in the heap. Expansion from a different // child A finds B already visited. if (sibOrd >= 0 && !visitedHnswNodes.get(sibOrd)) { - siblingOrdinals[count++] = sibOrd; + siblingOrdinals[count++] = sibOrd; } } - return count > 0 ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) : new int[0]; + if (count == 0) { + return new int[0]; + } + return count < siblingOrdinals.length ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) : siblingOrdinals; } } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index 9eaf98924191..012124453052 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -223,7 +223,7 @@ public void testGetSiblingOrdinals_parentAlreadyInHeap_returnsUnvisitedSiblings( // C1 (ordinal 1) is not yet visited; must still be returned as an expansion candidate FixedBitSet visited = new FixedBitSet(3); - int[] result = collector.getSiblingOrdinals(0, visited); + int[] result = collector.getSiblingOrdinals(0, visited, new int[0]); assertNotEquals("unvisited sibling must be returned even when parent is already in heap", 0, result.length); assertArrayEquals(new int[] {1}, result); } @@ -247,7 +247,7 @@ public void testPendingSiblingOrdinals_filtersAlreadyVisited() throws IOExceptio visited.set(1); // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 and C2 - int[] result = collector.getSiblingOrdinals(0, visited); + int[] result = collector.getSiblingOrdinals(0, visited, new int[0]); assertNotEquals(0, result.length); assertArrayEquals("C1 must be filtered (visited); only C2 remains", new int[] {2}, result); } @@ -270,7 +270,7 @@ public void testPendingSiblingOrdinals_filtersSparseSiblings() throws IOExceptio FixedBitSet visited = new FixedBitSet(4); // Trigger on C0 (ordinal 0 → docId 0); siblings are C1 (sparse) and C2 - int[] result = collector.getSiblingOrdinals(0, visited); + int[] result = collector.getSiblingOrdinals(0, visited, new int[0]); assertNotEquals(0, result.length); assertArrayEquals("C1 (no vector) must be filtered; only C2 remains", new int[] {1}, result); } From 51ddd9faeb476fe8d8f3dda093840339046f6503 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Thu, 7 May 2026 16:58:36 +0200 Subject: [PATCH 20/23] Gradlew tidy --- .../util/hnsw/AbstractHnswGraphSearcher.java | 6 ++-- .../lucene/util/hnsw/DocSiblingExpansion.java | 18 ++++++---- .../lucene/util/hnsw/HnswGraphSearcher.java | 6 ++-- .../hnsw/OrdinalTranslatedKnnCollector.java | 11 +++--- ...versifyingNearestChildrenKnnCollector.java | 6 +--- ...ingNearestChildrenKnnCollectorManager.java | 36 ++++++++++++------- ...versifyingChildrenKnnSiblingExpansion.java | 6 ++-- 7 files changed, 55 insertions(+), 34 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index 0823eb43b12d..f7ebbf4321b4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -106,7 +106,8 @@ protected static void scoreEntryPoints( if (acceptOrds == null || acceptOrds.get(ep)) { // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap int numSiblingsToVisit = 0; - // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector + // The instanceof check is needed: this method is also called with a + // GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { siblingsOrd = collector.getSiblingOrdinals(ep, visited, siblingsOrd); @@ -114,7 +115,8 @@ protected static void scoreEntryPoints( // how many siblingsOrd are actually scored to avoid exceeding the visit budget. // controls the early termination condition. early terminates the search if we reach // visitLimit nodes - // if this visit limit is high we just navigate the graph until we do not have any node + // if this visit limit is high we just navigate the graph until we do not have any + // node // with a score higher than the ones already collected // Current values it could assume: // - No filter → Integer.MAX_VALUE (no constraint tighter than the full segment) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java index 41112e35ccb1..d0200f58b06c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/DocSiblingExpansion.java @@ -18,9 +18,9 @@ package org.apache.lucene.util.hnsw; /** - * Implemented by collectors that - * understand parent-child document relationships and can enumerate sibling document ids for a given - * child document id, as well as translate document ids back to vector ordinals. + * Implemented by collectors that understand parent-child document relationships and can enumerate + * sibling document ids for a given child document id, as well as translate document ids back to + * vector ordinals. * *

    This interface is used internally by {@link OrdinalTranslatedKnnCollector} to bridge between * the ordinal space of the HNSW graph and the document-id space of the collector. @@ -28,12 +28,16 @@ * @lucene.experimental */ // The interface cannot be removed. It exists for a module-boundary reason. -// DocSiblingExpansion is in lucene/core, while DiversifyingNearestChildrenKnnCollector is in lucene/join. -// The dependency is one-way: join depends on core, never the reverse. So OrdinalTranslatedKnnCollector (in core) +// DocSiblingExpansion is in lucene/core, while DiversifyingNearestChildrenKnnCollector is in +// lucene/join. +// The dependency is one-way: join depends on core, never the reverse. So +// OrdinalTranslatedKnnCollector (in core) // has no way to reference DiversifyingNearestChildrenKnnCollector directly. // -// The interface is the bridge — it lets core call findSiblingDocIds and docIdToOrdinal on the collector without -// creating a circular dependency. Removing it would require either moving OrdinalTranslatedKnnCollector into +// The interface is the bridge — it lets core call findSiblingDocIds and docIdToOrdinal on the +// collector without +// creating a circular dependency. Removing it would require either moving +// OrdinalTranslatedKnnCollector into // join (bigger refactor) or adding a core → join dependency (illegal in this architecture). public interface DocSiblingExpansion { diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 40f005fa1956..62a518a5cc54 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -351,7 +351,8 @@ void searchLevel( if (acceptOrds == null || acceptOrds.get(node)) { // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap int numSiblingsToVisit = 0; - // The instanceof check is needed: this method is also called with a GraphBuilderKnnCollector + // The instanceof check is needed: this method is also called with a + // GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { siblingsOrd = collector.getSiblingOrdinals(node, visited, siblingsOrd); @@ -359,7 +360,8 @@ void searchLevel( // TO DISCUSS IF NECESSARY numSiblingsToVisit = (int) - Math.min(siblingsOrd.length, results.visitLimit() - results.visitedCount()); + Math.min( + siblingsOrd.length, results.visitLimit() - results.visitedCount()); // Only mark as visited the siblingsOrd we will actually score; the rest remain // reachable via normal graph traversal so a better child can still be found for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblingsOrd[s]); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index a2b70044bc95..d0afb7e0da93 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -64,12 +64,13 @@ public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes, int[] sib DocSiblingExpansion docExpanderCollector = (DocSiblingExpansion) collector; int docId = vectorOrdinalToDocId.apply(hnswNode); // We do not check if parent is in heap since if we already seed A - // - A was found and scored (parent added), but we reach the budget limit and were not able to score B, + // - A was found and scored (parent added), but we reach the budget limit and were not able to + // score B, // we then found B through graph traversal. We want to visit it even if we already visited A. // We do not visit siblings in score order. int[] siblingDocIds = docExpanderCollector.findSiblingDocIds(docId); if (siblingOrdinals.length < siblingDocIds.length) { - siblingOrdinals = new int[siblingDocIds.length]; + siblingOrdinals = new int[siblingDocIds.length]; } // siblingOrdinals is pre-allocated to siblingDocIds.length and Java initializes int arrays to 0 // so this variable is necessary. @@ -84,12 +85,14 @@ public int[] getSiblingOrdinals(int hnswNode, BitSet visitedHnswNodes, int[] sib // was never called and the parent is not in the heap. Expansion from a different // child A finds B already visited. if (sibOrd >= 0 && !visitedHnswNodes.get(sibOrd)) { - siblingOrdinals[count++] = sibOrd; + siblingOrdinals[count++] = sibOrd; } } if (count == 0) { return new int[0]; } - return count < siblingOrdinals.length ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) : siblingOrdinals; + return count < siblingOrdinals.length + ? ArrayUtil.copyOfSubArray(siblingOrdinals, 0, count) + : siblingOrdinals; } } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java index 50ca21074fa7..404f8cd156c9 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java @@ -136,6 +136,7 @@ public int[] findSiblingDocIds(int childDocId) { return idx > 0 ? siblings : new int[0]; } + @Override public int docIdToOrdinal(int docId) { // Conditions explanation: // docToOrd == null — buildDocToOrd returns null when the segment has no vector values at all @@ -288,11 +289,6 @@ public final int size() { return size; } - /** Returns {@code true} if a child of the given parent is already in the heap. */ - public boolean containsParent(int parentDocId) { - return nodeIdHeapIndex.containsKey(parentDocId); - } - /** * Returns true if the element at heap position {@code k} should be evicted before the element * at position {@code j} (i.e. a is "less than" b in the min-heap ordering). Lower score loses diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java index 9b0a19cc13f4..845a0267c5e2 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java @@ -136,26 +136,35 @@ private int[] getCachedDocToOrd(LeafReaderContext context) throws IOException { * that docId, or {@code -1} if that specific document has no vector (sparse indexing). */ // Step 1 — Index time: ordinals are assigned by insertion order - // In Lucene99FlatVectorsWriter.addValue(), each vector is appended to an ArrayList (vectors.add(copy)) and its docId - // is recorded in docsWithField. Documents are always added in ascending docId order (enforced by assert docID > + // In Lucene99FlatVectorsWriter.addValue(), each vector is appended to an ArrayList + // (vectors.add(copy)) and its docId + // is recorded in docsWithField. Documents are always added in ascending docId order (enforced by + // assert docID > // lastDocID). So ordinal 0 = first doc with a vector, ordinal 1 = second, etc. // // Step 2 — Index time: ordToDoc mapping is written in the same order - // In OrdToDocDISIReaderConfiguration.writeStoredMeta(), docsWithField.iterator() is iterated in ascending docId - // order, and each docId is written to DirectMonotonicWriter sequentially. The i-th value written becomes ordinal - // i — so the ordToDoc array stored on disk is exactly: ordToDoc[0] = first docId, ordToDoc[1] = second docId, ... + // In OrdToDocDISIReaderConfiguration.writeStoredMeta(), docsWithField.iterator() is iterated in + // ascending docId + // order, and each docId is written to DirectMonotonicWriter sequentially. The i-th value written + // becomes ordinal + // i — so the ordToDoc array stored on disk is exactly: ordToDoc[0] = first docId, ordToDoc[1] = + // second docId, ... // // Step 3 — Query time: buildDocToOrd inverts the same ordering - // getFloatVectorValues(field).iterator() also yields docIds in ascending order (same set, same order as + // getFloatVectorValues(field).iterator() also yields docIds in ascending order (same set, same + // order as // docsWithField at index time). The loop: // while (iter.nextDoc() != NO_MORE_DOCS) { // docToOrd[iter.docID()] = ord++; // } - // assigns ord = 0 to the first docId, ord = 1 to the second — exactly inverting the ordToDoc array written at step 2. + // assigns ord = 0 to the first docId, ord = 1 to the second — exactly inverting the ordToDoc + // array written at step 2. // // Step 4 — The HNSW graph uses these same ordinals as node IDs - // HNSW nodes are identified by their ordinal (the position in the flat vector store). So when the searcher returns - // ordinal k as a graph node, docToOrd[docId] = k being correct means docIdToOrdinal will find the right HNSW node + // HNSW nodes are identified by their ordinal (the position in the flat vector store). So when the + // searcher returns + // ordinal k as a graph node, docToOrd[docId] = k being correct means docIdToOrdinal will find the + // right HNSW node // for any sibling docId. private int[] buildDocToOrd(LeafReaderContext context) throws IOException { FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); @@ -166,10 +175,13 @@ private int[] buildDocToOrd(LeafReaderContext context) throws IOException { // 1. approximateSearch calls newCollector before searchNearestVectors // 2. newCollector calls buildDocToOrd // 3. Only then searchNearestVectors is called - // So buildDocToOrd is called for every segment before Lucene gets a chance to short-circuit on no vectors. - // The guard in buildDocToOrd is genuinely needed — without it, calling getFloatVectorValues on a segment with + // So buildDocToOrd is called for every segment before Lucene gets a chance to short-circuit on + // no vectors. + // The guard in buildDocToOrd is genuinely needed — without it, calling getFloatVectorValues on + // a segment with // no vectors would return null (as we saw in CodecReader) and .iterator() would NPE. - // The alternative would be to guard in newCollector itself before calling getCachedDocToOrd, but the current + // The alternative would be to guard in newCollector itself before calling getCachedDocToOrd, + // but the current // placement is fine. if (fi == null || fi.getVectorDimension() == 0) { return new int[0]; diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java index 012124453052..2202384bb114 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestDiversifyingChildrenKnnSiblingExpansion.java @@ -138,7 +138,8 @@ public void testDocIdToOrdinal_emptyMapping_alwaysMinusOne() throws IOException // Collector created with an empty docToOrd array (sibling expansion disabled) BitSet parents = parentBitSet(2, 2); DiversifyingNearestChildrenKnnCollector c = - new DiversifyingNearestChildrenKnnCollector(5, Integer.MAX_VALUE, null, parents, new int[0]); + new DiversifyingNearestChildrenKnnCollector( + 5, Integer.MAX_VALUE, null, parents, new int[0]); assertEquals(-1, c.docIdToOrdinal(0)); assertEquals(-1, c.docIdToOrdinal(1)); @@ -224,7 +225,8 @@ public void testGetSiblingOrdinals_parentAlreadyInHeap_returnsUnvisitedSiblings( // C1 (ordinal 1) is not yet visited; must still be returned as an expansion candidate FixedBitSet visited = new FixedBitSet(3); int[] result = collector.getSiblingOrdinals(0, visited, new int[0]); - assertNotEquals("unvisited sibling must be returned even when parent is already in heap", 0, result.length); + assertNotEquals( + "unvisited sibling must be returned even when parent is already in heap", 0, result.length); assertArrayEquals(new int[] {1}, result); } From 9b3aae05bdd1d65e77ffaba49bef08f0fddcb783 Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Fri, 8 May 2026 10:00:42 +0200 Subject: [PATCH 21/23] Changes.txt --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 331aab15c639..6dd5a7c3e91f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -146,6 +146,8 @@ Optimizations * GITHUB#15597, GITHUB#15777: Reduce memory usage of NeighborArray (Viliam Durina) +* GITHUB#16034: Sibling expansion as an optimization for KNN vector search over parent-child document relationships. (Anna Ruggero, Alessandro Benedetti) + Bug Fixes --------------------- * GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero From 765cd13842a49383cf5a2a4b392782627cae43cc Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Mon, 11 May 2026 12:31:50 +0200 Subject: [PATCH 22/23] Removed unuseful part related to numSiblingsToVisit --- .../util/hnsw/AbstractHnswGraphSearcher.java | 34 +++++-------------- .../lucene/util/hnsw/HnswGraphSearcher.java | 16 ++------- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index f7ebbf4321b4..813b5845a122 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -105,34 +105,17 @@ protected static void scoreEntryPoints( candidates.add(ep, score); if (acceptOrds == null || acceptOrds.get(ep)) { // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap - int numSiblingsToVisit = 0; // The instanceof check is needed: this method is also called with a // GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { siblingsOrd = collector.getSiblingOrdinals(ep, visited, siblingsOrd); - if (siblingsOrd.length > 0) { - // how many siblingsOrd are actually scored to avoid exceeding the visit budget. - // controls the early termination condition. early terminates the search if we reach - // visitLimit nodes - // if this visit limit is high we just navigate the graph until we do not have any - // node - // with a score higher than the ones already collected - // Current values it could assume: - // - No filter → Integer.MAX_VALUE (no constraint tighter than the full segment) - // - With filter → cardinality (no constraint tighter than the full accepted set) - numSiblingsToVisit = - (int) Math.min(siblingsOrd.length, results.visitLimit() - results.visitedCount()); - // Only mark as visited the siblingsOrd we will actually score; the rest remain - // reachable via normal graph traversal so a better child can still be found - for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblingsOrd[s]); - } + for (int ord : siblingsOrd) visited.set(ord); } } // Collect the ep node here so after we have a correctly updated minCompetitiveSimilarity results.collect(ep, score); - if (numSiblingsToVisit > 0) { - // IF NUMSIBLINGSTOVISIT IS NOT LIMITED WE CAN REMOVE THE VARIABLE AND USE SIBLINGS LENGTH + if (siblingsOrd.length > 0) { siblingScores = scoreHnswNodes( results, @@ -140,7 +123,6 @@ protected static void scoreEntryPoints( candidates, acceptOrds, siblingsOrd, - numSiblingsToVisit, siblingScores); } } @@ -157,19 +139,19 @@ protected static float[] scoreHnswNodes( NeighborQueue candidates, Bits acceptOrds, int[] hnswNodesOrd, - int numHnswNodes, float[] scores) throws IOException { + int numNodes = hnswNodesOrd.length; // If siblingScores not defined yet or too small to collect scores a new one is created // Otherwise we reuse the old one that will be overridden in bulkScore with new scores - if (scores == null || scores.length < numHnswNodes) { - scores = new float[numHnswNodes]; + if (scores == null || scores.length < numNodes) { + scores = new float[numNodes]; } - float maxScore = scorer.bulkScore(hnswNodesOrd, scores, numHnswNodes); - results.incVisitedCount(numHnswNodes); + float maxScore = scorer.bulkScore(hnswNodesOrd, scores, numNodes); + results.incVisitedCount(numNodes); if (maxScore > results.minCompetitiveSimilarity()) { float minSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); - for (int j = 0; j < numHnswNodes; j++) { + for (int j = 0; j < numNodes; j++) { float sibScore = scores[j]; // We avoid adding to candidates a sibling with a bad score if (sibScore >= minSimilarity) { diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 62a518a5cc54..786fc8ee53da 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -350,22 +350,12 @@ void searchLevel( candidates.add(node, score); if (acceptOrds == null || acceptOrds.get(node)) { // Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap - int numSiblingsToVisit = 0; // The instanceof check is needed: this method is also called with a // GraphBuilderKnnCollector if (results instanceof OrdinalTranslatedKnnCollector collector) { if (collector.isSiblingExpansionCollector()) { siblingsOrd = collector.getSiblingOrdinals(node, visited, siblingsOrd); - if (siblingsOrd.length > 0) { - // TO DISCUSS IF NECESSARY - numSiblingsToVisit = - (int) - Math.min( - siblingsOrd.length, results.visitLimit() - results.visitedCount()); - // Only mark as visited the siblingsOrd we will actually score; the rest remain - // reachable via normal graph traversal so a better child can still be found - for (int s = 0; s < numSiblingsToVisit; s++) visited.set(siblingsOrd[s]); - } + for (int ord : siblingsOrd) visited.set(ord); } } if (results.collect(node, score)) { @@ -378,9 +368,8 @@ void searchLevel( } } // Score and collect all siblingsOrd of the newly-discovered parent - if (numSiblingsToVisit > 0) { + if (siblingsOrd.length > 0) { float prevMinSim = results.minCompetitiveSimilarity(); - // IF NUMSIBLING IS NOT LIMITED WE CAN REMOVE THE VARIABLE AND USE SIBLINGS LENGTH siblingScores = scoreHnswNodes( results, @@ -388,7 +377,6 @@ void searchLevel( candidates, acceptOrds, siblingsOrd, - numSiblingsToVisit, siblingScores); if (results.minCompetitiveSimilarity() > prevMinSim) { minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity()); From e7604c4fd094027d5e82be8628ebc7b14c0a598a Mon Sep 17 00:00:00 2001 From: Anna Ruggero Date: Mon, 11 May 2026 12:34:04 +0200 Subject: [PATCH 23/23] Updated comment --- .../org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java index 813b5845a122..db9ef79b2163 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/AbstractHnswGraphSearcher.java @@ -142,7 +142,7 @@ protected static float[] scoreHnswNodes( float[] scores) throws IOException { int numNodes = hnswNodesOrd.length; - // If siblingScores not defined yet or too small to collect scores a new one is created + // If scores not defined yet or too small to collect scores a new one is created // Otherwise we reuse the old one that will be overridden in bulkScore with new scores if (scores == null || scores.length < numNodes) { scores = new float[numNodes];