Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
2190c70
Removed unuseful checks
aruggero Apr 23, 2026
66390c8
Removed unused variable
aruggero Apr 23, 2026
da4ec3d
Moved downHeap from int to void since the returned value is never used
aruggero Apr 23, 2026
2f33476
Implemented siblings exploration
aruggero May 4, 2026
b1f6b86
Added benchmark for measuring speed
aruggero May 4, 2026
9ab84ad
Added cache and update benchmark for testing different scenarios
aruggero May 4, 2026
07421fb
Updated tests for triggering early termination
aruggero May 4, 2026
8fae7fb
Added comments to tests
aruggero May 5, 2026
32d406f
Gradlew tidy
aruggero May 5, 2026
5dbf0f7
Gradlew check
aruggero May 5, 2026
2f97ab8
Merge remote-tracking branch 'upStream/main' into diversifyingImprove…
aruggero May 6, 2026
0448dcf
Small names refactoring
aruggero May 6, 2026
ebc30a0
Removed ChildrenSiblingExpansion interface
aruggero May 6, 2026
ace844f
Analyzed if DocSiblingExpansion could be removed -> NO
aruggero May 6, 2026
3154622
Removed return null from getSiblingOrdinals. Empty array given.
aruggero May 6, 2026
286d7ab
Returned empty array instead of null in findiSiblingDocIds
aruggero May 7, 2026
c9481f7
Checked if numHnswNodes needed in scoreHnswNodes and changed some var…
aruggero May 7, 2026
70ea159
Return empty array instead of null in buildDocToOrd
aruggero May 7, 2026
0d6154d
Why check on field info is needed
aruggero May 7, 2026
97828c6
Addressed Ben comments about reusing scratch space
aruggero May 7, 2026
51ddd9f
Gradlew tidy
aruggero May 7, 2026
9b3aae0
Changes.txt
aruggero May 8, 2026
765cd13
Removed unuseful part related to numSiblingsToVisit
aruggero May 11, 2026
e7604c4
Updated comment
aruggero May 11, 2026
d623ebe
Merge remote-tracking branch 'upStream/main' into diversifyingImprove…
aruggero May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ Optimizations

* GITHUB#15597, GITHUB#15777: Reduce memory usage of NeighborArray (Viliam Durina)

* GITHUB#16034: Add sibling expansion as an optimization for KNN vector search over parent-child document relationships. (Anna Ruggero, Alessandro Benedetti)

Bug Fixes
---------------------
* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
Expand Down
1 change: 1 addition & 0 deletions lucene/benchmark-jmh/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ description = 'Lucene JMH micro-benchmarking module'
dependencies {
moduleImplementation project(':lucene:core')
moduleImplementation project(':lucene:expressions')
moduleImplementation project(':lucene:join')
moduleImplementation project(':lucene:sandbox')
moduleTestImplementation project(':lucene:test-framework')

Expand Down
1 change: 1 addition & 0 deletions lucene/benchmark-jmh/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
requires jdk.unsupported;
requires org.apache.lucene.core;
requires org.apache.lucene.expressions;
requires org.apache.lucene.join;
requires org.apache.lucene.sandbox;
requires commons.math3;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery;
import org.apache.lucene.search.join.QueryBitSetProducer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

/**
 * Benchmarks end-to-end latency of {@link DiversifyingChildrenFloatKnnVectorQuery} with sibling
 * expansion enabled across three sibling-correlation scenarios:
 *
 * <ul>
 *   <li><b>best</b> — siblings are nearly identical (small noise around a parent centroid).
 *       Expansion finds the best sibling immediately; HNSW terminates early.
 *   <li><b>standard</b> — siblings have moderate correlation (realistic use case).
 *   <li><b>worst</b> — siblings are fully independent random vectors. Expansion fires but adds no
 *       recall benefit; measures pure overhead.
 * </ul>
 *
 * Run with:
 *
 * <pre>
 * ./gradlew -p lucene/benchmark-jmh assemble
 * java -jar lucene/benchmark-jmh/build/benchmarks/lucene-benchmark-jmh-*.jar DiversifyingChildrenKnnQueryBenchmark
 * </pre>
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
// 4 warmup iterations of 1 second each; their results are discarded
@Warmup(iterations = 4, time = 1)
// 5 measured iterations of 1 second each; AverageTime mode reports the mean time of one search()
// call, in the unit set by @OutputTimeUnit
@Measurement(iterations = 5, time = 1)
// 3 separate JVM processes
@Fork(
    value = 3,
    jvmArgsAppend = {"-Xmx4g", "-Xms4g", "-XX:+AlwaysPreTouch"})
public class DiversifyingChildrenKnnQueryBenchmark {

  private static final String FIELD = "vec";
  private static final String PARENT_FIELD = "docType";
  private static final String PARENT_VALUE = "_parent";

  /**
   * Number of pre-generated query vectors. Must stay a power of two: {@link #search()} selects the
   * next query with a bit mask rather than a modulo.
   */
  private static final int NUM_QUERY_VECTORS = 256;

  /**
   * Sibling correlation scenario:
   *
   * <ul>
   *   <li>{@code best} — siblings nearly identical (noise = 0.05); best case for expansion.
   *   <li>{@code standard} — siblings moderately correlated (noise = 0.3); realistic case.
   *   <li>{@code worst} — siblings fully random; pure overhead, no recall benefit.
   * </ul>
   *
   * Any other value is rejected by {@link #setup()}.
   */
  @Param({"best", "standard", "worst"})
  public String siblingCorrelation;

  /** Number of parent documents; each one is indexed as a block together with its children. */
  @Param({"5000"})
  public int numParents;

  /** Number of child (vector) documents indexed in front of every parent. */
  @Param({"4", "8", "16"})
  public int childrenPerParent;

  /** Number of hits requested from the KNN query and from the searcher. */
  @Param({"10", "100"})
  public int k;

  /** Dimension of every indexed and query vector. */
  @Param({"128"})
  public int dim;

  private Path tmpDir;
  private Directory dir;
  private IndexReader reader;
  private IndexSearcher searcher;
  private QueryBitSetProducer parentFilter;
  private float[][] queryVectors;
  // Round-robin cursor into queryVectors; only ever read through the bit mask in search().
  private int queryIdx;

  /**
   * Builds a single-segment index of parent/child blocks in a temporary directory, opens a
   * searcher on it and pre-generates the query vectors.
   *
   * @throws IllegalArgumentException if {@link #siblingCorrelation} is not one of the supported
   *     scenarios
   * @throws IOException if the index cannot be written or opened
   */
  @Setup(Level.Trial)
  public void setup() throws IOException {
    // Resolve the scenario first so that a mistyped -p override fails before any temp directory
    // or index is created, instead of silently benchmarking the "worst" scenario.
    float noiseLevel = noiseLevelFor(siblingCorrelation);

    tmpDir = Files.createTempDirectory("DiversifyingChildrenKnnQueryBenchmark");
    dir = MMapDirectory.open(tmpDir);

    // Fixed seed: every trial with the same parameters indexes the same vectors.
    Random rnd = new Random(42);
    try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
      for (int p = 0; p < numParents; p++) {
        // NaN noise means "no centroid": the children of this parent are independent vectors.
        float[] centroid = Float.isNaN(noiseLevel) ? null : randomUnitVector(dim, rnd);
        List<Document> block = new ArrayList<>();
        for (int c = 0; c < childrenPerParent; c++) {
          float[] vec =
              centroid == null
                  ? randomUnitVector(dim, rnd)
                  : perturbedUnitVector(centroid, noiseLevel, rnd);
          Document child = new Document();
          child.add(new KnnFloatVectorField(FIELD, vec, VectorSimilarityFunction.DOT_PRODUCT));
          block.add(child);
        }
        // The parent goes last in the block and is the only document carrying the marker term
        // that parentFilter matches on.
        Document parent = new Document();
        parent.add(new StringField(PARENT_FIELD, PARENT_VALUE, Field.Store.NO));
        block.add(parent);
        w.addDocuments(block);
      }
      // Merge down to one segment before searching.
      w.forceMerge(1);
    }

    reader = DirectoryReader.open(dir);
    searcher = new IndexSearcher(reader);
    parentFilter = new QueryBitSetProducer(new TermQuery(new Term(PARENT_FIELD, PARENT_VALUE)));

    // Separate fixed seed so the query set does not depend on how many vectors were indexed.
    Random qrnd = new Random(123);
    queryVectors = new float[NUM_QUERY_VECTORS][];
    for (int i = 0; i < NUM_QUERY_VECTORS; i++) {
      queryVectors[i] = randomUnitVector(dim, qrnd);
    }
  }

  /**
   * Closes the reader and directory and deletes the temporary index directory.
   *
   * @throws IOException if closing or deleting fails
   */
  @TearDown(Level.Trial)
  public void teardown() throws IOException {
    try {
      IOUtils.close(reader, dir);
    } finally {
      // Attempt the delete even when close() failed, so a failing trial does not leave its index
      // behind in the temp directory.
      IOUtils.rm(tmpDir);
    }
  }

  /**
   * The measured operation: runs one diversifying child KNN query, cycling through the
   * pre-generated query vectors in round-robin order.
   *
   * @return the top hits, returned so the result is not dead-code eliminated
   * @throws IOException if the search fails
   */
  @Benchmark
  public TopDocs search() throws IOException {
    // The mask keeps the index in range even after queryIdx overflows; it relies on
    // NUM_QUERY_VECTORS being a power of two.
    float[] query = queryVectors[queryIdx++ & (NUM_QUERY_VECTORS - 1)];
    Query knnQuery =
        new DiversifyingChildrenFloatKnnVectorQuery(FIELD, query, null, k, parentFilter);
    return searcher.search(knnQuery, k);
  }

  /**
   * Maps a scenario name to the per-dimension noise applied around a parent centroid.
   *
   * @param scenario value of {@link #siblingCorrelation}
   * @return the noise level, or {@code Float.NaN} for the {@code worst} scenario, which uses no
   *     centroid at all
   * @throws IllegalArgumentException if {@code scenario} is not a supported scenario name
   */
  private static float noiseLevelFor(String scenario) {
    return switch (scenario) {
      case "best" -> 0.05f; // nearly identical
      case "standard" -> 0.30f; // moderately correlated
      case "worst" -> Float.NaN; // fully random, no centroid
      default ->
          throw new IllegalArgumentException(
              "Unknown siblingCorrelation '" + scenario + "'; expected best, standard or worst");
    };
  }

  /** Returns a unit-length vector whose components are drawn uniformly from [-1, 1). */
  private static float[] randomUnitVector(int dim, Random rnd) {
    float[] v = new float[dim];
    for (int i = 0; i < dim; i++) {
      v[i] = rnd.nextFloat() * 2 - 1;
    }
    return normalise(v);
  }

  /** Returns a unit vector near {@code centroid} with per-dimension noise scaled by noiseLevel. */
  private static float[] perturbedUnitVector(float[] centroid, float noiseLevel, Random rnd) {
    float[] v = new float[centroid.length];
    for (int i = 0; i < centroid.length; i++) {
      v[i] = centroid[i] + noiseLevel * (rnd.nextFloat() * 2 - 1);
    }
    return normalise(v);
  }

  /**
   * Scales {@code v} in place to unit length and returns it. The vectors are indexed with the
   * DOT_PRODUCT similarity, which is why they are normalised.
   */
  private static float[] normalise(float[] v) {
    float norm = 0;
    for (float x : v) {
      norm += x * x;
    }
    norm = (float) Math.sqrt(norm);
    for (int i = 0; i < v.length; i++) {
      v[i] /= norm;
    }
    return v;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,73 @@ protected static void scoreEntryPoints(
assert scores != null && scores.length >= eps.length;
scorer.bulkScore(eps, scores, eps.length);
results.incVisitedCount(eps.length);
float[] siblingScores = null;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, let's not do it in the entry point exploration. I think just doing max there is the best way.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @benwtrent,
From what @alessandrobenedetti and I had seen, scoreEntryPoints() is done only once (multiple times only with a seeded query in knn), since we have:

org.apache.lucene.util.hnsw.AbstractHnswGraphSearcher#search
search() calling findBestEntryPoint() + searchLevel()

org.apache.lucene.util.hnsw.HnswGraphSearcher#searchLevel
and searchLevel() (which calls scoreEntryPoints()) is called only on level 0

It shouldn't add too much work here, right?

Moving this to the search-only part would break some assertions on visits that we would need to manage otherwise...

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scoreEntryPoints is about getting to the best approximate option in the bottom layer. I don't think it adds too much work, but I wonder whether it is helping at all; I suspect it isn't, and thus it shouldn't be done.

int[] siblingsOrd = new int[0];
for (int i = 0; i < eps.length; i++) {
float score = scores[i];
int ep = eps[i];
visited.set(ep);
candidates.add(ep, score);
if (acceptOrds == null || acceptOrds.get(ep)) {
// Fetch siblingsOrd BEFORE collect() so the parent is not yet in the heap
// The instanceof check is needed: this method is also called with a
// GraphBuilderKnnCollector
if (results instanceof OrdinalTranslatedKnnCollector collector) {
if (collector.isSiblingExpansionCollector()) {
siblingsOrd = collector.getSiblingOrdinals(ep, visited, siblingsOrd);
for (int ord : siblingsOrd) visited.set(ord);
}
}
// Collect the entry point first so that minCompetitiveSimilarity is up to date before its
// siblings are scored
results.collect(ep, score);
if (siblingsOrd.length > 0) {
siblingScores =
scoreHnswNodes(
results,
scorer,
candidates,
acceptOrds,
siblingsOrd,
siblingScores);
}
}
}
}

/**
 * Bulk-scores the given graph nodes, offers every competitive one to the candidate queue and
 * collects those that pass {@code acceptOrds}.
 *
 * <p>The score buffer passed in is reused when it is large enough; otherwise a new one sized to
 * the batch is allocated. Whichever buffer was used is returned so the caller can hand it back on
 * the next call.
 *
 * @param results collector that receives accepted nodes and supplies the competitive threshold
 * @param scorer scorer used to score the whole batch in one call
 * @param candidates queue that receives every node scoring above the current threshold
 * @param acceptOrds filter on ordinals, or {@code null} to accept all
 * @param hnswNodesOrd ordinals of the nodes to score
 * @param scores scratch buffer to reuse, may be {@code null}
 * @return the buffer holding the scores of this batch
 * @throws IOException if scoring fails
 */
protected static float[] scoreHnswNodes(
    KnnCollector results,
    RandomVectorScorer scorer,
    NeighborQueue candidates,
    Bits acceptOrds,
    int[] hnswNodesOrd,
    float[] scores)
    throws IOException {
  final int count = hnswNodesOrd.length;
  // Reuse the caller's scratch space when it fits; bulkScore overwrites its first `count` slots.
  final float[] buffer = (scores != null && scores.length >= count) ? scores : new float[count];

  final float batchBest = scorer.bulkScore(hnswNodesOrd, buffer, count);
  results.incVisitedCount(count);

  // Nothing in the batch beats the current threshold: skip the per-node pass entirely.
  if (!(batchBest > results.minCompetitiveSimilarity())) {
    return buffer;
  }

  float threshold = Math.nextUp(results.minCompetitiveSimilarity());
  for (int idx = 0; idx < count; idx++) {
    final float nodeScore = buffer[idx];
    // Do not enqueue a node whose score is not competitive.
    if (!(nodeScore >= threshold)) {
      continue;
    }
    final int node = hnswNodesOrd[idx];
    candidates.add(node, nodeScore);
    if (acceptOrds != null && !acceptOrds.get(node)) {
      continue;
    }
    results.collect(node, nodeScore);
    // Collecting may raise the bar, so refresh the threshold for the remaining nodes.
    threshold = Math.nextUp(results.minCompetitiveSimilarity());
  }
  return buffer;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.util.hnsw;

/**
* Implemented by collectors that understand parent-child document relationships and can enumerate
* sibling document ids for a given child document id, as well as translate document ids back to
* vector ordinals.
*
* <p>This interface is used internally by {@link OrdinalTranslatedKnnCollector} to bridge between
* the ordinal space of the HNSW graph and the document-id space of the collector.
*
* @lucene.experimental
*/
// Why this interface exists: it is a module-boundary bridge and cannot simply be removed.
// DocSiblingExpansion lives in lucene/core, while DiversifyingNearestChildrenKnnCollector lives
// in lucene/join. The dependency is one-way (join depends on core, never the reverse), so
// OrdinalTranslatedKnnCollector (in core) has no way to reference
// DiversifyingNearestChildrenKnnCollector directly.
//
// Through this interface, core can call findSiblingDocIds and docIdToOrdinal on the collector
// without creating a circular dependency. Removing it would require either moving
// OrdinalTranslatedKnnCollector into join (a bigger refactor) or adding a core -> join
// dependency (illegal in this architecture).
public interface DocSiblingExpansion {

/**
* Returns the doc ids of all siblings of {@code childDocId}, or an empty array if there are no
* other siblings. The documented "no siblings" result is an empty array, not {@code null}.
*
* @param childDocId the document id of the child that is about to be collected
* @return sibling doc ids, or an empty array
*/
int[] findSiblingDocIds(int childDocId);

/**
* Translates a document id to its vector ordinal, or returns {@code -1} if the document has no
* vector in this field. Callers must check for {@code -1} before using the result as an ordinal.
*
* @param docId the document id
* @return the vector ordinal, or {@code -1}
*/
int docIdToOrdinal(int docId);
}
Loading
Loading