diff --git a/docs/changelog/144548.yaml b/docs/changelog/144548.yaml new file mode 100644 index 0000000000000..43a229fcdfd7a --- /dev/null +++ b/docs/changelog/144548.yaml @@ -0,0 +1,5 @@ +area: Vector Search +issues: [] +pr: 144548 +summary: Nested KNN vs inner hits scoring consistency +type: bug diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/130_knn_query_nested_search.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/130_knn_query_nested_search.yml index 2416689c285fd..dffa5da2dd291 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/130_knn_query_nested_search.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/130_knn_query_nested_search.yml @@ -587,3 +587,110 @@ setup: - match: { hits.total.value: 0 } +--- +"nested kNN with score_mode max - root score matches inner_hits score (#138496, #138011)": + - requires: + test_runner_features: ["close_to"] + reason: "Assert root and inner hit scores match" + + - do: + search: + index: test + body: + _source: false + query: + nested: + path: nested + score_mode: max + query: + knn: + field: nested.vector + query_vector: [-0.5, 90.0, -10, 14.8, -156.0] + num_candidates: 5 + inner_hits: { size: 3, "fields": [ "nested.paragraph_id" ], _source: false } + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "1" } + # Root score must equal top inner hit score (score_mode max) + - close_to: { hits.hits.0._score: { value: 0.00909090, error: 0.0001 } } + - close_to: { hits.hits.0.inner_hits.nested.hits.hits.0._score: { value: 0.00909090, error: 0.0001 } } + - close_to: { hits.hits.1._score: { value: 0.0021519717, error: 0.0001 } } + - close_to: { hits.hits.1.inner_hits.nested.hits.hits.0._score: { value: 0.0021519717, error: 0.0001 } } + - close_to: { hits.hits.2._score: { value: 0.00001, error: 0.0001 } } + - close_to: { hits.hits.2.inner_hits.nested.hits.hits.0._score: { value: 0.00001, error: 0.0001 } } + +--- +"nested bool(knn + match) with score_mode max - root score matches inner_hits score (#138011)": + - requires: + test_runner_features: ["close_to"] + reason: "Assert root and inner hit scores match with multi-clause scoring" + + - do: + indices.create: + index: nested_score_consistency + body: + mappings: + properties: + nested: + type: nested + properties: + embedding: + type: dense_vector + dims: 5 + index: true + similarity: l2_norm + text: + type: text + + - do: + index: + index: nested_score_consistency + id: "1" + body: + nested: + - embedding: [0.0, 0.0, 0.0, 0.0, 0.0] + text: "foo bar" + + - do: + index: + index: nested_score_consistency + id: "2" + body: + nested: + - embedding: [1.0, 1.0, 1.0, 1.0, 1.0] + text: "foo bar" + + - do: + indices.refresh: {} + + - do: + search: + index: nested_score_consistency + body: + _source: false + query: + nested: + path: nested + score_mode: max + query: + bool: + must: + - knn: + field: nested.embedding + query_vector: [0.0, 0.0, 0.0, 0.0, 0.0] + num_candidates: 5 + - match: + nested.text: "foo" + inner_hits: { size: 3, _source: false } + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "2" } + # Root score must equal top inner hit score (bool KNN + match combines clause scores; compare via set/match) + - set: { hits.hits.0.inner_hits.nested.hits.hits.0._score: inner_score_doc0 } + - match: { hits.hits.0._score: $inner_score_doc0 } + - set: { hits.hits.1.inner_hits.nested.hits.hits.0._score: inner_score_doc1 } + - match: { hits.hits.1._score: $inner_score_doc1 } + diff --git a/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java b/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java index 6a67977ab2507..71374f640a40b 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java @@ -127,6 +127,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings { IndexSettings.INDEX_FAST_REFRESH_SETTING, IndexSettings.MAX_RESULT_WINDOW_SETTING, IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING, + IndexSettings.MAX_NESTED_KNN_CANDIDATES_SETTING, IndexSettings.MAX_TOKEN_COUNT_SETTING, IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING, IndexSettings.MAX_SCRIPT_FIELDS_SETTING, diff --git a/server/src/main/java/org/elasticsearch/index/IndexSettings.java b/server/src/main/java/org/elasticsearch/index/IndexSettings.java index 5fdca38a61b1e..05d42b5724859 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSettings.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSettings.java @@ -165,6 +165,20 @@ public final class IndexSettings { Property.IndexScope ); + /** + * Index setting describing the maximum value of children per parent to be considered by the + * diversifying KNN when a nested query's inner query is a bool with both a KNN clause and other scoring clauses. + * The root score should be equal to the max combined score, hence consistent with inner_hits. + */ + public static final Setting MAX_NESTED_KNN_CANDIDATES_SETTING = Setting.intSetting( + "index.max_nested_knn_candidates", + 100, + 1, + 10000, + Property.Dynamic, + Property.IndexScope + ); + /** * Index setting describing the maximum value of allowed `script_fields`that can be retrieved * per search request. The default maximum of 32 is defensive for the reason that retrieving @@ -1197,6 +1211,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) { private volatile boolean warmerEnabled; private volatile int maxResultWindow; private volatile int maxInnerResultWindow; + private volatile int maxNestedKnnCandidates; private volatile int maxRescoreWindow; private volatile int maxDocvalueFields; private volatile int maxScriptFields; @@ -1398,6 +1413,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti warmerEnabled = scopedSettings.get(INDEX_WARMER_ENABLED_SETTING); maxResultWindow = scopedSettings.get(MAX_RESULT_WINDOW_SETTING); maxInnerResultWindow = scopedSettings.get(MAX_INNER_RESULT_WINDOW_SETTING); + maxNestedKnnCandidates = scopedSettings.get(MAX_NESTED_KNN_CANDIDATES_SETTING); maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING); maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING); maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING); @@ -1529,6 +1545,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_SYNC_INTERVAL_SETTING, this::setTranslogSyncInterval); scopedSettings.addSettingsUpdateConsumer(MAX_RESULT_WINDOW_SETTING, this::setMaxResultWindow); scopedSettings.addSettingsUpdateConsumer(MAX_INNER_RESULT_WINDOW_SETTING, this::setMaxInnerResultWindow); + scopedSettings.addSettingsUpdateConsumer(MAX_NESTED_KNN_CANDIDATES_SETTING, this::setMaxNestedKnnCandidates); scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow); scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields); scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields); @@ -1880,6 +1897,18 @@ private void setMaxInnerResultWindow(int maxInnerResultWindow) { this.maxInnerResultWindow = maxInnerResultWindow; } + /** + * Returns the maximum number of nested children per parent to consider when the nested inner + * query has both KNN and other scoring clauses (so root score matches inner_hits). + */ + public int getMaxNestedKnnCandidates() { + return maxNestedKnnCandidates; + } + + private void setMaxNestedKnnCandidates(int maxNestedKnnCandidates) { + this.maxNestedKnnCandidates = maxNestedKnnCandidates; + } + /** * Returns the maximum rescore window for search requests. */ diff --git a/server/src/main/java/org/elasticsearch/index/query/NestedQueryBuilder.java b/server/src/main/java/org/elasticsearch/index/query/NestedQueryBuilder.java index 9f14378307fd6..e1d2cb90bf340 100644 --- a/server/src/main/java/org/elasticsearch/index/query/NestedQueryBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/NestedQueryBuilder.java @@ -40,12 +40,15 @@ import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.fetch.subphase.InnerHitsContext; import org.elasticsearch.search.internal.SearchContext; +import org.elasticsearch.search.vectors.KnnVectorQueryBuilder; import org.elasticsearch.xcontent.ParseField; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; @@ -268,7 +271,71 @@ protected int doHashCode() { @Override protected Query doToQuery(SearchExecutionContext context) throws IOException { - return toQuery((this.query::toQuery), path, scoreMode, ignoreUnmapped, context); + QueryBuilder innerToBuild = hasNestedKnnWithOtherScoringClauses(query) ? rewriteInnerForNestedKnnMultiClause(query) : query; + return toQuery(innerToBuild::toQuery, path, scoreMode, ignoreUnmapped, context); + } + + /** + * Rewrites the inner query so that each KnnVectorQueryBuilder in must/should is replaced by a + * copy with useMaxNestedKnnCandidatesInNested set, so the built query considers more children + * per parent and the root score matches inner_hits. Only rewrites top-level bool. + */ + private static QueryBuilder rewriteInnerForNestedKnnMultiClause(QueryBuilder inner) { + if ((inner instanceof BoolQueryBuilder) == false) { + return inner; + } + BoolQueryBuilder bool = (BoolQueryBuilder) inner; + BoolQueryBuilder rewritten = new BoolQueryBuilder(); + rewritten.boost(bool.boost()); + rewritten.queryName(bool.queryName()); + if (bool.minimumShouldMatch() != null) { + rewritten.minimumShouldMatch(bool.minimumShouldMatch()); + } + for (QueryBuilder clause : bool.must()) { + rewritten.must(clause instanceof KnnVectorQueryBuilder knn ? knn.copyWithUseMaxNestedKnnCandidatesInNested(true) : clause); + } + for (QueryBuilder clause : bool.should()) { + rewritten.should(clause instanceof KnnVectorQueryBuilder knn ? knn.copyWithUseMaxNestedKnnCandidatesInNested(true) : clause); + } + for (QueryBuilder clause : bool.filter()) { + rewritten.filter(clause); + } + for (QueryBuilder clause : bool.mustNot()) { + rewritten.mustNot(clause); + } + return rewritten; + } + + /** + * Returns true when the inner query is a bool that contains both a KNN clause and at least one + * other scoring clause (must or should). In that case the diversifying KNN should consider + * more children per parent so the root score equals the max combined score (inner_hits). + */ + static boolean hasNestedKnnWithOtherScoringClauses(QueryBuilder query) { + if ((query instanceof BoolQueryBuilder) == false) { + return false; + } + BoolQueryBuilder bool = (BoolQueryBuilder) query; + boolean hasKnn = false; + boolean hasOtherScoring = false; + for (QueryBuilder clause : concat(bool.must(), bool.should())) { + if (clause instanceof KnnVectorQueryBuilder) { + hasKnn = true; + } else { + hasOtherScoring = true; + } + if (hasKnn && hasOtherScoring) { + return true; + } + } + return false; + } + + private static List concat(List a, List b) { + List out = new ArrayList<>(a.size() + b.size()); + out.addAll(a); + out.addAll(b); + return out; } /** diff --git a/server/src/main/java/org/elasticsearch/search/vectors/KnnVectorQueryBuilder.java b/server/src/main/java/org/elasticsearch/search/vectors/KnnVectorQueryBuilder.java index 0ca8c9ef0c189..80ec9f37660db 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/KnnVectorQueryBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/KnnVectorQueryBuilder.java @@ -87,7 +87,8 @@ public class KnnVectorQueryBuilder extends LeafQueryBuilder { + assertHitCount(response, 2); + for (SearchHit hit : response.getHits()) { + Map innerHits = hit.getInnerHits(); + assertThat(innerHits, notNullValue()); + SearchHits nestedHits = innerHits.get("nested"); + assertThat(nestedHits, notNullValue()); + assertThat(nestedHits.getHits().length, greaterThan(0)); + float innerScore = nestedHits.getAt(0).getScore(); + assertThat( + "Root score should match top inner hit score (#138496, #138011)", + (double) hit.getScore(), + closeTo(innerScore, 1e-5) + ); + } + }); + } + + /** + * Ensures nested bool(KNN + match) with score_mode max and inner_hits returns root document score + * equal to the top inner hit score (fix for #138011). + */ + public void testNestedKnnScoreConsistencyWithBoolKnnAndMatch() throws IOException { + XContentBuilder mapping = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject("nested") + .field("type", "nested") + .startObject("properties") + .startObject("embedding") + .field("type", "dense_vector") + .field("dims", VECTOR_DIMENSION) + .field("index", true) + .field("similarity", "l2_norm") + .endObject() + .startObject("text") + .field("type", "text") + .endObject() + .endObject() + .endObject() + .endObject() + .endObject(); + createIndex("index", Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).build(), mapping); + + float[] queryVector = randomVector(); + prepareIndex("index").setId("1").setSource("nested", List.of(Map.of("embedding", randomVector(), "text", "foo bar"))).get(); + prepareIndex("index").setId("2").setSource("nested", List.of(Map.of("embedding", queryVector, "text", "foo bar"))).get(); + indicesAdmin().prepareRefresh("index").get(); + + var boolQuery = QueryBuilders.boolQuery() + .must(new KnnVectorQueryBuilder("nested.embedding", queryVector, 5, 20, 10f, null, null)) + .must(QueryBuilders.matchQuery("nested.text", "foo")); + var nestedQuery = QueryBuilders.nestedQuery("nested", boolQuery, ScoreMode.Max).innerHit(new InnerHitBuilder("nested").setSize(3)); + + assertResponse(client().prepareSearch("index").setQuery(nestedQuery).setSize(10), response -> { + assertHitCount(response, 2); + for (SearchHit hit : response.getHits()) { + Map innerHits = hit.getInnerHits(); + assertThat(innerHits, notNullValue()); + SearchHits nestedHits = innerHits.get("nested"); + assertThat(nestedHits, notNullValue()); + assertThat(nestedHits.getHits().length, greaterThan(0)); + float innerScore = nestedHits.getAt(0).getScore(); + assertThat( + "Root score should match top inner hit score with bool(KNN+match) (#138011)", + (double) hit.getScore(), + closeTo(innerScore, 1e-5) + ); + } + }); + } + private float[] randomVector() { float[] vector = new float[VECTOR_DIMENSION]; for (int i = 0; i < vector.length; i++) { diff --git a/server/src/test/java/org/elasticsearch/index/query/NestedQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/NestedQueryBuilderTests.java index 39520db299f65..af136ed8c6910 100644 --- a/server/src/test/java/org/elasticsearch/index/query/NestedQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/NestedQueryBuilderTests.java @@ -469,4 +469,62 @@ public void testDisallowExpensiveQueries() { ElasticsearchException e = expectThrows(ElasticsearchException.class, () -> queryBuilder.toQuery(searchExecutionContext)); assertEquals("[joining] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", e.getMessage()); } + + public void testHasNestedKnnWithOtherScoringClauses() { + KnnVectorQueryBuilder knn = new KnnVectorQueryBuilder( + "nested1." + VECTOR_FIELD, + new float[] { 1.0f, 2.0f, 3.0f }, + 1, + 10, + null, + null, + null + ); + QueryBuilder match = QueryBuilders.matchQuery(TEXT_FIELD_NAME, "foo"); + + assertFalse(NestedQueryBuilder.hasNestedKnnWithOtherScoringClauses(knn)); + assertFalse(NestedQueryBuilder.hasNestedKnnWithOtherScoringClauses(match)); + assertFalse(NestedQueryBuilder.hasNestedKnnWithOtherScoringClauses(QueryBuilders.boolQuery().must(knn))); + + assertTrue(NestedQueryBuilder.hasNestedKnnWithOtherScoringClauses(QueryBuilders.boolQuery().must(knn).should(match))); + assertTrue(NestedQueryBuilder.hasNestedKnnWithOtherScoringClauses(QueryBuilders.boolQuery().must(knn).must(match))); + assertTrue(NestedQueryBuilder.hasNestedKnnWithOtherScoringClauses(QueryBuilders.boolQuery().should(knn).should(match))); + } + + public void testNestedKnnMultiClauseScoringRewriteBuildsSuccessfully() throws IOException { + BoolQueryBuilder bool = QueryBuilders.boolQuery() + .must(new KnnVectorQueryBuilder("nested1." + VECTOR_FIELD, new float[] { 1.0f, 2.0f, 3.0f }, 1, 10, null, null, null)) + .should(QueryBuilders.matchQuery(TEXT_FIELD_NAME, "foo")); + NestedQueryBuilder nested = new NestedQueryBuilder("nested1", bool, ScoreMode.Max); + SearchExecutionContext context = createSearchExecutionContext(); + Query query = nested.toQuery(context); + assertThat(query, instanceOf(ESToParentBlockJoinQuery.class)); + } + + public void testKnnCopyWithUseMaxNestedKnnCandidatesInNestedAffectsEquality() { + KnnVectorQueryBuilder knn = new KnnVectorQueryBuilder( + "nested1." + VECTOR_FIELD, + new float[] { 1.0f, 2.0f, 3.0f }, + 1, + 10, + null, + null, + null + ); + KnnVectorQueryBuilder withFlag = knn.copyWithUseMaxNestedKnnCandidatesInNested(true); + assertNotEquals(knn, withFlag); + assertNotEquals(knn.hashCode(), withFlag.hashCode()); + assertEquals(withFlag, knn.copyWithUseMaxNestedKnnCandidatesInNested(true)); + assertEquals(knn, knn.copyWithUseMaxNestedKnnCandidatesInNested(false)); + } + + public void testNestedKnnWithScoreModeMaxAndBoolKnnPlusMatchBuildsSuccessfully() throws IOException { + BoolQueryBuilder bool = QueryBuilders.boolQuery() + .must(new KnnVectorQueryBuilder("nested1." + VECTOR_FIELD, new float[] { 1.0f, 2.0f, 3.0f }, 1, 10, null, null, null)) + .must(QueryBuilders.matchQuery(TEXT_FIELD_NAME, "foo")); + NestedQueryBuilder nested = new NestedQueryBuilder("nested1", bool, ScoreMode.Max); + SearchExecutionContext context = createSearchExecutionContext(); + Query query = nested.toQuery(context); + assertThat(query, instanceOf(ESToParentBlockJoinQuery.class)); + } } diff --git a/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/TransportResumeFollowAction.java b/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/TransportResumeFollowAction.java index d42e99438e92d..4c830718d96e3 100644 --- a/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/TransportResumeFollowAction.java +++ b/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/TransportResumeFollowAction.java @@ -468,6 +468,7 @@ static String[] extractLeaderShardHistoryUUIDs(Map ccrIndexMetad IndexSettings.INDEX_REFRESH_INTERVAL_SETTING, IndexSettings.MAX_RESCORE_WINDOW_SETTING, IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING, + IndexSettings.MAX_NESTED_KNN_CANDIDATES_SETTING, IndexSettings.DEFAULT_FIELD_SETTING, IndexSettings.QUERY_STRING_LENIENT_SETTING, IndexSettings.QUERY_STRING_ANALYZE_WILDCARD,