Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ public class DenseVectorField extends FloatPointField {
static final String HNSW_M = "hnswM";
static final String HNSW_EF_CONSTRUCTION = "hnswEfConstruction";
static final String VECTOR_ENCODING = "vectorEncoding";
static final String USE_VECTOR_VALUES_AS_STORED = "useVectorValuesAsStored";
static final VectorEncoding DEFAULT_VECTOR_ENCODING = VectorEncoding.FLOAT32;
static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
static final VectorSimilarityFunction DEFAULT_SIMILARITY = VectorSimilarityFunction.EUCLIDEAN;
Expand Down Expand Up @@ -117,6 +118,8 @@ public class DenseVectorField extends FloatPointField {
*/
private VectorEncoding vectorEncoding;

private boolean useVectorValuesAsStored;

private int cuvsWriterThreads;
private int cuvsIntGraphDegree;
private int cuvsGraphDegree;
Expand Down Expand Up @@ -187,6 +190,10 @@ public void init(IndexSchema schema, Map<String, String> args) {
.orElse(DEFAULT_VECTOR_ENCODING);
args.remove(VECTOR_ENCODING);

this.useVectorValuesAsStored =
ofNullable(args.get(USE_VECTOR_VALUES_AS_STORED)).map(Boolean::parseBoolean).orElse(false);
args.remove(USE_VECTOR_VALUES_AS_STORED);

this.hnswM =
ofNullable(args.get(HNSW_M))
.map(Integer::parseInt)
Expand Down Expand Up @@ -309,6 +316,10 @@ public int getCuvsHnswEfConstruction() {
return cuvsHnswEfConstruction;
}

/**
 * Reports whether this field type was configured with the {@code useVectorValuesAsStored}
 * argument.
 *
 * <p>The flag is parsed from the field type arguments during {@code init} and is {@code false}
 * when the argument is absent.
 *
 * @return {@code true} if {@code useVectorValuesAsStored} was set to true for this field type
 */
public boolean useVectorValuesAsStored() {
  return this.useVectorValuesAsStored;
}

@Override
protected boolean enableDocValuesByDefault() {
return false;
Expand All @@ -324,6 +335,21 @@ public void checkSchemaField(final SchemaField field) throws SolrException {
getClass().getSimpleName() + " fields can not have docValues: " + field.getName());
}

if (useVectorValuesAsStored) {
if (!field.stored()) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
USE_VECTOR_VALUES_AS_STORED + " requires stored=true for field " + field.getName());
}
if (field.multiValued()) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
USE_VECTOR_VALUES_AS_STORED
+ " is not supported for multiValued DenseVectorField: "
+ field.getName());
}
}

switch (vectorEncoding) {
case FLOAT32:
if (dimension > KnnVectorsFormat.DEFAULT_MAX_DIMENSIONS) {
Expand Down Expand Up @@ -360,7 +386,7 @@ public List<IndexableField> createFields(SchemaField field, Object value) {
if (field.indexed()) {
fields.add(createField(field, vectorBuilder));
}
if (field.stored()) {
if (field.stored() && !useVectorValuesAsStored) {
switch (vectorEncoding) {
case FLOAT32:
fields.ensureCapacity(vectorBuilder.getFloatVector().length + 1);
Expand Down
141 changes: 136 additions & 5 deletions solr/core/src/java/org/apache/solr/search/SolrDocumentFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,15 @@
import org.apache.lucene.document.StoredValue;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
Expand All @@ -68,6 +71,7 @@
import org.apache.solr.response.DocsStreamer;
import org.apache.solr.response.ResultContext;
import org.apache.solr.schema.BoolField;
import org.apache.solr.schema.DenseVectorField;
import org.apache.solr.schema.EnumFieldType;
import org.apache.solr.schema.LatLonPointSpatialField;
import org.apache.solr.schema.NumberType;
Expand Down Expand Up @@ -95,6 +99,7 @@ public class SolrDocumentFetcher {
private final Set<String> allStored;

private final Set<String> dvsCanSubstituteStored;
private final Set<String> derivedStoredVectorFields;

/** Contains the names/patterns of all docValues=true,stored=false fields in the schema. */
private final Set<String> allNonStoredDVs;
Expand Down Expand Up @@ -133,6 +138,7 @@ private SolrDocumentFetcher(SolrDocumentFetcher template, StoredFields storedFie
this.largeFields = template.largeFields;
this.dvsCanSubstituteStored = template.dvsCanSubstituteStored;
this.allStored = template.allStored;
this.derivedStoredVectorFields = template.derivedStoredVectorFields;
this.storedHighlightFieldNames = template.indexedFieldNames;
this.indexedFieldNames = template.indexedFieldNames;
this.storedFields = storedFields;
Expand Down Expand Up @@ -169,6 +175,15 @@ protected SolrDocumentFetcher clone() {
final Set<String> storedLargeFields = new HashSet<>();
final Set<String> dvsCanSubstituteStored = new HashSet<>();
final Set<String> allStoreds = new HashSet<>();
final Set<String> derivedStoredVectors = new HashSet<>();

for (SchemaField schemaField : searcher.getSchema().getFields().values()) {
if (schemaField.getType() instanceof DenseVectorField vectorField
&& schemaField.stored()
&& vectorField.useVectorValuesAsStored()) {
derivedStoredVectors.add(schemaField.getName());
}
}

// can find materialized dynamic fields, unlike using the Solr IndexSchema.
for (FieldInfo fieldInfo : searcher.getFieldInfos()) {
Expand All @@ -182,6 +197,11 @@ protected SolrDocumentFetcher clone() {
if (schemaField.stored()) {
allStoreds.add(fieldInfo.name);
}
if (schemaField.getType() instanceof DenseVectorField vectorField
&& schemaField.stored()
&& vectorField.useVectorValuesAsStored()) {
derivedStoredVectors.add(fieldInfo.name);
}
if (!schemaField.stored() && schemaField.hasDocValues()) {
if (schemaField.useDocValuesAsStored()) {
nonStoredDVsUsedAsStored.add(fieldInfo.name);
Expand All @@ -203,6 +223,7 @@ protected SolrDocumentFetcher clone() {
this.largeFields = Collections.unmodifiableSet(storedLargeFields);
this.dvsCanSubstituteStored = Collections.unmodifiableSet(dvsCanSubstituteStored);
this.allStored = Collections.unmodifiableSet(allStoreds);
this.derivedStoredVectorFields = Collections.unmodifiableSet(derivedStoredVectors);
this.storedFields = null; // template docFetcher should throw NPE if used directly
this.storedHighlightFieldNames = new Collection[1];
this.indexedFieldNames = new Collection[1];
Expand Down Expand Up @@ -768,7 +789,7 @@ public Set<String> getNonStoredDVsWithoutCopyTargets() {
}

/**
* Moved as a private class here, we consider it an impelmentation detail. It should not be
* Moved as a private class here, we consider it an implementation detail. It should not be
* exposed outside of this class.
*
* <p>This class is in charge of ensuring that SolrDocuments can have their fields populated
Expand All @@ -780,6 +801,7 @@ class RetrieveFieldsOptimizer {
private final Set<String> storedFields;
// always non null
private final Set<String> dvFields;
private final Set<String> vectorFields;

private final SolrReturnFields solrReturnFields;

Expand All @@ -788,6 +810,7 @@ class RetrieveFieldsOptimizer {
RetrieveFieldsOptimizer(SolrReturnFields solrReturnFields) {
this.storedFields = calcStoredFieldsForReturn(solrReturnFields);
this.dvFields = calcDocValueFieldsForReturn(solrReturnFields);
this.vectorFields = calcDerivedVectorFieldsForReturn(solrReturnFields);
this.solrReturnFields = solrReturnFields;

if (storedFields != null && dvsCanSubstituteStored.containsAll(storedFields)) {
Expand Down Expand Up @@ -830,11 +853,27 @@ private Set<String> calcStoredFieldsForReturn(ReturnFields returnFields) {
if (returnFields.wantsAllFields()) {
return null;
} else if (returnFields.hasPatternMatching()) {
for (String s : getAllStored()) {
if (returnFields.wantsField(s)) {
storedFields.add(s);
}
if (fnames == null) {
return null;
}
storedFields.addAll(fnames);
storedFields.removeIf(
(String name) -> {
SchemaField schemaField = searcher.getSchema().getFieldOrNull(name);
if (schemaField == null) {
// Get it from the stored fields if, for some reason, we can't get the schema.
return false;
}
if (schemaField.stored() && schemaField.multiValued()) {
// must return multivalued fields from stored data if possible.
return false;
}
if (schemaField.stored() == false) {
// if it's not stored, no choice but to return from DV.
return true;
}
return false;
});
} else if (fnames != null) {
storedFields.addAll(fnames);
storedFields.removeIf(
Expand Down Expand Up @@ -893,6 +932,96 @@ private Set<String> calcDocValueFieldsForReturn(ReturnFields returnFields) {
return result;
}

/**
 * Works out which of the known derived-stored vector fields the request wants returned.
 *
 * <p>Selection rules, in order:
 *
 * <ul>
 *   <li>no derived-stored vector fields are known: an empty set;
 *   <li>all fields are requested: every derived-stored vector field;
 *   <li>pattern matching is in use, or no explicit Lucene field names are available: each
 *       derived-stored vector field accepted by {@link ReturnFields#wantsField(String)};
 *   <li>otherwise: the explicitly named fields that are also derived-stored vector fields.
 * </ul>
 *
 * @param returnFields the return-field specification of the current request
 * @return the names of the derived-stored vector fields to populate; never {@code null}
 */
private Set<String> calcDerivedVectorFieldsForReturn(ReturnFields returnFields) {
  if (derivedStoredVectorFields.isEmpty()) {
    return Set.of();
  }
  if (returnFields.wantsAllFields()) {
    return new HashSet<>(derivedStoredVectorFields);
  }

  // Explicit names are only consulted when no glob/pattern is involved.
  final Set<String> explicitNames =
      returnFields.hasPatternMatching() ? null : returnFields.getLuceneFieldNames();

  final Set<String> selected = new HashSet<>();
  for (String candidate : derivedStoredVectorFields) {
    final boolean wanted =
        explicitNames != null
            ? explicitNames.contains(candidate)
            : returnFields.wantsField(candidate);
    if (wanted) {
      selected.add(candidate);
    }
  }
  return selected;
}

/**
 * Adds the requested derived-stored vector fields to {@code sdoc}, reading each value from the
 * vector values of the segment that contains the document.
 *
 * <p>A field is left untouched when the document already contains it, when it cannot be resolved
 * to a {@link DenseVectorField} in the schema, when the segment has no vector values for it, or
 * when the document has no vector for it.
 *
 * @param sdoc the document to populate
 * @param luceneDocId the top-level (index-wide) Lucene document id
 * @throws IOException if reading vector values from the index fails
 */
private void decorateDerivedVectorFields(SolrDocument sdoc, int luceneDocId)
    throws IOException {
  if (vectorFields.isEmpty()) {
    return;
  }

  // Locate the segment that holds this document and translate to its segment-local id.
  final List<LeafReaderContext> leaves = searcher.getLeafContexts();
  final LeafReaderContext leaf = leaves.get(ReaderUtil.subIndex(luceneDocId, leaves));
  final LeafReader segmentReader = leaf.reader();
  final int segmentDocId = luceneDocId - leaf.docBase;

  for (String fieldName : vectorFields) {
    if (sdoc.containsKey(fieldName)) {
      continue;
    }

    final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldName);
    if (schemaField == null) {
      continue;
    }
    if (!(schemaField.getType() instanceof DenseVectorField denseType)) {
      continue;
    }

    final List<Number> components =
        switch (denseType.getVectorEncoding()) {
          case FLOAT32 -> readDerivedFloatVector(segmentReader, fieldName, segmentDocId);
          case BYTE -> readDerivedByteVector(segmentReader, fieldName, segmentDocId);
          default -> null;
        };
    if (components != null) {
      sdoc.setField(fieldName, components);
    }
  }
}

/**
 * Reads the float vector of one document; returns {@code null} if the segment or the document
 * has no vector for {@code field}.
 */
private List<Number> readDerivedFloatVector(LeafReader reader, String field, int docId)
    throws IOException {
  final FloatVectorValues vectors = reader.getFloatVectorValues(field);
  if (vectors == null) {
    return null;
  }
  final KnnVectorValues.DocIndexIterator cursor = vectors.iterator();
  if (cursor.advance(docId) != docId) {
    return null;
  }
  final float[] raw = vectors.vectorValue(cursor.index());
  final List<Number> boxed = new ArrayList<>(raw.length);
  for (float component : raw) {
    boxed.add(component);
  }
  return boxed;
}

/**
 * Reads the byte vector of one document, widening each component to an int; returns {@code
 * null} if the segment or the document has no vector for {@code field}.
 */
private List<Number> readDerivedByteVector(LeafReader reader, String field, int docId)
    throws IOException {
  final ByteVectorValues vectors = reader.getByteVectorValues(field);
  if (vectors == null) {
    return null;
  }
  final KnnVectorValues.DocIndexIterator cursor = vectors.iterator();
  if (cursor.advance(docId) != docId) {
    return null;
  }
  final byte[] raw = vectors.vectorValue(cursor.index());
  final List<Number> boxed = new ArrayList<>(raw.length);
  for (byte component : raw) {
    boxed.add((int) component);
  }
  return boxed;
}

private SolrDocument getSolrDoc(int luceneDocId) {

SolrDocument sdoc = null;
Expand All @@ -903,6 +1032,7 @@ private SolrDocument getSolrDoc(int luceneDocId) {
sdoc =
DocsStreamer.convertLuceneDocToSolrDoc(doc, searcher.getSchema(), getReturnFields());
if (returnDVFields() == false) {
decorateDerivedVectorFields(sdoc, luceneDocId);
solrReturnFields.setFieldSources(SolrReturnFields.FIELD_SOURCES.ALL_FROM_STORED);
return sdoc;
} else {
Expand All @@ -918,6 +1048,7 @@ private SolrDocument getSolrDoc(int luceneDocId) {
if (returnDVFields()) {
decorateDocValueFields(sdoc, luceneDocId, getDvFields(), reuseDvIters);
}
decorateDerivedVectorFields(sdoc, luceneDocId);
} catch (IOException e) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!--
Intentionally invalid schema: the "vector" field combines useVectorValuesAsStored="true"
(set on its field type) with multiValued="true". DenseVectorField.checkSchemaField rejects
that combination, so this schema is expected to fail.
-->
<schema name="bad-schema-densevector-derived-stored-multivalued" version="1.7">
<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" useVectorValuesAsStored="true"/>
<field name="vector" type="knn_vector" indexed="true" stored="true" multiValued="true"/>
</schema>
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!--
Schema with dense vector fields that are declared stored="true" and whose field types set
useVectorValuesAsStored="true". One field uses the default encoding, the other BYTE.
-->
<schema name="schema-densevector-derived-stored" version="1.6">
<fieldType name="string" class="solr.StrField" multiValued="true"/>
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<!-- No vectorEncoding is given, so DenseVectorField falls back to its FLOAT32 default. -->
<fieldType name="knn_vector_derived" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" useVectorValuesAsStored="true"/>
<!-- Same configuration as above, but with BYTE-encoded vectors. -->
<fieldType name="knn_vector_byte_derived" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE" useVectorValuesAsStored="true"/>
<fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>

<field name="_root_" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<fieldType name="_nest_path_" class="solr.NestPathField" />

<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="_nest_path_" type="_nest_path_" />

<!-- Vector fields must be stored="true" when their type sets useVectorValuesAsStored. -->
<field name="vector" type="knn_vector_derived" indexed="true" stored="true"/>
<field name="vector_byte_encoding" type="knn_vector_byte_derived" indexed="true" stored="true" />
<field name="string_field" type="string" indexed="true" stored="true" multiValued="false" required="false"/>

<field name="_version_" type="plong" indexed="true" stored="true" multiValued="false" />
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
<copyField source="*" dest="_text_"/>
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>

<uniqueKey>id</uniqueKey>
</schema>
Loading
Loading