Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LRUQueryCache;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RamUsageEstimator;

/**
* A {@link BulkScorer} that restricts collection to the half-open doc ID interval {@code [minDocID,
Expand Down Expand Up @@ -95,4 +98,32 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr
public long cost() {
return maxDocID - minDocID;
}

@Override
public LRUQueryCache.CacheAndCount intoCacheAndCount(int maxDoc) {
DocIdSet docIdSet = new RangeDocIdSet(minDocID, maxDocID);
return new LRUQueryCache.CacheAndCount(docIdSet, maxDocID - minDocID);
}

private static class RangeDocIdSet extends DocIdSet {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(RangeDocIdSet.class);
private final int minDocID;
private final int maxDocID;

RangeDocIdSet(int minDocID, int maxDocID) {
this.minDocID = minDocID;
this.maxDocID = maxDocID;
}

@Override
public DocIdSetIterator iterator() {
return DocIdSetIterator.range(minDocID, maxDocID);
}

@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED;
}
}
}
112 changes: 112 additions & 0 deletions lucene/core/src/java/org/apache/lucene/search/BulkScorer.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
package org.apache.lucene.search;

import java.io.IOException;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RoaringDocIdSet;

/**
* This class is used to score a range of documents at once, and is returned by {@link
Expand Down Expand Up @@ -78,4 +81,113 @@ public abstract int score(LeafCollector collector, Bits acceptDocs, int min, int

/** Same as {@link DocIdSetIterator#cost()} for bulk scorers. */
public abstract long cost();

/**
* Materializes all matching document IDs in {@code [0, maxDoc)} into a {@link DocIdSet} together
* with an exact match count, for use by {@link LRUQueryCache} (see {@link
* LRUQueryCache#cacheImpl}).
*
* <p>Implementations score from document 0 up to {@link DocIdSetIterator#NO_MORE_DOCS} with no
* {@code acceptDocs} filter.
*
* <p>The default representation is chosen from {@link #cost()} versus {@code maxDoc}: when {@code
* cost * 100 >= maxDoc} (estimated density at least 1%), a dense {@link BitDocIdSet} is used;
* otherwise a sparse {@link RoaringDocIdSet} is built.
*
* @param maxDoc one past the maximum document ID in the index (e.g. {@link
* org.apache.lucene.index.LeafReader#maxDoc()})
* @return the cached doc-id set and its cardinality
* @throws IOException if scoring fails
* @see LRUQueryCache.CacheAndCount
*/
public LRUQueryCache.CacheAndCount intoCacheAndCount(int maxDoc) throws IOException {
if (cost() * 100 >= maxDoc) {
// FixedBitSet is faster for dense sets and will enable the random-access
// optimization in ConjunctionDISI
return cacheIntoBitSet(maxDoc);
} else {
return cacheIntoRoaringDocIdSet(maxDoc);
}
}

private LRUQueryCache.CacheAndCount cacheIntoBitSet(int maxDoc) throws IOException {
final FixedBitSet bitSet = new FixedBitSet(maxDoc);
int[] count = new int[1];
score(
new LeafCollector() {

private int[] buffer;

@Override
public void setScorer(Scorable scorer) {}

@Override
public void collect(int doc) {
count[0]++;
bitSet.set(doc);
}

@Override
public void collectRange(int min, int max) {
count[0] += max - min;
bitSet.set(min, max);
}

@Override
public void collect(DocIdStream stream) {
if (buffer == null) {
buffer = new int[128];
}
for (int c = stream.intoArray(buffer); c != 0; c = stream.intoArray(buffer)) {
for (int i = 0; i < c; ++i) {
bitSet.set(buffer[i]);
}
count[0] += c;
}
}
},
null,
0,
DocIdSetIterator.NO_MORE_DOCS);
return new LRUQueryCache.CacheAndCount(new BitDocIdSet(bitSet, count[0]), count[0]);
}

private LRUQueryCache.CacheAndCount cacheIntoRoaringDocIdSet(int maxDoc) throws IOException {
RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(maxDoc);
score(
new LeafCollector() {

private int[] buffer = null;

@Override
public void setScorer(Scorable scorer) {}

@Override
public void collect(int doc) {
builder.add(doc);
}

@Override
public void collectRange(int min, int max) {
builder.add(min, max);
}

@Override
public void collect(DocIdStream stream) {
if (buffer == null) {
buffer = new int[128];
}
for (int c = stream.intoArray(buffer); c != 0; c = stream.intoArray(buffer)) {
for (int i = 0; i < c; ++i) {
builder.add(buffer[i]);
}
}
}
},
null,
0,
DocIdSetIterator.NO_MORE_DOCS);
RoaringDocIdSet cache = builder.build();
return new LRUQueryCache.CacheAndCount(cache, cache.cardinality());
}
}
90 changes: 1 addition & 89 deletions lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
Original file line number Diff line number Diff line change
Expand Up @@ -572,13 +572,7 @@ public Collection<Accountable> getChildResources() {
* and a {@link BitDocIdSet} over a {@link FixedBitSet} otherwise.
*/
protected CacheAndCount cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
if (scorer.cost() * 100 >= maxDoc) {
// FixedBitSet is faster for dense sets and will enable the random-access
// optimization in ConjunctionDISI
return cacheIntoBitSet(scorer, maxDoc);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this comment is still true now that we have intoBitSet() and docIDRunEnd()? I don't really like the idea of pushing things from LRUQueryCache onto BulkScorer - I feel like the way to fix this is to make RoaringBitSet more performant.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And how do you fix it if the range matches 2% of the index? having to build a full FixedBitSet for a dense range feels wrong.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that maybe we should always be using RoaringBitSet?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, you mean using RoaringBitSet more aggresively.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, exactly. I need to do some code archaeology but I think ConjunctionDISI used to do instanceof checks for its input filters but now just uses intoBitSet and docIdRunEnd, so if we can get the performance of RoaringBitSet up for those methods then we don't need to use FixedBitSet here at all. Which would save us a whole bunch of memory.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this would be a step in that direction: #16084

I like the idea of going away of those fixed bit set in caching as they are a source of humongous allocations which moving to RoaringBitSet would avoid. How can be test the performance here?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should be able to adjust luceneutil to use a query cache which should give us an idea of performance changes?

} else {
return cacheIntoRoaringDocIdSet(scorer, maxDoc);
}
return scorer.intoCacheAndCount(maxDoc);
}

/**
Expand All @@ -601,88 +595,6 @@ protected CacheAndCount tryPopulateCache(
return cached;
}

private static CacheAndCount cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
final FixedBitSet bitSet = new FixedBitSet(maxDoc);
int[] count = new int[1];
scorer.score(
new LeafCollector() {

private int[] buffer;

@Override
public void setScorer(Scorable scorer) {}

@Override
public void collect(int doc) {
count[0]++;
bitSet.set(doc);
}

@Override
public void collectRange(int min, int max) {
count[0] += max - min;
bitSet.set(min, max);
}

@Override
public void collect(DocIdStream stream) {
if (buffer == null) {
buffer = new int[128];
}
for (int c = stream.intoArray(buffer); c != 0; c = stream.intoArray(buffer)) {
for (int i = 0; i < c; ++i) {
bitSet.set(buffer[i]);
}
count[0] += c;
}
}
},
null,
0,
DocIdSetIterator.NO_MORE_DOCS);
return new CacheAndCount(new BitDocIdSet(bitSet, count[0]), count[0]);
}

private static CacheAndCount cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
throws IOException {
RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(maxDoc);
scorer.score(
new LeafCollector() {

private int[] buffer = null;

@Override
public void setScorer(Scorable scorer) {}

@Override
public void collect(int doc) {
builder.add(doc);
}

@Override
public void collectRange(int min, int max) {
builder.add(min, max);
}

@Override
public void collect(DocIdStream stream) {
if (buffer == null) {
buffer = new int[128];
}
for (int c = stream.intoArray(buffer); c != 0; c = stream.intoArray(buffer)) {
for (int i = 0; i < c; ++i) {
builder.add(buffer[i]);
}
}
}
},
null,
0,
DocIdSetIterator.NO_MORE_DOCS);
RoaringDocIdSet cache = builder.build();
return new CacheAndCount(cache, cache.cardinality());
}

/**
* Return the total number of times that a {@link Query} has been looked up in this {@link
* QueryCache}. Note that this number is incremented once per segment so running a cached query
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LRUQueryCache;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
Expand Down Expand Up @@ -141,4 +142,23 @@ private static void assertCollectedRanges(List<int[]> expected, List<int[]> actu
assertArrayEquals(expected.get(i), actual.get(i));
}
}

public void testIntoCacheAndCount() throws Exception {
int rangeMin = 20;
int rangeMaxExclusive = 80;
BulkScorer bs = newBulkScorer(rangeMin, rangeMaxExclusive);
int leafMaxDoc = 1000;

LRUQueryCache.CacheAndCount cached = bs.intoCacheAndCount(leafMaxDoc);
assertEquals(rangeMaxExclusive - rangeMin, cached.count());

DocIdSetIterator expected = DocIdSetIterator.range(rangeMin, rangeMaxExclusive);
DocIdSetIterator actual = cached.iterator();
for (int doc = expected.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = expected.nextDoc()) {
assertEquals(doc, actual.nextDoc());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, actual.nextDoc());
}
}
Loading