bump tantivy to v0.21 (#1043)
* bump tantivy to v0.21

* address clippy

* fix broken tests
oppiliappan committed Oct 13, 2023
1 parent d11c60d commit cf74866
Showing 8 changed files with 203 additions and 112 deletions.
234 changes: 151 additions & 83 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion server/bleep/Cargo.toml
```diff
@@ -33,7 +33,8 @@ harness = false
 [dependencies]
 
 # core
-tantivy = { version = "0.19.2", features = ["mmap"] }
+tantivy = { version = "0.21.0", features = ["mmap"] }
+tantivy-columnar = "0.2.0"
 tokio = { version = "1.32.0", features = ["macros", "process", "rt", "rt-multi-thread", "io-std", "io-util", "sync", "fs"] }
 futures = "0.3.28"
 rayon = "1.8.0"
```
18 changes: 13 additions & 5 deletions server/bleep/src/collector/bytes_filter.rs
```diff
@@ -1,7 +1,6 @@
 // a version of tantivy::collector::FilterCollector that works on byte fast fields
 
 use tantivy::collector::{Collector, SegmentCollector};
-use tantivy::fastfield::BytesFastFieldReader;
 use tantivy::schema::Field;
 use tantivy::{Score, SegmentReader, TantivyError};
 
@@ -58,7 +57,8 @@ where
             )));
         }
 
-        let fast_field_reader = segment_reader.fast_fields().bytes(self.field)?;
+        let field_name = schema.get_field_name(self.field);
+        let fast_field_reader = segment_reader.fast_fields().bytes(field_name)?.unwrap();
 
         let segment_collector = self
             .collector
@@ -87,7 +87,7 @@ pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
 where
     TPredicate: 'static,
 {
-    fast_field_reader: BytesFastFieldReader,
+    fast_field_reader: tantivy_columnar::BytesColumn,
    segment_collector: TSegmentCollector,
     predicate: TPredicate,
 }
@@ -101,8 +101,16 @@ where
     type Fruit = TSegmentCollector::Fruit;
 
     fn collect(&mut self, doc: u32, score: Score) {
-        let value = self.fast_field_reader.get_bytes(doc);
-        if (self.predicate)(value) {
+        let mut value = Vec::new();
+        self.fast_field_reader
+            .ords()
+            .values_for_doc(doc)
+            .for_each(|ord| {
+                self.fast_field_reader
+                    .ord_to_bytes(ord, &mut value)
+                    .unwrap();
+            });
+        if (self.predicate)(&value) {
             self.segment_collector.collect(doc, score)
         }
     }
```
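The new `collect` body above is the heart of the migration: tantivy 0.21 drops `BytesFastFieldReader::get_bytes`, so byte values are rebuilt from the ordinals a document stores in its `BytesColumn`. The same loop reappears in the frequency collector and ranking scorer below; a small helper along these lines would capture it (a sketch, not part of this commit — the function name is mine):

```rust
use tantivy_columnar::BytesColumn;

/// Gather the bytes stored for `doc` in a bytes fast-field column by
/// resolving each of the document's ordinals against the column dictionary.
fn bytes_for_doc(column: &BytesColumn, doc: u32) -> Vec<u8> {
    let mut bytes = Vec::new();
    column.ords().values_for_doc(doc).for_each(|ord| {
        // Mirrors the commit: a failed dictionary lookup is treated as a bug.
        column.ord_to_bytes(ord, &mut bytes).unwrap();
    });
    bytes
}
```

With such a helper, `collect` would reduce to `if (self.predicate)(&bytes_for_doc(&self.fast_field_reader, doc)) { ... }`.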
17 changes: 9 additions & 8 deletions server/bleep/src/collector/frequency.rs
```diff
@@ -2,10 +2,10 @@ use std::collections::HashMap;
 
 use tantivy::{
     collector::{Collector, SegmentCollector},
-    fastfield::BytesFastFieldReader,
     schema::Field,
     Score, SegmentReader,
 };
+use tantivy_columnar::BytesColumn;
 
 pub struct FrequencyCollector(pub Field);
 
@@ -19,7 +19,8 @@ impl Collector for FrequencyCollector {
         _segment_local_id: u32,
         segment_reader: &SegmentReader,
     ) -> tantivy::Result<FrequencySegmentCollector> {
-        let reader = segment_reader.fast_fields().bytes(self.0)?;
+        let field_name = segment_reader.schema().get_field_name(self.0);
+        let reader = segment_reader.fast_fields().bytes(field_name)?.unwrap();
         Ok(FrequencySegmentCollector {
             reader,
             freqs: HashMap::new(),
@@ -43,19 +44,19 @@ impl Collector for FrequencyCollector {
 }
 
 pub struct FrequencySegmentCollector {
-    reader: BytesFastFieldReader,
+    reader: BytesColumn,
     freqs: HashMap<Vec<u8>, usize>,
 }
 
 impl SegmentCollector for FrequencySegmentCollector {
     type Fruit = HashMap<Vec<u8>, usize>;
 
     fn collect(&mut self, doc: u32, _score: Score) {
-        let k = self.reader.get_bytes(doc);
-        self.freqs
-            .entry(k.to_owned())
-            .and_modify(|v| *v += 1)
-            .or_insert(1);
+        let mut k = Vec::new();
+        self.reader.ords().values_for_doc(doc).for_each(|ord| {
+            self.reader.ord_to_bytes(ord, &mut k).unwrap();
+        });
+        self.freqs.entry(k).and_modify(|v| *v += 1).or_insert(1);
     }
 
     fn harvest(self) -> <Self as SegmentCollector>::Fruit {
```
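The other recurring change is visible in `for_segment`: fast-field accessors are now keyed by the field's name rather than its `Field` handle, and `bytes()` returns an `Option` because a segment may not contain the column. A sketch of that lookup which surfaces a missing column as an error instead of unwrapping (helper name and error message are illustrative):

```rust
use tantivy::{schema::Field, SegmentReader, TantivyError};
use tantivy_columnar::BytesColumn;

/// Translate a `Field` handle into its name and open the bytes column for it,
/// reporting a missing column as a schema error rather than panicking.
fn open_bytes_column(
    segment_reader: &SegmentReader,
    field: Field,
) -> tantivy::Result<BytesColumn> {
    let field_name = segment_reader.schema().get_field_name(field);
    segment_reader
        .fast_fields()
        .bytes(field_name)?
        .ok_or_else(|| {
            TantivyError::SchemaError(format!("no bytes fast field for {field_name}"))
        })
}
```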
2 changes: 1 addition & 1 deletion server/bleep/src/indexes.rs
```diff
@@ -238,7 +238,7 @@ impl<T: Indexable> Indexer<T> {
         index.set_multithread_executor(threads)?;
         index
             .tokenizers()
-            .register("default", NgramTokenizer::new(1, 3, false));
+            .register("default", NgramTokenizer::new(1, 3, false).unwrap());
 
         Ok(index)
     }
```
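`NgramTokenizer::new` became fallible (it validates the gram bounds), hence the added `.unwrap()`. Assuming its error is a `TantivyError`, a `tantivy::Result`-returning caller could propagate it instead; a minimal sketch, not from the commit:

```rust
use tantivy::{tokenizer::NgramTokenizer, Index};

/// Register the default 1..=3 ngram tokenizer, propagating the constructor
/// error instead of unwrapping.
fn register_default_tokenizer(index: &Index) -> tantivy::Result<()> {
    index
        .tokenizers()
        .register("default", NgramTokenizer::new(1, 3, false)?);
    Ok(())
}
```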
4 changes: 4 additions & 0 deletions server/bleep/src/indexes/schema.rs
```diff
@@ -132,6 +132,10 @@ impl File {
             histogram: Arc::new(Histogram::builder().build().unwrap().into()),
         }
     }
+
+    pub fn schema(&self) -> Schema {
+        self.schema.clone()
+    }
 }
 
 impl Default for File {
```
4 changes: 2 additions & 2 deletions server/bleep/src/query/compiler.rs
```diff
@@ -99,7 +99,7 @@ impl Compiler {
 
         let field_query = match extraction {
             Extraction::Literal(Literal::Plain(text)) => {
-                let tokenizer = index
+                let mut tokenizer = index
                     .tokenizer_for_field(*field)
                     .context("field is missing tokenizer")?;
 
@@ -376,7 +376,7 @@ mod tests {
             let (occur, term) = &subquery.clauses()[0];
             let term = term.downcast_ref::<TermQuery>().unwrap();
             assert_eq!(*occur, Occur::Should);
-            assert_eq!(term.term().as_str().unwrap(), expected);
+            assert_eq!(term.term().value().as_str().unwrap(), expected);
         }
     }
 }
```
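Both edits here track signature changes rather than behaviour: `TextAnalyzer::token_stream` now takes `&mut self` (hence `let mut tokenizer`), and a term's raw value is reached through `Term::value()`. A minimal tokenization sketch under those assumptions, not taken from the commit:

```rust
use tantivy::tokenizer::TextAnalyzer;

/// Split a query literal into its token texts. The analyzer binding must be
/// mutable because `token_stream` takes `&mut self` in tantivy 0.21.
fn tokenize(mut analyzer: TextAnalyzer, text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut stream = analyzer.token_stream(text);
    while stream.advance() {
        tokens.push(stream.token().text.clone());
    }
    tokens
}
```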
33 changes: 21 additions & 12 deletions server/bleep/src/query/ranking.rs
```diff
@@ -1,32 +1,37 @@
-use std::{sync::Arc, time::SystemTime};
+use std::time::SystemTime;
 
 use tantivy::{
     collector::{ScoreSegmentTweaker, ScoreTweaker},
-    fastfield::{BytesFastFieldReader, Column},
+    fastfield::Column,
     DocId, Score,
 };
+use tantivy_columnar::{column_values::ColumnValues, BytesColumn};
 
 use crate::indexes::file::File;
 
 pub struct DocumentTweaker(pub File);
 pub struct SegmentScorer {
-    line_length: Arc<dyn Column<f64>>,
-    lang: BytesFastFieldReader,
-    last_commit: Arc<dyn Column<u64>>,
+    line_length: Column<f64>,
+    lang: BytesColumn,
+    last_commit: Column<u64>,
 }
 
 impl ScoreSegmentTweaker<Score> for SegmentScorer {
     fn score(&mut self, doc: DocId, mut score: Score) -> Score {
         // * 1000 if it's a language we understand
-        score *= 1.0 + self.lang.num_bytes(doc).min(1) as f32 * 999.0;
+        let mut bytes = Vec::new();
+        self.lang.ords().values_for_doc(doc).for_each(|ord| {
+            self.lang.ord_to_bytes(ord, &mut bytes).unwrap();
+        });
+        score *= 1.0 + bytes.len().min(1) as f32 * 999.0;
 
         // Penalty for lines that are too long
-        score /= self.line_length.get_val(doc).clamp(20.0, 1000.0) as f32;
+        score /= self.line_length.values.get_val(doc).clamp(20.0, 1000.0) as f32;
         score /= SystemTime::now()
             .duration_since(SystemTime::UNIX_EPOCH)
             .unwrap()
             .as_secs()
-            .saturating_sub(self.last_commit.get_val(doc))
+            .saturating_sub(self.last_commit.values.get_val(doc))
             .min(5_000_000) as f32;
 
         score
@@ -40,13 +45,17 @@ impl ScoreTweaker<Score> for DocumentTweaker {
         &self,
         segment_reader: &tantivy::SegmentReader,
     ) -> tantivy::Result<Self::Child> {
-        let Self(schema) = self;
+        let Self(file) = self;
+        let schema = file.schema();
+        let avg_line_length_field = schema.get_field_name(file.avg_line_length);
+        let lang_field = schema.get_field_name(file.lang);
+        let last_commit_unix_seconds_field = schema.get_field_name(file.last_commit_unix_seconds);
         Ok(SegmentScorer {
-            line_length: segment_reader.fast_fields().f64(schema.avg_line_length)?,
-            lang: segment_reader.fast_fields().bytes(schema.lang)?,
+            line_length: segment_reader.fast_fields().f64(avg_line_length_field)?,
+            lang: segment_reader.fast_fields().bytes(lang_field)?.unwrap(),
             last_commit: segment_reader
                 .fast_fields()
-                .u64(schema.last_commit_unix_seconds)?,
+                .u64(last_commit_unix_seconds_field)?,
         })
     }
 }
```
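Numeric fast fields change shape as well: instead of `Arc<dyn Column<T>>` queried with `get_val` directly, the accessor returns a `Column<T>` read through its `values` handle, with the `ColumnValues` trait in scope. A short sketch of one such read (the field name string is an assumption for illustration, not necessarily bleep's schema name):

```rust
use tantivy::{fastfield::Column, DocId, SegmentReader};
use tantivy_columnar::column_values::ColumnValues;

/// Read a per-document numeric fast-field value with the tantivy 0.21 API:
/// columns are looked up by name and read through `column.values`.
fn line_length_for(segment_reader: &SegmentReader, doc: DocId) -> tantivy::Result<f64> {
    // "avg_line_length" is an assumed field name for illustration.
    let column: Column<f64> = segment_reader.fast_fields().f64("avg_line_length")?;
    Ok(column.values.get_val(doc))
}
```

The scorer above then clamps this value to 20.0..=1000.0 before using it as a length penalty.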
