Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support inverted index for sparse vector #517

Merged
merged 21 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 17 additions & 3 deletions crates/base/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,11 @@ pub struct IndexOptions {

impl IndexOptions {
fn validate_self(&self) -> Result<(), ValidationError> {
match (self.vector.v, &self.indexing) {
(VectorKind::Vecf32, _) => Ok(()),
(VectorKind::Vecf16, _) => Ok(()),
match (self.vector.v, self.vector.d, &self.indexing) {
(VectorKind::Vecf32, _, _) => Ok(()),
(VectorKind::Vecf16, _, _) => Ok(()),
(
_,
_,
IndexingOptions::Flat(FlatIndexingOptions {
quantization: QuantizationOptions::Trivial(_),
Expand All @@ -123,6 +124,7 @@ impl IndexOptions {
..
}),
) => Ok(()),
(VectorKind::SVecf32, DistanceKind::Dot, IndexingOptions::InvertedSparse(_)) => Ok(()),
_ => Err(ValidationError::new("not valid index options")),
}
}
Expand Down Expand Up @@ -259,6 +261,7 @@ pub enum IndexingOptions {
Flat(FlatIndexingOptions),
Ivf(IvfIndexingOptions),
Hnsw(HnswIndexingOptions),
InvertedSparse(InvertedSparseIndexingOptions),
}

impl IndexingOptions {
Expand Down Expand Up @@ -294,10 +297,21 @@ impl Validate for IndexingOptions {
Self::Flat(x) => x.validate(),
Self::Ivf(x) => x.validate(),
Self::Hnsw(x) => x.validate(),
Self::InvertedSparse(_) => Ok(()),
}
}
}

#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
#[serde(deny_unknown_fields)]
pub struct InvertedSparseIndexingOptions {}

impl Default for InvertedSparseIndexingOptions {
fn default() -> Self {
Self {}
}
}

#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
#[serde(deny_unknown_fields)]
pub struct FlatIndexingOptions {
Expand Down
1 change: 1 addition & 0 deletions crates/index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ storage = { path = "../storage" }
# algorithms
flat = { path = "../flat" }
hnsw = { path = "../hnsw" }
inverted = { path = "../inverted" }
kemingy marked this conversation as resolved.
Show resolved Hide resolved
ivf = { path = "../ivf" }

[lints]
Expand Down
10 changes: 10 additions & 0 deletions crates/index/src/indexing/sealed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ use base::operator::*;
use base::search::*;
use flat::Flat;
use hnsw::Hnsw;
use inverted::InvertedSparse;
use ivf::Ivf;
use std::path::Path;

pub enum SealedIndexing<O: Op> {
Flat(Flat<O>),
Ivf(Ivf<O>),
Hnsw(Hnsw<O>),
InvertedSparse(InvertedSparse<O>),
}

impl<O: Op> SealedIndexing<O> {
Expand All @@ -23,6 +25,9 @@ impl<O: Op> SealedIndexing<O> {
IndexingOptions::Flat(_) => Self::Flat(Flat::create(path, options, source)),
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::create(path, options, source)),
IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::create(path, options, source)),
IndexingOptions::InvertedSparse(_) => {
Self::InvertedSparse(InvertedSparse::create(path, options, source))
}
}
}

Expand All @@ -31,6 +36,7 @@ impl<O: Op> SealedIndexing<O> {
IndexingOptions::Flat(_) => Self::Flat(Flat::open(path)),
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::open(path)),
IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::open(path)),
IndexingOptions::InvertedSparse(_) => Self::InvertedSparse(InvertedSparse::open(path)),
}
}

Expand All @@ -43,6 +49,7 @@ impl<O: Op> SealedIndexing<O> {
SealedIndexing::Flat(x) => x.vbase(vector, opts),
SealedIndexing::Ivf(x) => x.vbase(vector, opts),
SealedIndexing::Hnsw(x) => x.vbase(vector, opts),
SealedIndexing::InvertedSparse(x) => x.vbase(vector, opts),
}
}

Expand All @@ -51,6 +58,7 @@ impl<O: Op> SealedIndexing<O> {
SealedIndexing::Flat(x) => x.len(),
SealedIndexing::Ivf(x) => x.len(),
SealedIndexing::Hnsw(x) => x.len(),
SealedIndexing::InvertedSparse(x) => x.len(),
}
}

Expand All @@ -59,6 +67,7 @@ impl<O: Op> SealedIndexing<O> {
SealedIndexing::Flat(x) => x.vector(i),
SealedIndexing::Ivf(x) => x.vector(i),
SealedIndexing::Hnsw(x) => x.vector(i),
SealedIndexing::InvertedSparse(x) => x.vector(i),
}
}

Expand All @@ -67,6 +76,7 @@ impl<O: Op> SealedIndexing<O> {
SealedIndexing::Flat(x) => x.payload(i),
SealedIndexing::Ivf(x) => x.payload(i),
SealedIndexing::Hnsw(x) => x.payload(i),
SealedIndexing::InvertedSparse(x) => x.payload(i),
}
}
}
12 changes: 10 additions & 2 deletions crates/index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use common::dir_ops::sync_walk_from_dir;
use common::file_atomic::FileAtomic;
use crossbeam::atomic::AtomicCell;
use crossbeam::channel::Sender;
use inverted::operator::OperatorInvertedSparse;
use ivf::operator::OperatorIvf;
use parking_lot::Mutex;
use quantization::operator::OperatorQuantization;
Expand All @@ -41,9 +42,16 @@ use storage::OperatorStorage;
use thiserror::Error;
use validator::Validate;

pub trait Op: Operator + OperatorQuantization + OperatorStorage + OperatorIvf {}
pub trait Op:
Operator + OperatorQuantization + OperatorStorage + OperatorIvf + OperatorInvertedSparse
{
}

impl<T: Operator + OperatorQuantization + OperatorStorage + OperatorIvf> Op for T {}
impl<
T: Operator + OperatorQuantization + OperatorStorage + OperatorIvf + OperatorInvertedSparse,
> Op for T
{
}

#[derive(Debug, Error)]
#[error("The index view is outdated.")]
Expand Down
1 change: 1 addition & 0 deletions crates/index/src/segment/sealed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ impl<O: Op> SealedSegment<O> {
SealedIndexing::Flat(x) => x,
SealedIndexing::Ivf(x) => x,
SealedIndexing::Hnsw(x) => x,
SealedIndexing::InvertedSparse(x) => x,
}
}
}
Expand Down
13 changes: 13 additions & 0 deletions crates/inverted/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "inverted"
version.workspace = true
edition.workspace = true

[dependencies]
base = { path = "../base" }
common = { path = "../common" }
quantization = { path = "../quantization" }
storage = { path = "../storage" }

[lints]
workspace = true
162 changes: 162 additions & 0 deletions crates/inverted/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#![allow(clippy::len_without_is_empty)]

pub mod operator;

use self::operator::OperatorInvertedSparse;
use base::index::{IndexOptions, SearchOptions};
use base::operator::Borrowed;
use base::scalar::{ScalarLike, F32};
use base::search::{Collection, Element, Payload, Source, Vectors};
use common::dir_ops::sync_dir;
use common::json::Json;
use common::mmap_array::MmapArray;
use common::remap::RemappedCollection;
use storage::Storage;

use std::collections::{BTreeMap, BinaryHeap};
use std::fs::create_dir;
use std::path::Path;

const ZERO: F32 = F32(0.0);

#[allow(dead_code)]
pub struct InvertedSparse<O: OperatorInvertedSparse> {
storage: O::Storage,
payloads: MmapArray<Payload>,
indexes: Json<Vec<u32>>,
offsets: Json<Vec<u32>>,
scores: Json<Vec<F32>>,
}

impl<O: OperatorInvertedSparse> InvertedSparse<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}

pub fn open(path: impl AsRef<Path>) -> Self {
open(path)
}

pub fn vbase<'a>(
&'a self,
vector: Borrowed<'a, O>,
_: &'a SearchOptions,
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
let mut doc_score = vec![ZERO; self.payloads.len()];
for (token, val) in O::to_index_vec(vector) {
let start = self.offsets[token as usize];
let end = self.offsets[token as usize + 1];
for i in (start as usize)..(end as usize) {
doc_score[self.indexes[i] as usize] += self.scores[i] * val;
}
}
let mut candidates: BinaryHeap<(F32, Payload)> = doc_score
.iter()
.enumerate()
.filter(|&(_, score)| *score > ZERO)
kemingy marked this conversation as resolved.
Show resolved Hide resolved
.map(|(i, score)| (*score, self.payload(i as u32)))
.collect::<Vec<_>>()
.into();

(
Vec::new(),
Box::new(std::iter::from_fn(move || {
candidates.pop().map(|(score, payload)| Element {
distance: -score,
payload,
})
})),
)
}

pub fn len(&self) -> u32 {
self.storage.len()
}

pub fn vector(&self, i: u32) -> Borrowed<'_, O> {
self.storage.vector(i)
}

pub fn payload(&self, i: u32) -> Payload {
self.payloads[i as usize]
}
}

fn from_nothing<O: OperatorInvertedSparse>(
path: impl AsRef<Path>,
_: IndexOptions,
collection: &impl Collection<O>,
) -> InvertedSparse<O> {
create_dir(path.as_ref()).expect("failed to create path for inverted sparse index");

let mut token_collection = BTreeMap::new();
kemingy marked this conversation as resolved.
Show resolved Hide resolved
for i in 0..collection.len() {
for (token, score) in O::to_index_vec(collection.vector(i)) {
token_collection
.entry(token)
.or_insert_with(Vec::new)
.push((i, score.to_f()));
}
}
let (indexes, offsets, scores) = build_compressed_matrix(token_collection);

let storage = O::Storage::create(path.as_ref().join("storage"), collection);
let payloads = MmapArray::create(
path.as_ref().join("payloads"),
(0..collection.len()).map(|i| collection.payload(i)),
);
let json_index = Json::create(path.as_ref().join("indexes"), indexes);
let json_offset = Json::create(path.as_ref().join("offsets"), offsets);
let json_score = Json::create(path.as_ref().join("scores"), scores);
sync_dir(path);
kemingy marked this conversation as resolved.
Show resolved Hide resolved
InvertedSparse {
storage,
payloads,
indexes: json_index,
offsets: json_offset,
scores: json_score,
}
}

fn open<O: OperatorInvertedSparse>(path: impl AsRef<Path>) -> InvertedSparse<O> {
let storage = O::Storage::open(path.as_ref().join("storage"));
let payloads = MmapArray::open(path.as_ref().join("payloads"));
let offsets = Json::open(path.as_ref().join("offsets"));
let indexes = Json::open(path.as_ref().join("indexes"));
let scores = Json::open(path.as_ref().join("scores"));
InvertedSparse {
storage,
payloads,
indexes,
offsets,
scores,
}
}

fn build_compressed_matrix(
token_collection: BTreeMap<u32, Vec<(u32, F32)>>,
) -> (Vec<u32>, Vec<u32>, Vec<F32>) {
let mut indexes = Vec::new();
let mut offsets = Vec::new();
let mut scores = Vec::new();

let mut i = 0;
let mut last: u32 = 0;
offsets.push(0);
for (token, id_scores) in token_collection.iter() {
while *token != i {
offsets.push(last);
i += 1;
}
for (id, score) in id_scores {
indexes.push(*id);
scores.push(*score);
}
last += id_scores.len() as u32;
offsets.push(last);
i += 1;
}

(indexes, offsets, scores)
}
Loading
Loading