Skip to content

Commit

Permalink
Make lance add upsert vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
kartik4949 authored and blythed committed Mar 21, 2024
1 parent 951ab66 commit 1723f6d
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
#### Changed defaults / behaviours
- Run Tests from within the container
- Add model dict output indexing in graph
- Make Lance `add` upsert vectors (merge-insert keyed on `id`)

#### New Features & Functionality
- Add compute_kwargs option for model
Expand Down
26 changes: 13 additions & 13 deletions superduperdb/vector_search/lance.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,37 +30,37 @@ def __init__(
):
self.dataset_path = os.path.join(CFG.lance_home, f'{identifier}.lance')
self.dimensions = dimensions
self._created = False
self.measure = measure
if h is not None:
self._create_or_append_to_dataset(h, index, mode='create')
if not os.path.exists(self.dataset_path):
os.makedirs(self.dataset_path, exist_ok=True)
self._create_or_append_to_dataset(h, index, mode='create')

@property
def dataset(self):
if not os.path.exists(self.dataset_path):
self._create_or_append_to_dataset([], [])
self._create_or_append_to_dataset([], [], mode='create')
return lance.dataset(self.dataset_path)

def __len__(self):
return self.dataset.count_rows()

def _create_or_append_to_dataset(self, vectors, ids, mode: str = 'create'):
if not self._created:
if not os.path.exists(self.dataset_path):
mode = 'create'
os.makedirs(self.dataset_path, exist_ok=True)
else:
self._created = True
mode = "append"
def _create_or_append_to_dataset(self, vectors, ids, mode: str = 'upsert'):
type = pa.list_(
pa.field('values', pa.float32(), nullable=False), self.dimensions
)
vectors = self.to_list(vectors)
_vecs = pa.array([v for v in vectors], type=type)
_ids = pa.array(ids, type=pa.string())
_table = pa.Table.from_arrays([_ids, _vecs], names=['id', 'vector'])
lance.write_dataset(_table, self.dataset_path, mode=mode)
self._created = True

if mode == 'upsert':
dataset = lance.dataset(self.dataset_path)
dataset.merge_insert(
"id"
).when_matched_update_all().when_not_matched_insert_all().execute(_table)
else:
lance.write_dataset(_table, self.dataset_path, mode=mode)

def add(self, items: t.Sequence[VectorItem]) -> None:
ids = [item.id for item in items]
Expand Down

0 comments on commit 1723f6d

Please sign in to comment.