Skip to content

Commit

Permalink
Normalize vectors in advance for cosine
Browse files Browse the repository at this point in the history
  • Loading branch information
kartik4949 authored and blythed committed Apr 8, 2024
1 parent b667e11 commit 7098861
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Run Tests from within the container
- Add model dict output indexing in graph
- Make lance upsert for added vectors
- Normalize vectors in advance in the in-memory vector database for the cosine measure.

#### New Features & Functionality
- Add nightly image for pre-release testing in the cloud environment
Expand Down
5 changes: 2 additions & 3 deletions superduperdb/vector_search/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,9 @@ def cosine(x, y):
'''
Cosine similarity function for vector search
'''
x = x.astype(float)
y = y.astype(float)
x = x / numpy.linalg.norm(x, axis=1)[:, None]
y = y / numpy.linalg.norm(y, axis=1)[:, None]
# y holds all vectors in the vector database, which are
# expected to have been normalized in advance.
return dot(x, y)


Expand Down
16 changes: 11 additions & 5 deletions superduperdb/vector_search/in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def __init__(
self._cache: t.Sequence[VectorItem] = []
self._CACHE_SIZE = 10000

self.measure = measure
if isinstance(measure, str):
self.measure = measures[measure]

if h is not None:
assert index is not None
self._setup(h, index)
Expand All @@ -39,17 +43,19 @@ def __init__(
self.index = None
self.lookup = None

self.measure = measure
if isinstance(measure, str):
self.measure = measures[measure]

self.identifier = identifier

def __len__(self):
    """Return the size of the first dimension of the stored array ``self.h``."""
    row_count = self.h.shape[0]
    return row_count

def _setup(self, h, index):
    """Store the vectors ``h`` and build the id-to-row lookup.

    :param h: Vectors to store; converted with ``numpy.array`` when not
        already a ``numpy.ndarray``.
    :param index: Sequence of identifiers, one per row of ``h``.

    For the cosine measure the rows are divided by their norm here, so
    normalization happens once, in advance, rather than at query time.
    """
    h = h if isinstance(h, numpy.ndarray) else numpy.array(h)

    # ``__init__`` swaps a string measure for the callable looked up in
    # ``measures`` before calling this method, so comparing ``self.measure``
    # directly with 'cosine' would never match in that case. Compare by
    # name instead, which accepts both the string and the callable.
    # NOTE(review): assumes the callable registered for cosine is the
    # function named ``cosine`` -- confirm against ``measures``.
    measure_name = getattr(self.measure, '__name__', self.measure)
    if measure_name == 'cosine':
        # Normalization is required for cosine, hence preparing
        # all vectors in advance.
        # NOTE(review): an all-zero row has norm 0 and would produce NaN.
        h = h / numpy.linalg.norm(h, axis=1)[:, None]

    self.h = h
    self.index = index
    # Maps each identifier to its row position in ``self.h``.
    self.lookup = dict(zip(index, range(len(index))))

Expand Down

0 comments on commit 7098861

Please sign in to comment.