Models refactoring, add stella, BGE, Linq #1225

Open · wants to merge 2 commits into base: main
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -105,6 +105,7 @@ def _evaluate_subset(
evaluator = BitextMiningEvaluator(
data_split,
task_name=self.metadata.name,
task_type=self.metadata.type,
pair_columns=pairs, # type: ignore
**kwargs,
)
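The same one-line change recurs across the abstask files below: each task now forwards `task_type=self.metadata.type` alongside `task_name` to its evaluator, which passes it down to the model's encode call. A minimal sketch of a model wrapper that accepts the extra keyword; the class name, prompt table, and base model are illustrative assumptions, not part of this PR:

```python
# Illustrative sketch only: a wrapper whose encode() tolerates the task_type
# keyword that evaluators now forward. InstructedEncoder and PROMPT_BY_TASK_TYPE
# are hypothetical names; only the task_type keyword itself comes from this PR.
from __future__ import annotations

import numpy as np
from sentence_transformers import SentenceTransformer

PROMPT_BY_TASK_TYPE = {
    "Classification": "Classify the following text: ",
    "Retrieval": "Represent this sentence for retrieval: ",
}


class InstructedEncoder:
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def encode(
        self,
        sentences: list[str],
        *,
        prompt_name: str | None = None,
        task_type: str | None = None,
        **kwargs,
    ) -> np.ndarray:
        # Use the task type to pick an instruction prefix when one is defined;
        # otherwise encode the raw sentences unchanged.
        prefix = PROMPT_BY_TASK_TYPE.get(task_type, "")
        return self.model.encode([prefix + s for s in sentences], **kwargs)
```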
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClassification.py
@@ -149,6 +149,7 @@ def _evaluate_subset(
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
task_type=self.metadata.type,
encode_kwargs=encode_kwargs,
**params,
)
@@ -159,6 +160,7 @@ def _evaluate_subset(
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
task_type=self.metadata.type,
encode_kwargs=encode_kwargs,
**params,
)
@@ -169,6 +171,7 @@ def _evaluate_subset(
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
task_type=self.metadata.type,
encode_kwargs=encode_kwargs,
**params,
)
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskClustering.py
@@ -64,6 +64,7 @@ def _evaluate_subset(
cluster_set["sentences"], # type: ignore
cluster_set["labels"], # type: ignore
task_name=self.metadata.name,
task_type=self.metadata.type,
**kwargs,
)
metrics = evaluator(model, encode_kwargs=encode_kwargs)
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -178,6 +178,7 @@ def _evaluate_subset(
downsampled_dataset["sentences"], # type: ignore
model=model,
prompt_name=self.metadata.name,
task_type=self.metadata.type,
**encode_kwargs,
)

1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskInstructionRetrieval.py
@@ -463,6 +463,7 @@ def evaluate(
retriever = InstructionRetrievalEvaluator(
retriever=model,
task_name=self.metadata.name,
task_type=self.metadata.type,
encode_kwargs=encode_kwargs,
**kwargs,
)
7 changes: 6 additions & 1 deletion mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -166,6 +166,7 @@ def _evaluate_subset(
unique_train_sentences,
model=model,
prompt_name=self.metadata.name,
task_type=self.metadata.type,
**encode_kwargs,
)
unique_train_embeddings = dict(
@@ -184,7 +185,11 @@ def _evaluate_subset(
logger.warning("Couldn't subsample, continuing with the entire test set.")

X_test = model_encode(
test_text, model=model, prompt_name=self.metadata.name, **encode_kwargs
test_text,
model=model,
prompt_name=self.metadata.name,
task_type=self.metadata.type,
**encode_kwargs,
)
for i_experiment, sample_indices in enumerate(train_samples):
logger.info(
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskPairClassification.py
@@ -65,6 +65,7 @@ def _evaluate_subset(
data_split["sentence2"],
data_split["labels"],
task_name=self.metadata.name,
task_type=self.metadata.type,
**kwargs,
)
scores = evaluator.compute_metrics(model, encode_kwargs=encode_kwargs)
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskReranking.py
@@ -54,6 +54,7 @@ def _evaluate_subset(
evaluator = RerankingEvaluator(
data_split,
task_name=self.metadata.name,
task_type=self.metadata.type,
encode_kwargs=encode_kwargs,
**kwargs,
)
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskRetrieval.py
@@ -274,6 +274,7 @@ def evaluate(
retriever = RetrievalEvaluator(
retriever=model,
task_name=self.metadata.name,
task_type=self.metadata.type,
encode_kwargs=encode_kwargs,
**kwargs,
)
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskSTS.py
@@ -58,6 +58,7 @@ def normalize(x):
data_split["sentence2"],
normalized_scores,
task_name=self.metadata.name,
task_type=self.metadata.type,
**kwargs,
)
scores = evaluator(model, encode_kwargs=encode_kwargs)
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskSummarization.py
@@ -68,6 +68,7 @@ def _evaluate_subset(
texts=data_split["text"],
gold_scores=normalized_scores,
task_name=self.metadata.name,
task_type=self.metadata.type,
**kwargs,
)
scores = evaluator(model, encode_kwargs=encode_kwargs)
3 changes: 3 additions & 0 deletions mteb/evaluation/evaluators/BitextMiningEvaluator.py
@@ -25,6 +25,7 @@ def __init__(
self,
sentences: Dataset,
task_name: str | None = None,
task_type: str | None = None,
pair_columns: list[tuple[str, str]] = DEFAULT_PAIR,
**kwargs,
):
@@ -38,6 +39,7 @@ def __init__(
else sentences["gold"]
)
self.task_name = task_name
self.task_type = task_type

def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
scores = self.compute_metrics(model, encode_kwargs=encode_kwargs)
@@ -56,6 +58,7 @@ def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any] = {}):
self.sentences[sub],
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**encode_kwargs,
)

18 changes: 13 additions & 5 deletions mteb/evaluation/evaluators/ClassificationEvaluator.py
@@ -6,11 +6,7 @@
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
accuracy_score,
average_precision_score,
f1_score,
)
from sklearn.metrics import accuracy_score, average_precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from torch import Tensor

@@ -34,6 +30,7 @@ def __init__(
sentences_test,
y_test,
task_name: str | None = None,
task_type: str | None = None,
k: int = 1,
encode_kwargs: dict[str, Any] = {},
limit: int | None = None,
@@ -51,6 +48,7 @@ def __init__(
self.y_test = y_test

self.task_name = task_name
self.task_type = task_type
self.encode_kwargs = encode_kwargs

if "batch_size" not in self.encode_kwargs:
@@ -67,13 +65,15 @@ def __call__(self, model, test_cache=None):
self.sentences_train,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
if test_cache is None:
X_test = model_encode(
self.sentences_test,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
test_cache = X_test
@@ -109,6 +109,7 @@ def __init__(
sentences_test,
y_test,
task_name: str,
task_type: str,
k: int = 1,
encode_kwargs: dict[str, Any] = {},
limit: int | None = None,
@@ -127,6 +128,7 @@ def __init__(
self.y_test = y_test

self.task_name = task_name
self.task_type = task_type
self.encode_kwargs = encode_kwargs

if "batch_size" not in self.encode_kwargs:
@@ -143,6 +145,7 @@ def __call__(self, model: Encoder, test_cache=None):
self.sentences_train,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)

@@ -151,6 +154,7 @@ def __call__(self, model: Encoder, test_cache=None):
self.sentences_test,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
test_cache = X_test
@@ -261,6 +265,7 @@ def __init__(
sentences_test,
y_test,
task_name: str,
task_type: str,
max_iter: int = 100,
encode_kwargs: dict[str, Any] = {},
limit: int | None = None,
@@ -284,6 +289,7 @@ def __init__(

self.max_iter = max_iter
self.task_name = task_name
self.task_type = task_type

def __call__(self, model, test_cache=None):
scores = {}
@@ -297,13 +303,15 @@ def __call__(self, model, test_cache=None):
self.sentences_train,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
if test_cache is None:
X_test = model_encode(
self.sentences_test,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
test_cache = X_test
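In the hunks above, `task_type` becomes a parameter next to `task_name` for the kNN and logistic-regression classification evaluators. A hedged usage sketch follows; the `logRegClassificationEvaluator` class name, the leading train-split arguments, and the `(scores, test_cache)` return shape are not visible in these hunks and are assumed, and the data, task name, and toy encoder are placeholders:

```python
# Hedged usage sketch, not part of this PR. Only the task_name/task_type
# parameters appear in the diff above; everything else is assumed.
from mteb.evaluation.evaluators.ClassificationEvaluator import (
    logRegClassificationEvaluator,  # class name assumed from this module
)
from sentence_transformers import SentenceTransformer


class ToyEncoder:
    """Minimal encoder that tolerates the prompt_name/task_type keywords."""

    def __init__(self):
        self.model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder checkpoint

    def encode(self, sentences, *, prompt_name=None, task_type=None, **kwargs):
        return self.model.encode(sentences, **kwargs)


evaluator = logRegClassificationEvaluator(
    ["great phone", "awful battery"],      # sentences_train (placeholder data)
    [1, 0],                                # y_train
    ["decent camera"],                     # sentences_test
    [1],                                   # y_test
    task_name="ToyReviewClassification",   # placeholder task name
    task_type="Classification",            # forwarded down to the encode call
    max_iter=100,
    encode_kwargs={"batch_size": 32},
)

# The evaluator encodes train/test text and fits the classifier; the returned
# test_cache (shape assumed) lets repeated experiments reuse test embeddings.
scores, test_cache = evaluator(ToyEncoder())
```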
3 changes: 3 additions & 0 deletions mteb/evaluation/evaluators/ClusteringEvaluator.py
@@ -21,6 +21,7 @@ def __init__(
sentences,
labels,
task_name: str | None = None,
task_type: str | None = None,
clustering_batch_size: int = 500,
limit: int | None = None,
**kwargs,
@@ -33,6 +34,7 @@ def __init__(
self.labels = labels
self.clustering_batch_size = clustering_batch_size
self.task_name = task_name
self.task_type = task_type

def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
if "batch_size" not in encode_kwargs:
@@ -42,6 +44,7 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
self.sentences,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**encode_kwargs,
)

5 changes: 2 additions & 3 deletions mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py
@@ -2,9 +2,7 @@

import logging

from .RetrievalEvaluator import (
RetrievalEvaluator,
)
from .RetrievalEvaluator import RetrievalEvaluator

logger = logging.getLogger(__name__)

@@ -35,5 +33,6 @@ def __call__(
instructions=instructions,
request_qid=qid,
prompt_name=self.task_name,
task_type=self.task_type,
**kwargs,
)
3 changes: 3 additions & 0 deletions mteb/evaluation/evaluators/PairClassificationEvaluator.py
@@ -43,6 +43,7 @@ def __init__(
sentences2,
labels,
task_name: str | None = None,
task_type: str | None = None,
limit: int | None = None,
**kwargs,
):
@@ -55,6 +56,7 @@ def __init__(
self.sentences2 = sentences2
self.labels = labels
self.task_name = task_name
self.task_type = task_type

assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
@@ -94,6 +96,7 @@ def compute_metrics(
sentences,
model=model,
prompt_name=self.task_name,
task_type=self.task_type,
**encode_kwargs,
)
emb_dict = dict(zip(sentences, embeddings))
18 changes: 16 additions & 2 deletions mteb/evaluation/evaluators/RerankingEvaluator.py
@@ -34,6 +34,7 @@ def __init__(
self,
samples,
task_name: str | None = None,
task_type: str | None = None,
mrr_at_k: int = 10,
name: str = "",
similarity_fct=cos_sim,
@@ -53,6 +54,7 @@ def __init__(
self.similarity_fct = similarity_fct
self.use_batched_encoding = use_batched_encoding
self.task_name = task_name
self.task_type = task_type
self.k_values = k_values
self.evaluator_type = evaluator_type
self.encode_kwargs = encode_kwargs
@@ -104,6 +106,7 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode)
encode_queries_func(
[sample["query"] for sample in self.samples],
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
)
@@ -116,6 +119,7 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode)
all_query_flattened,
encode_queries_func,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
else:
@@ -210,6 +214,7 @@ def _encode_candidates_batched(
all_docs,
encode_corpus_func,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)

@@ -307,7 +312,10 @@ def _encode_candidates_miracl_batched(self, all_query_embs, encode_corpus_func):

all_docs_embs = np.asarray(
encode_corpus_func(
all_docs, prompt_name=self.task_name, **self.encode_kwargs
all_docs,
prompt_name=self.task_name,
task_type=self.task_type,
**self.encode_kwargs,
)
)

@@ -422,6 +430,7 @@ def _encode_unique_texts(
all_texts: list[str],
encode_fn: Callable,
prompt_name: str | None,
task_type: str | None,
**encode_kwargs: Any,
):
index_map, all_unique_texts, all_texts_indexes = {}, [], []
@@ -435,7 +444,12 @@
f"A total on {len(all_texts) - len(all_unique_texts)}/{len(all_texts)} duplicate texts were found during encoding. Only encoding unique text and duplicating embeddings across."
)
all_unique_texts_embs = np.asarray(
encode_fn(all_unique_texts, prompt_name=prompt_name, **encode_kwargs)
encode_fn(
all_unique_texts,
prompt_name=prompt_name,
task_type=task_type,
**encode_kwargs,
)
)
return all_unique_texts_embs[all_texts_indexes]

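The reranking evaluator now forwards `task_type` to `encode_queries` and `encode_corpus` as well, so models implementing the query/corpus interface need to accept the extra keyword. A hedged sketch of such a model; the class name, instruction prefixes, and checkpoint are illustrative assumptions, not part of this PR:

```python
# Illustrative sketch only: a query/corpus encoder that accepts the task_type
# keyword now forwarded by RerankingEvaluator. Only the keyword name comes
# from this PR; the rest is assumed for the example.
from sentence_transformers import SentenceTransformer


class QueryCorpusEncoder:
    def __init__(self, model_name: str = "intfloat/e5-small-v2"):  # placeholder checkpoint
        self.model = SentenceTransformer(model_name)

    def encode_queries(self, queries, *, prompt_name=None, task_type=None, **kwargs):
        # task_type could select a query instruction per task family
        # (Reranking, Retrieval, ...); here a fixed E5-style prefix is used.
        return self.model.encode([f"query: {q}" for q in queries], **kwargs)

    def encode_corpus(self, corpus, *, prompt_name=None, task_type=None, **kwargs):
        # Reranking candidates may arrive as plain strings or as dicts with a
        # "text" field; handle both before encoding.
        texts = [d["text"] if isinstance(d, dict) else d for d in corpus]
        return self.model.encode([f"passage: {t}" for t in texts], **kwargs)
```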