Use LLMs to answer difficult science questions
Inspired by the OpenBookQA dataset, this competition challenges participants to answer difficult science-based questions written by a Large Language Model.
- Type: NLP
- Recommended Model: BERT, LLAMA
- Evaluation Metrics: Mean Average Precision @ 3
- Baseline:
One row example:
prompt: What is the most popular explanation for the shower-curtain effect?
A: The pressure differential between the inside and outside of the shower
B: The decrease in velocity resulting in an increase in pressure
C: The movement of air across the outside surface of the shower curtain
D: The use of cold water
E: Bernoulli's principle
answer: E
test.csv - the test set. Your task is to predict the top three most probable answers given the prompt.
One row example:
prompt: What is the term used in astrophysics to describe light-matter interactions resulting in energy shif...
A: Blueshifting
B: Redshifting
C: Reddening
D: Whitening
E: Yellowing
- Utilized the Pre-trained BERT model on Hugging face.
- Finetune the BERT model.
- Generate probabilities for each option with AutoModelForMultipleChoice class.
- Manipulate the output for 3 options with highest prob.
- Structure the pipeline for switching different models.
- bert-base-cased model: CV 1.512489
- bert-base-multilingual-uncased model: CV 1.248739
# --- skeleton Requirement: can generate different predictions for different models in Hugging Face.
class LLM_prediction:
    """Fine-tune a Hugging Face multiple-choice model and emit MAP@3-style
    predictions: for each question, the top-3 option letters, space-separated,
    highest-scoring first.

    The pipeline is model-agnostic: pass any model name/path compatible with
    ``AutoModelForMultipleChoice`` to switch backbones.
    """

    def __init__(self, model_path, options='ABCDE'):
        """
        Args:
            model_path: Hugging Face model name or local checkpoint path.
            options: String of answer-option column letters; its length
                determines how many candidates each question has.
        """
        self.model_path = model_path
        self.options = options
        self.indices = list(range(len(options)))
        # Bidirectional letter <-> label-index maps used for training labels
        # and for decoding prediction indices back to option letters.
        self.option_to_index = {opt: idx for idx, opt in enumerate(options)}
        self.index_to_option = {idx: opt for idx, opt in enumerate(options)}

    def _get_tokenizer(self):
        """Load the tokenizer once and cache it on the instance.

        The original code called ``AutoTokenizer.from_pretrained`` inside
        ``pre_process_data``, i.e. once per dataset row during ``.map`` —
        this hoists that cost to a single load.
        """
        if not hasattr(self, 'tokenizer'):
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        return self.tokenizer

    def read_data(self, data_folder=None):
        """Read ``train.csv`` / ``test.csv`` from *data_folder*.

        Returns:
            (train_ds, test_df): a ``datasets.Dataset`` for training and a
            pandas DataFrame kept for later inference.
        """
        # training
        train_df = pd.read_csv(f"{data_folder}/train.csv")
        self.train_ds = Dataset.from_pandas(train_df)
        # testing
        self.test_df = pd.read_csv(f"{data_folder}/test.csv")
        return self.train_ds, self.test_df

    def pre_process_data(self, row):
        """Tokenize one row into ``len(self.options)`` (question, answer) pairs.

        The prompt is repeated once per option so the tokenizer yields one
        encoding per candidate answer, the layout expected by
        ``AutoModelForMultipleChoice``.
        """
        tokenizer = self._get_tokenizer()
        # Generalized from the hard-coded ``* 5`` to follow self.options.
        question = [row['prompt']] * len(self.options)
        answers = [row[option] for option in self.options]
        tokenized_row = tokenizer(question, answers, truncation=True)
        tokenized_row['label'] = self.option_to_index[row['answer']]
        return tokenized_row

    def nlp(self, output_model_dir='finetuned_bert'):
        """Fine-tune the multiple-choice model on ``self.train_ds``.

        NOTE(review): the train set doubles as the eval set (no held-out
        split), so epoch eval metrics reflect training fit, not generalization.

        Returns:
            The fitted ``transformers.Trainer`` (also stored on the instance).
        """
        model = AutoModelForMultipleChoice.from_pretrained(self.model_path)
        # Drop raw text columns after tokenization; derived from self.options
        # instead of a hard-coded A..E list.
        remove_cols = ['prompt', *self.options, 'answer']
        tokenized_train_ds = self.train_ds.map(
            self.pre_process_data, batched=False, remove_columns=remove_cols)
        training_args = TrainingArguments(
            output_dir=output_model_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=3,
            weight_decay=0.01,
            report_to='none',
        )
        self.trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train_ds,
            eval_dataset=tokenized_train_ds,
            tokenizer=self._get_tokenizer(),
            data_collator=DataCollatorForMultipleChoice(tokenizer=self._get_tokenizer()),
        )
        self.trainer.train()
        return self.trainer

    def predictions_to_map_output(self, predictions):
        """Convert an (n_rows, n_options) score array into 'X Y Z' strings.

        Args:
            predictions: 2-D array of per-option scores, one row per question.

        Returns:
            1-D numpy array of strings, each the top-3 option letters joined
            by spaces, highest score first.
        """
        # Negate so argsort orders scores descending.
        sorted_answer_indices = np.argsort(-predictions)
        top_answer_indices = sorted_answer_indices[:, :3]  # first three per row
        top_answers = np.vectorize(self.index_to_option.get)(top_answer_indices)
        return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

    def inference(self, assign_random_answer=True):
        """Predict top-3 answers for ``self.test_df`` with the trained trainer.

        A placeholder 'A' answer column is added so ``pre_process_data`` can
        build a label; the label is ignored at prediction time.

        Raises:
            ValueError: if *assign_random_answer* is False (only the
                placeholder-label path is implemented).
        """
        if not assign_random_answer:
            raise ValueError('Another inference path has not been developed yet.')
        self.test_df['answer'] = 'A'
        self.test_ds = Dataset.from_pandas(self.test_df)
        tokenized_test_ds = self.test_ds.map(
            self.pre_process_data, batched=False,
            remove_columns=['prompt', *self.options, 'answer'])
        predictions = self.trainer.predict(tokenized_test_ds)
        return self.predictions_to_map_output(predictions.predictions)