Commit 0d2ec62
Update dependencies, update code, remove unused methods.
Riccorl committed Jan 30, 2021
1 parent b262c32 commit 0d2ec62
Showing 10 changed files with 35 additions and 69,922 deletions.
23 changes: 7 additions & 16 deletions scripts/bert_base_span.sh
@@ -1,26 +1,17 @@
 #!/bin/bash
-source /home/orlando/miniconda3/bin/activate allennlp
+source /Users/ric/mambaforge/bin/activate srl-mt
 
-HOME="/home/orlando"
-DATASET="$HOME/datasets/ontonotes/conll-formatted-ontonotes-verbatlas"
-PROJECT="$HOME/transformer-srl"
+#HOME="/home/orlando"
+DATASET="/Users/ric/Documents/ComputerScience/Projects/transformer-srl/data/conll2012_pb"
+PROJECT="/Users/ric/Documents/ComputerScience/Projects/transformer-srl"
 # local
 # DATASET="/mnt/d/Datasets/conll2012/conll-formatted-ontonotes-verbatlas-subset"
 # PROJECT="/mnt/c/Users/rikkw/Desktop/Ric/Projects/srl-bert-span"
 
 export SRL_TRAIN_DATA_PATH="$DATASET/data/train"
 export SRL_VALIDATION_DATA_PATH="$DATASET/data/development"
 
-CONFIG="$PROJECT/training_config/bert_base.jsonnet"
+CONFIG="$PROJECT/training_config/bert_base_span.jsonnet"
+MODEL_DIR="$PROJECT/models/bert_base_conll2012"
 
-free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i 1 | grep -Eo [0-9]+)
-
-echo "$free_mem MB"
-while [ "$free_mem" -lt 10000 ]; do
-free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i 1 | grep -Eo [0-9]+)
-sleep 5
-done
-
-echo "GPU finally free, training..."
-
-allennlp train $CONFIG -s models/bert_base_va --include-package transformer_srl #--recover
+allennlp train $CONFIG -s $MODEL_DIR --include-package transformer_srl --force #--recover
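Note: the deleted block above busy-waited until GPU 1 had at least 10 GB free before launching training. For reference, a minimal Python sketch of the same polling idea (the function name and defaults are mine; the threshold, GPU index, 5-second interval, and nvidia-smi flags come from the deleted bash):

```python
# Illustrative sketch of the removed GPU-wait loop; not part of the repo.
import re
import subprocess
import time

def wait_for_free_gpu(threshold_mb: int = 10000, gpu_index: int = 1, poll_s: int = 5) -> None:
    """Block until nvidia-smi reports at least `threshold_mb` MB free on the GPU."""
    while True:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv", "-i", str(gpu_index)],
            text=True,
        )
        free_mb = int(re.search(r"\d+", out).group())  # first number in the csv output
        print(f"{free_mb} MB free")
        if free_mb >= threshold_mb:
            break
        time.sleep(poll_s)
```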
6 changes: 3 additions & 3 deletions setup.py
@@ -5,7 +5,7 @@

 setuptools.setup(
     name="transformer_srl", # Replace with your own username
-    version="2.4.11",
+    version="2.5",
     author="Riccardo Orlando",
     author_email="[email protected]",
     description="SRL Transformer model",
@@ -20,8 +20,8 @@
         "Operating System :: OS Independent",
     ],
     install_requires=[
-        "allennlp>=1.2,<1.3",
-        "allennlp_models>=1.2,<1.3",
+        "allennlp>=2.0,<2.1",
+        "allennlp_models>=2.0,<2.1",
         "spacy>=2.3,<2.4"
     ],
     python_requires=">=3.6",
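The pins jump a major version, from the AllenNLP 1.2 line to 2.0, which is presumably what drives the API adjustments in the Python files below (the Token import path, return_dict). A hypothetical runtime guard matching the new pins, not part of the repo:

```python
# Hypothetical sanity check: fail fast if the installed AllenNLP falls outside
# the new pins. Requires Python 3.8+ for importlib.metadata.
from importlib.metadata import version

installed = version("allennlp")
major, minor = (int(x) for x in installed.split(".")[:2])
if not (major == 2 and minor == 0):
    raise RuntimeError(f"transformer_srl 2.5 expects allennlp>=2.0,<2.1, found {installed}")
```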
2 changes: 1 addition & 1 deletion training_config/bert_base_span.jsonnet
@@ -40,6 +40,6 @@
"grad_norm": 1.0,
"num_epochs": 15,
"validation_metric": "+f1_role",
"cuda_device": 0,
"cuda_device": -1,
},
}
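In AllenNLP trainer configs, cuda_device: -1 means CPU and any non-negative value is a GPU index, so this switches the span config to CPU-only, consistent with the move to a local macOS environment in the script above. A tiny sketch of the convention (the helper is illustrative, not AllenNLP API):

```python
# Sketch of the cuda_device convention: -1 selects CPU, n >= 0 selects cuda:n.
import torch

def device_from_config(cuda_device: int) -> torch.device:
    return torch.device("cpu") if cuda_device < 0 else torch.device(f"cuda:{cuda_device}")

assert device_from_config(-1).type == "cpu"
assert device_from_config(0) == torch.device("cuda:0")
```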
8 changes: 4 additions & 4 deletions transformer_srl/dataset_readers.py
@@ -1,5 +1,4 @@
 import logging
-import pathlib
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, List, Set, Tuple

@@ -19,8 +18,6 @@

 logger = logging.getLogger(__name__)
 
-FRAME_LIST_PATH = pathlib.Path(__file__).resolve().parent / "resources" / "framelist.txt"
-
 
 def _convert_verb_indices_to_wordpiece_indices(
     verb_indices: List[int], offsets: List[int], binary: bool = True
@@ -468,7 +465,10 @@ def _read(self, file_path: str):
             # transpose rolses, to have a list of roles per frame
             roles = list(map(list, zip(*roles)))
             current_frame = 0
-            for i, frame, in enumerate(frames):
+            for (
+                i,
+                frame,
+            ) in enumerate(frames):
                 if frame != "_":
                     verb_indicator = [0] * len(frames)
                     verb_indicator[i] = 1
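Context for the surrounding reader code: roles is built with one role tag per frame for each token, and list(map(list, zip(*roles))) transposes it into one tag sequence per frame, which the loop then pairs with each predicate. A toy illustration with invented tags:

```python
# Toy illustration of the transpose idiom used in _read (invented data).
roles = [["B-ARG0", "_"], ["I-ARG0", "_"], ["_", "B-ARG1"]]  # per token: one tag per frame
per_frame = list(map(list, zip(*roles)))                      # per frame: one tag per token
assert per_frame == [["B-ARG0", "I-ARG0", "_"], ["_", "_", "B-ARG1"]]
```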
136 changes: 12 additions & 124 deletions transformer_srl/models.py
@@ -1,14 +1,12 @@
-import pathlib
 from typing import Any, Dict, List, Union
 
-import numpy as np
 import torch
 import torch.nn.functional as F
 from allennlp.data import TextFieldTensors, Vocabulary
 from allennlp.models.model import Model
 from allennlp.modules import Seq2SeqEncoder
 from allennlp.nn import InitializerApplicator, util
-from allennlp.nn.util import get_device_of, get_text_field_mask, sequence_cross_entropy_with_logits
+from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
 from allennlp.training.metrics.fbeta_measure import FBetaMeasure
 from allennlp_models.structured_prediction import SrlBert
 from allennlp_models.structured_prediction.metrics.srl_eval_scorer import (
@@ -19,10 +17,8 @@
 from torch import nn
 from transformers import AutoModel
 
-from transformer_srl.utils import load_label_list, load_lemma_frame, load_role_frame
+from transformer_srl.utils import load_label_list
 
-LEMMA_FRAME_PATH = pathlib.Path(__file__).resolve().parent / "resources" / "lemma2va_ml.tsv"
-FRAME_ROLE_PATH = pathlib.Path(__file__).resolve().parent / "resources" / "frame2role_ml.tsv"
 FRAME_LIST_PATH = pathlib.Path(__file__).resolve().parent / "resources" / "framelist.txt"
 ROLE_LIST_PATH = pathlib.Path(__file__).resolve().parent / "resources" / "rolelist.txt"

@@ -57,17 +53,11 @@ def __init__(
         label_smoothing: float = None,
         ignore_span_metric: bool = False,
         srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
-        restrict_frames: bool = False,
-        restrict_roles: bool = False,
         inventory: str = "verbatlas",
         **kwargs,
     ) -> None:
         # bypass SrlBert constructor
         Model.__init__(self, vocab, **kwargs)
-        self.lemma_frame_dict = load_lemma_frame(LEMMA_FRAME_PATH)
-        self.frame_role_dict = load_role_frame(FRAME_ROLE_PATH)
-        self.restrict_frames = restrict_frames
-        self.restrict_roles = restrict_roles
         self.transformer = AutoModel.from_pretrained(bert_model)
         self.frame_criterion = nn.CrossEntropyLoss()
         if inventory == "verbatlas":
@@ -146,7 +136,10 @@ def forward(  # type: ignore
         mask = get_text_field_mask(tokens)
         input_ids = util.get_token_ids_from_text_field_tensors(tokens)
         bert_embeddings, _ = self.transformer(
-            input_ids=input_ids, token_type_ids=verb_indicator, attention_mask=mask,
+            input_ids=input_ids,
+            token_type_ids=verb_indicator,
+            attention_mask=mask,
+            return_dict=False,
         )
         # extract embeddings
         embedded_text_input = self.embedding_dropout(bert_embeddings)
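The added return_dict=False is presumably required because the transformers release paired with AllenNLP 2.0 returns a ModelOutput object by default, which would break the tuple unpacking into bert_embeddings, _. An illustrative standalone comparison (the model name is only an example):

```python
# Sketch: tuple vs ModelOutput return styles in transformers (example model name).
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")
input_ids = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])

last_hidden, pooled = model(input_ids=input_ids, return_dict=False)  # tuple unpacking works
outputs = model(input_ids=input_ids)  # default: a ModelOutput object, not a tuple
assert torch.equal(outputs.last_hidden_state, last_hidden)
```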
@@ -224,32 +217,7 @@ def forward(  # type: ignore
     def decode_frames(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
         # frame prediction
         frame_probabilities = output_dict["frame_probabilities"]
-        if self.restrict_frames:
-            frame_probabilities = frame_probabilities.cpu().data.numpy()
-            lemmas = output_dict["lemma"]
-            candidate_labels = [self.lemma_frame_dict.get(l, []) for l in lemmas]
-            # clear candidates from unknowns
-            label_set = set(k for k in self._get_label_tokens("frames_labels"))
-            candidate_labels_ids = [
-                [
-                    self.vocab.get_token_index(l, namespace="frames_labels")
-                    for l in cl
-                    if l in label_set
-                ]
-                for cl in candidate_labels
-            ]
-
-            frame_predictions = []
-            for cl, fp in zip(candidate_labels_ids, frame_probabilities):
-                # restrict candidates from verbatlas inventory
-                fp_candidates = np.take(fp, cl)
-                if fp_candidates.size > 0:
-                    frame_predictions.append(cl[fp_candidates.argmax(axis=-1)])
-                else:
-                    frame_predictions.append(fp.argmax(axis=-1))
-        else:
-            frame_predictions = frame_probabilities.argmax(dim=-1).cpu().data.numpy()
-
+        frame_predictions = frame_probabilities.argmax(dim=-1).cpu().data.numpy()
         output_dict["frame_tags"] = [
             self.vocab.get_token_from_index(f, namespace="frames_labels") for f in frame_predictions
         ]
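With the restriction branch gone, frame decoding reduces to a per-predicate argmax plus a vocabulary lookup. A self-contained sketch of what remains (vocabulary and scores invented):

```python
# Minimal sketch of the simplified frame decoding (toy vocabulary and scores).
import torch

index_to_frame = {0: "EAT_BITE", 1: "MOVE-SOMETHING", 2: "COOK"}  # toy frames_labels namespace
frame_probabilities = torch.tensor([[0.2, 0.7, 0.1], [0.6, 0.1, 0.3]])

frame_predictions = frame_probabilities.argmax(dim=-1).cpu().numpy()
frame_tags = [index_to_frame[int(f)] for f in frame_predictions]
assert frame_tags == ["MOVE-SOMETHING", "EAT_BITE"]
```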
@@ -263,33 +231,9 @@ def make_output_human_readable(
         self, output_dict: Dict[str, torch.Tensor]
     ) -> Dict[str, torch.Tensor]:
         output_dict = self.decode_frames(output_dict)
-        if self.restrict_roles:
-            output_dict = self._mask_args(output_dict)
         output_dict = super().make_output_human_readable(output_dict)
         return output_dict
 
-    def _mask_args(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        class_probs = output_dict["class_probabilities"]
-        device = get_device_of(class_probs)
-        # torch doesn't like -1 as cpu device
-        device = torch.device("cuda" if device >= 0 else "cpu")
-        lemmas = output_dict["lemma"]
-        frames = output_dict["frame_tags"]
-        candidate_mask = torch.ones_like(class_probs, dtype=torch.bool).to(device)
-        for i, (l, f) in enumerate(zip(lemmas, frames)):
-            candidates = self.frame_role_dict.get((l, f), [])
-            if candidates:
-                canidate_ids = [
-                    self.vocab.get_token_index(r, namespace="labels") for r in candidates
-                ]
-                canidate_ids = torch.tensor(canidate_ids).to(device)
-                canidate_ids = canidate_ids.repeat(candidate_mask.shape[1], 1)
-                candidate_mask[i].scatter_(1, canidate_ids, False)
-            else:
-                candidate_mask[i].fill_(False)
-        class_probs.masked_fill_(candidate_mask, 0)
-        return output_dict
-
     @overrides
     def get_metrics(self, reset: bool = False):
         if self.ignore_span_metric:
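For readers skimming the deletion: _mask_args whitelisted the role ids licensed for each (lemma, frame) pair by un-masking them with scatter_ and zeroing everything else with masked_fill_; with restrict_roles gone, the method is dead code. A toy reconstruction of the trick (shapes and ids invented):

```python
# Toy reconstruction of the whitelisting trick inside the deleted _mask_args.
import torch

class_probs = torch.rand(2, 4, 6)   # (batch, sequence_length, num_role_labels)
allowed = torch.tensor([0, 3])      # role ids licensed for item 0's (lemma, frame) pair

mask = torch.ones_like(class_probs, dtype=torch.bool)  # True = suppress this label
idx = allowed.repeat(class_probs.shape[1], 1)          # (sequence_length, num_allowed)
mask[0].scatter_(1, idx, False)                        # keep only licensed ids for item 0
mask[1].fill_(False)                                   # no candidates known: keep everything
class_probs.masked_fill_(mask, 0)                      # zero out suppressed labels
assert class_probs[0, :, 1].eq(0).all()                # an unlicensed role is zeroed
```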
@@ -303,13 +247,9 @@ def get_metrics(self, reset: bool = False):
         # This can be a lot of metrics, as there are 3 per class.
         # we only really care about the overall metrics, so we filter for them here.
         metric_dict_filtered = {
-            x.split("-")[0] + "_role": y
-            for x, y in metric_dict.items()
-            if "overall" in x #and "f1" in x
-        }
-        frame_metric_dict = {
-            x + "_frame": y for x, y in frame_metric_dict.items() #if "fscore" in x
+            x.split("-")[0] + "_role": y for x, y in metric_dict.items() if "overall" in x
         }
+        frame_metric_dict = {x + "_frame": y for x, y in frame_metric_dict.items()}
         return {**metric_dict_filtered, **frame_metric_dict}
 
     def _get_label_tokens(self, namespace: str = "labels"):
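The rewritten comprehensions keep the same behavior (keep only the overall role metrics, renamed with a _role suffix, and suffix frame metrics with _frame) while dropping the commented-out filters. A toy run with invented metric values; note how f1-measure-overall becomes f1_role, the key named by validation_metric in the jsonnet above:

```python
# Toy run of the metric-filtering comprehensions (invented metric values).
metric_dict = {"precision-overall": 0.86, "recall-overall": 0.84, "f1-measure-overall": 0.85, "f1-ARG0": 0.9}
frame_metric_dict = {"fscore": 0.95, "precision": 0.96}

metric_dict_filtered = {
    x.split("-")[0] + "_role": y for x, y in metric_dict.items() if "overall" in x
}
frame_metric_dict = {x + "_frame": y for x, y in frame_metric_dict.items()}
print({**metric_dict_filtered, **frame_metric_dict})
# {'precision_role': 0.86, 'recall_role': 0.84, 'f1_role': 0.85, 'fscore_frame': 0.95, 'precision_frame': 0.96}
```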
@@ -351,17 +291,10 @@ def __init__(
         label_smoothing: float = None,
         ignore_span_metric: bool = False,
         srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
-        restrict_frames: bool = False,
-        restrict_roles: bool = False,
         **kwargs,
     ) -> None:
         # bypass SrlBert constructor
         Model.__init__(self, vocab, **kwargs)
-        self.lemma_frame_dict = load_lemma_frame(LEMMA_FRAME_PATH)
-        self.frame_role_dict = load_role_frame(FRAME_ROLE_PATH)
-        self.restrict_frames = restrict_frames
-        self.restrict_roles = restrict_roles
-
         if isinstance(model_name, str):
             self.transformer = AutoModel.from_pretrained(model_name)
         else:
@@ -435,6 +368,7 @@ def forward(  # type: ignore
             input_ids=util.get_token_ids_from_text_field_tensors(tokens),
             token_type_ids=verb_indicator,
             attention_mask=mask,
+            return_dict=False,
         )
 
         # extract embeddings
@@ -489,32 +423,7 @@ def forward(  # type: ignore
     def decode_frames(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
         # frame prediction
         frame_probabilities = output_dict["frame_probabilities"]
-        if self.restrict:
-            frame_probabilities = frame_probabilities.cpu().data.numpy()
-            lemmas = output_dict["lemma"]
-            candidate_labels = [self.lemma_frame_dict.get(l, []) for l in lemmas]
-            # clear candidates from unknowns
-            label_set = set(k for k in self._get_label_tokens("frames_labels"))
-            candidate_labels_ids = [
-                [
-                    self.vocab.get_token_index(l, namespace="frames_labels")
-                    for l in cl
-                    if l in label_set
-                ]
-                for cl in candidate_labels
-            ]
-
-            frame_predictions = []
-            for cl, fp in zip(candidate_labels_ids, frame_probabilities):
-                # restrict candidates from verbatlas inventory
-                fp_candidates = np.take(fp, cl)
-                if fp_candidates.size > 0:
-                    frame_predictions.append(cl[fp_candidates.argmax(axis=-1)])
-                else:
-                    frame_predictions.append(fp.argmax(axis=-1))
-        else:
-            frame_predictions = frame_probabilities.argmax(dim=-1).cpu().data.numpy()
-
+        frame_predictions = frame_probabilities.argmax(dim=-1).cpu().data.numpy()
         output_dict["frame_tags"] = [
             self.vocab.get_token_from_index(f, namespace="frames_labels") for f in frame_predictions
         ]
@@ -530,7 +439,7 @@ def make_output_human_readable(
         output_dict = self.decode_frames(output_dict)
         # if self.restrict:
         #     output_dict = self._mask_args(output_dict)
-        # output_dict = super().make_output_human_readable(output_dict)
+        output_dict = super().make_output_human_readable(output_dict)
         roles_probabilities = output_dict["role_probabilities"]
         roles_predictions = roles_probabilities.argmax(dim=-1).cpu().data.numpy()
 
@@ -540,26 +449,6 @@ def make_output_human_readable(
         ]
         return output_dict
 
-    def _mask_args(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        class_probs = output_dict["class_probabilities"]
-        device = get_device_of(class_probs)
-        lemmas = output_dict["lemma"]
-        frames = output_dict["frame_tags"]
-        candidate_mask = torch.ones_like(class_probs, dtype=torch.bool).to(device)
-        for i, (l, f) in enumerate(zip(lemmas, frames)):
-            candidates = self.frame_role_dict.get((l, f), [])
-            if candidates:
-                canidate_ids = [
-                    self.vocab.get_token_index(r, namespace="labels") for r in candidates
-                ]
-                canidate_ids = torch.tensor(canidate_ids).to(device)
-                canidate_ids = canidate_ids.repeat(candidate_mask.shape[1], 1)
-                candidate_mask[i].scatter_(1, canidate_ids, False)
-            else:
-                candidate_mask[i].fill_(False)
-        class_probs.masked_fill_(candidate_mask, 0)
-        return output_dict
-
     @overrides
     def get_metrics(self, reset: bool = False):
         role_metric_dict = self.f1_role_metric.get_metric(reset=reset)
@@ -582,4 +471,3 @@ def _get_label_ids(self, namespace: str = "labels"):
         return self.vocab.get_index_to_token_vocabulary(namespace).keys()
 
     default_predictor = "transformer_srl"
-
13 changes: 8 additions & 5 deletions transformer_srl/predictors.py
@@ -1,6 +1,6 @@
 import enum
 from typing import List, Dict, Type
-from allennlp.data.tokenizers.token import Token
+from allennlp.data.tokenizers.token_class import Token
 
 import numpy
 from allennlp.common import plugins
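The import changes because AllenNLP renamed the module that houses Token between the 1.x and 2.x lines. A hedged compatibility shim (illustrative, not part of this commit) for code that must run on both:

```python
# Hypothetical compatibility shim for the Token import across AllenNLP versions.
try:
    from allennlp.data.tokenizers.token_class import Token  # allennlp >= 2.0
except ImportError:
    from allennlp.data.tokenizers.token import Token  # allennlp 1.x
```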
@@ -17,7 +17,10 @@
 @Predictor.register("transformer_srl")
 class SrlTransformersPredictor(SemanticRoleLabelerPredictor):
     def __init__(
-        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm",
+        self,
+        model: Model,
+        dataset_reader: DatasetReader,
+        language: str = "en_core_web_sm",
     ) -> None:
         super().__init__(model, dataset_reader, language)
 
@@ -180,9 +183,9 @@ def from_archive(
         model_type = config.get("model").get("type")
         model_class, _ = Model.resolve_class_name(model_type)
         predictor_name = model_class.default_predictor
-        predictor_class: Type[Predictor] = Predictor.by_name(  # type: ignore
-            predictor_name
-        ) if predictor_name is not None else cls
+        predictor_class: Type[Predictor] = (
+            Predictor.by_name(predictor_name) if predictor_name is not None else cls  # type: ignore
+        )
 
         if dataset_reader_to_load == "validation" and "validation_dataset_reader" in config:
             dataset_reader_params = config["validation_dataset_reader"]
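As a usage sketch (paths and sentence invented): loading a trained archive and letting from_archive resolve the model's default_predictor, which models.py above sets to "transformer_srl":

```python
# Hypothetical usage of the resolution logic above: load an archive and let
# from_archive pick the model's default predictor ("transformer_srl").
from allennlp.models.archival import load_archive
from transformer_srl.predictors import SrlTransformersPredictor  # import also registers it

archive = load_archive("models/bert_base_conll2012/model.tar.gz")  # assumed path
predictor = SrlTransformersPredictor.from_archive(archive)
print(predictor.predict(sentence="Marco ate an apple."))
```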
(Diffs for the remaining four changed files are not shown.)