Push to GitHub container registry GHCR (#507)
* dataflow cleanups
lucidtronix authored Jan 25, 2023
1 parent 42dde3c commit c26de33
Showing 40 changed files with 3,093 additions and 1,052 deletions.
10 changes: 8 additions & 2 deletions docker/vm_boot_images/build.sh
@@ -12,6 +12,7 @@ set -e
################### VARIABLES ############################################

REPO="gcr.io/broad-ml4cvd/deeplearning"
GITHUB_REPO="ghcr.io/broadinstitute/ml4h"
TAG=$( git rev-parse --short HEAD )
CONTEXT="docker/vm_boot_images/"
CPU_ONLY="false"
@@ -54,7 +55,7 @@ usage()
Default: Build image to run on GPU-enabled machines and tag image also as '${LATEST_TAG_GPU}'.
-p Push to Google Container Register
-P Push to latest tag
-P Push to latest tag and to the Github GHCR repository
-h Print this help text
@@ -116,12 +117,17 @@ docker build ${CONTEXT} \
--build-arg BASE_IMAGE=${BASE_IMAGE} \
--tag "${REPO}:${TAG}" \
--tag "${REPO}:${LATEST_TAG}" \
--tag "${GITHUB_REPO}:${TAG}" \
--tag "${GITHUB_REPO}:${LATEST_TAG}" \
--network host \

if [[ ${PUSH_TO_LATEST} == "true" ]]; then
echo -e "${BLUE}Pushing the image '${REPO}' to Google Container Registry with tags '${TAG}' and '${LATEST_TAG}'...${NC}"
echo -e "${BLUE}Pushing the image '${REPO}' to Github GHCR and Google Container Registry with tags '${TAG}' and '${LATEST_TAG}'...${NC}"
docker push ${REPO}:${TAG}
docker push ${REPO}:${LATEST_TAG}

docker push ${GITHUB_REPO}:${TAG}
docker push ${GITHUB_REPO}:${LATEST_TAG}
fi

if [[ ${PUSH_TO_GCR} == "true" ]]; then
6 changes: 4 additions & 2 deletions docker/vm_boot_images/config/tensorflow-requirements.txt
@@ -15,8 +15,10 @@ pytest
pysam
tensorflow==2.9.1
tensorflow-addons==0.17.1
tensorflow_probability==0.17.0
tensorflow_hub
tensorflow_probability==0.17.0
tensorflow-text==2.9
tf-models-official==2.9
keras-tuner
numcodecs
beautifulsoup4
@@ -36,4 +38,4 @@ torch
opencv-python
blosc
boto3
ml4ht
ml4ht==0.0.9
45 changes: 0 additions & 45 deletions env/requirements_ml4cvd_dataflow.txt

This file was deleted.

24 changes: 16 additions & 8 deletions ml4h/TensorMap.py
@@ -46,6 +46,7 @@ class Interpretation(Enum):
SURVIVAL_CURVE = auto()
DISCRETIZED = auto()
MESH = auto()
TEXT = auto()

def __str__(self):
"""class Interpretation.FLOAT_ARRAY becomes float_array"""
@@ -170,8 +171,17 @@ def __init__(
self.time_series_lookup = time_series_lookup
self.discretization_bounds = discretization_bounds

# Infer shape from channel map or interpretation
if self.shape is None:
self.shape = (2,) if self.is_time_to_event() else (len(channel_map),)
# Setting time_series_limit indicates dynamic shaping which is always accompanied by 1st dim of None
if self.time_series_limit is not None:
self.shape = (None,) + self.shape

# Infer loss from interpretation
if self.loss is None and self.is_categorical():
if self.loss is None and self.is_categorical() and self.shape[0] == 1:
self.loss = 'sparse_categorical_crossentropy'
elif self.loss is None and self.is_categorical():
self.loss = 'categorical_crossentropy'
elif self.loss is None and self.is_continuous() and self.sentinel is not None:
self.loss = sentinel_logcosh_loss(self.sentinel)
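Note: the hunk above moves shape inference ahead of loss inference, so the new shape[0] == 1 branch always sees a resolved shape. A minimal standalone sketch of that ordering, where the helper name and boolean flag arguments are illustrative and not part of TensorMap:

from typing import Dict, Optional, Tuple

def infer_shape_and_loss(
    channel_map: Optional[Dict[str, int]],
    is_time_to_event: bool = False,
    is_categorical: bool = False,
    time_series_limit: Optional[int] = None,
    shape: Optional[Tuple] = None,
    loss: Optional[str] = None,
):
    # Shape first: from the channel map, or (2,) for time-to-event outputs.
    if shape is None:
        shape = (2,) if is_time_to_event else (len(channel_map),)
        # A time_series_limit means dynamic length, encoded as a leading None dimension.
        if time_series_limit is not None:
            shape = (None,) + shape
    # Loss second, so it can inspect shape[0]: a single-channel categorical output
    # gets sparse labels, multi-channel outputs keep one-hot categorical crossentropy.
    if loss is None and is_categorical and shape[0] == 1:
        loss = 'sparse_categorical_crossentropy'
    elif loss is None and is_categorical:
        loss = 'categorical_crossentropy'
    return shape, loss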
@@ -194,12 +204,7 @@ def __init__(
elif self.activation is None and (self.is_survival_curve() or self.is_time_to_event()):
self.activation = 'sigmoid'

# Infer shape from channel map or interpretation
if self.shape is None:
self.shape = (2,) if self.is_time_to_event() else (len(channel_map),)
# Setting time_series_limit indicates dynamic shaping which is always accompanied by 1st dim of None
if self.time_series_limit is not None:
self.shape = (None,) + self.shape


if self.channel_map is None and self.is_time_to_event():
self.channel_map = DEFAULT_TIME_TO_EVENT_CHANNELS
@@ -274,6 +279,9 @@ def is_survival_curve(self):
def is_discretized(self):
return self.interpretation == Interpretation.DISCRETIZED

def is_text(self):
return self.interpretation == Interpretation.TEXT

def axes(self):
return len(self.shape)

@@ -450,7 +458,7 @@ def _default_tensor_from_file(tm, hd5, dependents={}):
if tm.hd5_key_guess() in hd5:
data = tm.hd5_first_dataset_in_group(hd5, tm.hd5_key_guess())
if tm.storage_type == StorageType.CATEGORICAL_INDEX or tm.storage_type == StorageType.CATEGORICAL_FLAG:
index = int(data[0])
index = min(int(data[0]), categorical_data.shape[0]-1)
categorical_data[index] = 1.0
else:
categorical_data = np.array(data)
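Note: the min() clamp added above keeps a stored categorical index that overflows the channel map from raising an IndexError by folding it onto the last channel. A small sketch of the same guard in isolation; the function name is hypothetical:

import numpy as np

def clamped_one_hot(stored_index, num_channels):
    # One-hot encode a stored index, clamping out-of-range values to the last channel.
    one_hot = np.zeros((num_channels,), dtype=np.float32)
    index = min(int(stored_index), num_channels - 1)  # same clamp as in the diff above
    one_hot[index] = 1.0
    return one_hot

# A stored index of 7 against a 5-channel map now lands on channel 4 instead of failing.
assert clamped_one_hot(7, 5).argmax() == 4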
1 change: 0 additions & 1 deletion ml4h/__init__.py
@@ -1,2 +1 @@
from . import defines
#from .tensorize.database.tensorize import tensorize_sql_fields, write_tensor_from_sql
6 changes: 2 additions & 4 deletions ml4h/arguments.py
@@ -69,8 +69,6 @@ def parse_args():
parser.add_argument('--bigquery_dataset', default='broad-ml4cvd.ukbb7089_r10data', help='BigQuery dataset containing tables we want to query.')
parser.add_argument('--xml_folder', default='/mnt/disks/ecg-rest-xml/', help='Path to folder of XMLs of ECG data.')
parser.add_argument('--zip_folder', default='/mnt/disks/sax-mri-zip/', help='Path to folder of zipped dicom images.')
parser.add_argument('--phenos_folder', default='gs://ml4cvd/phenotypes/', help='Path to folder of phenotype defining CSVs.')
parser.add_argument('--phecode_definitions', default='/mnt/ml4cvd/projects/jamesp/data/phecode_definitions1.2.csv', help='CSV of phecode definitions')
parser.add_argument('--dicoms', default='./dicoms/', help='Path to folder of dicoms.')
parser.add_argument('--sample_csv', default=None, help='Path to CSV with Sample IDs to restrict tensor paths')
parser.add_argument('--tsv_style', default='standard', choices=['standard', 'genetics'], help='Format choice for the TSV file produced in output by infer and explore modes.')
@@ -452,7 +450,7 @@ def generate_tensormap_id(tm):
return hashlib.sha256(str(tm).encode("utf-8")).hexdigest()


def generate_model_id(tensor_maps_in, tensor_maps_out):
def generate_model_id(model_name: str, tensor_maps_in: List[TensorMap], tensor_maps_out: List[TensorMap]):
str_i = '_'.join([str(tmi) for tmi in tensor_maps_in])
str_o = '_'.join([str(tmo) for tmo in tensor_maps_out])
model_str = f'{str_i}&{str_o}'
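Note: generate_model_id now receives the model name alongside the tensor maps, but the rest of the function body is collapsed in this view. The sketch below assumes the name is simply folded into the hashed string; that detail is an assumption, not something shown in the diff:

import hashlib
from typing import List

def generate_model_id_sketch(model_name: str, tensor_maps_in: List, tensor_maps_out: List) -> str:
    # Same joining of input/output TensorMap string forms as in the visible lines.
    str_i = '_'.join(str(tmi) for tmi in tensor_maps_in)
    str_o = '_'.join(str(tmo) for tmo in tensor_maps_out)
    model_str = f'{str_i}&{str_o}'
    # Assumption: the model name is mixed into the digest somewhere in the collapsed lines.
    return hashlib.sha256(f'{model_name}&{model_str}'.encode('utf-8')).hexdigest()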
@@ -526,7 +524,7 @@ def _process_args(args):
logging.info(f"Command Line was: {command_line}")
logging.info(f'Input SHA256s: {[(tm.name, generate_tensormap_id(tm)) for tm in args.tensor_maps_in]}')
logging.info(f'Output SHA256s: {[(tm.name, generate_tensormap_id(tm)) for tm in args.tensor_maps_out]}')
logging.info(f'Model SHA256: {generate_model_id(args.tensor_maps_in, args.tensor_maps_out)}')
logging.info(f'Model SHA256: {generate_model_id(args.id, args.tensor_maps_in, args.tensor_maps_out)}')
logging.info(f"Arguments are {args}\n")

if args.eager:
2 changes: 1 addition & 1 deletion ml4h/defines.py
@@ -173,7 +173,7 @@ def __str__(self):
TENSOR_MAPS_FILE_NAME = 'by_script'

# BigQuery tables
SQL_DATASET = "ukbb7089_201904"
SQL_DATASET = "ukbb7089_202006"
DICTIONARY_TABLE = SQL_DATASET+".dictionary"
CODING_TABLE = SQL_DATASET+".coding"
PHENOTYPE_TABLE = SQL_DATASET+".phenotype"
(Diff for the remaining changed files not loaded in this view.)
