Push to GitHub container registry GHCR (#507)
* dataflow cleanups
lucidtronix authored Jan 25, 2023
1 parent 42dde3c commit c26de33
Showing 40 changed files with 3,093 additions and 1,052 deletions.
10 changes: 8 additions & 2 deletions docker/vm_boot_images/build.sh
@@ -12,6 +12,7 @@ set -e
################### VARIABLES ############################################

REPO="gcr.io/broad-ml4cvd/deeplearning"
GITHUB_REPO="ghcr.io/broadinstitute/ml4h"
TAG=$( git rev-parse --short HEAD )
CONTEXT="docker/vm_boot_images/"
CPU_ONLY="false"
@@ -54,7 +55,7 @@ usage()
Default: Build image to run on GPU-enabled machines and tag image also as '${LATEST_TAG_GPU}'.
-p Push to Google Container Register
-P Push to latest tag
-P Push to latest tag and to the Github GHCR repository
-h Print this help text
@@ -116,12 +117,17 @@ docker build ${CONTEXT} \
--build-arg BASE_IMAGE=${BASE_IMAGE} \
--tag "${REPO}:${TAG}" \
--tag "${REPO}:${LATEST_TAG}" \
--tag "${GITHUB_REPO}:${TAG}" \
--tag "${GITHUB_REPO}:${LATEST_TAG}" \
--network host \

if [[ ${PUSH_TO_LATEST} == "true" ]]; then
echo -e "${BLUE}Pushing the image '${REPO}' to Google Container Registry with tags '${TAG}' and '${LATEST_TAG}'...${NC}"
echo -e "${BLUE}Pushing the image '${REPO}' to Github GHCR and Google Container Registry with tags '${TAG}' and '${LATEST_TAG}'...${NC}"
docker push ${REPO}:${TAG}
docker push ${REPO}:${LATEST_TAG}

docker push ${GITHUB_REPO}:${TAG}
docker push ${GITHUB_REPO}:${LATEST_TAG}
fi

if [[ ${PUSH_TO_GCR} == "true" ]]; then
6 changes: 4 additions & 2 deletions docker/vm_boot_images/config/tensorflow-requirements.txt
@@ -15,8 +15,10 @@ pytest
pysam
tensorflow==2.9.1
tensorflow-addons==0.17.1
tensorflow_probability==0.17.0
tensorflow_hub
tensorflow_probability==0.17.0
tensorflow-text==2.9
tf-models-official==2.9
keras-tuner
numcodecs
beautifulsoup4
@@ -36,4 +38,4 @@ torch
opencv-python
blosc
boto3
ml4ht
ml4ht==0.0.9
45 changes: 0 additions & 45 deletions env/requirements_ml4cvd_dataflow.txt

This file was deleted.

24 changes: 16 additions & 8 deletions ml4h/TensorMap.py
@@ -46,6 +46,7 @@ class Interpretation(Enum):
SURVIVAL_CURVE = auto()
DISCRETIZED = auto()
MESH = auto()
TEXT = auto()

def __str__(self):
"""class Interpretation.FLOAT_ARRAY becomes float_array"""
@@ -170,8 +171,17 @@ def __init__(
self.time_series_lookup = time_series_lookup
self.discretization_bounds = discretization_bounds

# Infer shape from channel map or interpretation
if self.shape is None:
self.shape = (2,) if self.is_time_to_event() else (len(channel_map),)
# Setting time_series_limit indicates dynamic shaping which is always accompanied by 1st dim of None
if self.time_series_limit is not None:
self.shape = (None,) + self.shape

# Infer loss from interpretation
if self.loss is None and self.is_categorical():
if self.loss is None and self.is_categorical() and self.shape[0] == 1:
self.loss = 'sparse_categorical_crossentropy'
elif self.loss is None and self.is_categorical():
self.loss = 'categorical_crossentropy'
elif self.loss is None and self.is_continuous() and self.sentinel is not None:
self.loss = sentinel_logcosh_loss(self.sentinel)
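Note: the hunk above moves shape inference ahead of loss inference, so the new shape[0] == 1 branch always sees a resolved shape. A minimal standalone sketch of that ordering, where the helper name and boolean flag arguments are illustrative and not part of TensorMap:

from typing import Dict, Optional, Tuple

def infer_shape_and_loss(
    channel_map: Optional[Dict[str, int]],
    is_time_to_event: bool = False,
    is_categorical: bool = False,
    time_series_limit: Optional[int] = None,
    shape: Optional[Tuple] = None,
    loss: Optional[str] = None,
):
    # Shape first: from the channel map, or (2,) for time-to-event outputs.
    if shape is None:
        shape = (2,) if is_time_to_event else (len(channel_map),)
        # A time_series_limit means dynamic length, encoded as a leading None dimension.
        if time_series_limit is not None:
            shape = (None,) + shape
    # Loss second, so it can inspect shape[0]: a single-channel categorical output
    # gets sparse labels, multi-channel outputs keep one-hot categorical crossentropy.
    if loss is None and is_categorical and shape[0] == 1:
        loss = 'sparse_categorical_crossentropy'
    elif loss is None and is_categorical:
        loss = 'categorical_crossentropy'
    return shape, loss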
@@ -194,12 +204,7 @@ def __init__(
elif self.activation is None and (self.is_survival_curve() or self.is_time_to_event()):
self.activation = 'sigmoid'

# Infer shape from channel map or interpretation
if self.shape is None:
self.shape = (2,) if self.is_time_to_event() else (len(channel_map),)
# Setting time_series_limit indicates dynamic shaping which is always accompanied by 1st dim of None
if self.time_series_limit is not None:
self.shape = (None,) + self.shape


if self.channel_map is None and self.is_time_to_event():
self.channel_map = DEFAULT_TIME_TO_EVENT_CHANNELS
@@ -274,6 +279,9 @@ def is_survival_curve(self):
def is_discretized(self):
return self.interpretation == Interpretation.DISCRETIZED

def is_text(self):
return self.interpretation == Interpretation.TEXT

def axes(self):
return len(self.shape)

@@ -450,7 +458,7 @@ def _default_tensor_from_file(tm, hd5, dependents={}):
if tm.hd5_key_guess() in hd5:
data = tm.hd5_first_dataset_in_group(hd5, tm.hd5_key_guess())
if tm.storage_type == StorageType.CATEGORICAL_INDEX or tm.storage_type == StorageType.CATEGORICAL_FLAG:
index = int(data[0])
index = min(int(data[0]), categorical_data.shape[0]-1)
categorical_data[index] = 1.0
else:
categorical_data = np.array(data)
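Note: the min() clamp added above keeps a stored categorical index that overflows the channel map from raising an IndexError by folding it onto the last channel. A small sketch of the same guard in isolation; the function name is hypothetical:

import numpy as np

def clamped_one_hot(stored_index, num_channels):
    # One-hot encode a stored index, clamping out-of-range values to the last channel.
    one_hot = np.zeros((num_channels,), dtype=np.float32)
    index = min(int(stored_index), num_channels - 1)  # same clamp as in the diff above
    one_hot[index] = 1.0
    return one_hot

# A stored index of 7 against a 5-channel map now lands on channel 4 instead of failing.
assert clamped_one_hot(7, 5).argmax() == 4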
1 change: 0 additions & 1 deletion ml4h/__init__.py
@@ -1,2 +1 @@
from . import defines
#from .tensorize.database.tensorize import tensorize_sql_fields, write_tensor_from_sql
6 changes: 2 additions & 4 deletions ml4h/arguments.py
@@ -69,8 +69,6 @@ def parse_args():
parser.add_argument('--bigquery_dataset', default='broad-ml4cvd.ukbb7089_r10data', help='BigQuery dataset containing tables we want to query.')
parser.add_argument('--xml_folder', default='/mnt/disks/ecg-rest-xml/', help='Path to folder of XMLs of ECG data.')
parser.add_argument('--zip_folder', default='/mnt/disks/sax-mri-zip/', help='Path to folder of zipped dicom images.')
parser.add_argument('--phenos_folder', default='gs://ml4cvd/phenotypes/', help='Path to folder of phenotype defining CSVs.')
parser.add_argument('--phecode_definitions', default='/mnt/ml4cvd/projects/jamesp/data/phecode_definitions1.2.csv', help='CSV of phecode definitions')
parser.add_argument('--dicoms', default='./dicoms/', help='Path to folder of dicoms.')
parser.add_argument('--sample_csv', default=None, help='Path to CSV with Sample IDs to restrict tensor paths')
parser.add_argument('--tsv_style', default='standard', choices=['standard', 'genetics'], help='Format choice for the TSV file produced in output by infer and explore modes.')
@@ -452,7 +450,7 @@ def generate_tensormap_id(tm):
return hashlib.sha256(str(tm).encode("utf-8")).hexdigest()


def generate_model_id(tensor_maps_in, tensor_maps_out):
def generate_model_id(model_name: str, tensor_maps_in: List[TensorMap], tensor_maps_out: List[TensorMap]):
str_i = '_'.join([str(tmi) for tmi in tensor_maps_in])
str_o = '_'.join([str(tmo) for tmo in tensor_maps_out])
model_str = f'{str_i}&{str_o}'
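Note: generate_model_id now receives the model name alongside the tensor maps, but the rest of the function body is collapsed in this view. The sketch below assumes the name is simply folded into the hashed string; that detail is an assumption, not something shown in the diff:

import hashlib
from typing import List

def generate_model_id_sketch(model_name: str, tensor_maps_in: List, tensor_maps_out: List) -> str:
    # Same joining of input/output TensorMap string forms as in the visible lines.
    str_i = '_'.join(str(tmi) for tmi in tensor_maps_in)
    str_o = '_'.join(str(tmo) for tmo in tensor_maps_out)
    model_str = f'{str_i}&{str_o}'
    # Assumption: the model name is mixed into the digest somewhere in the collapsed lines.
    return hashlib.sha256(f'{model_name}&{model_str}'.encode('utf-8')).hexdigest()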
@@ -526,7 +524,7 @@ def _process_args(args):
logging.info(f"Command Line was: {command_line}")
logging.info(f'Input SHA256s: {[(tm.name, generate_tensormap_id(tm)) for tm in args.tensor_maps_in]}')
logging.info(f'Output SHA256s: {[(tm.name, generate_tensormap_id(tm)) for tm in args.tensor_maps_out]}')
logging.info(f'Model SHA256: {generate_model_id(args.tensor_maps_in, args.tensor_maps_out)}')
logging.info(f'Model SHA256: {generate_model_id(args.id, args.tensor_maps_in, args.tensor_maps_out)}')
logging.info(f"Arguments are {args}\n")

if args.eager:
2 changes: 1 addition & 1 deletion ml4h/defines.py
@@ -173,7 +173,7 @@ def __str__(self):
TENSOR_MAPS_FILE_NAME = 'by_script'

# BigQuery tables
SQL_DATASET = "ukbb7089_201904"
SQL_DATASET = "ukbb7089_202006"
DICTIONARY_TABLE = SQL_DATASET+".dictionary"
CODING_TABLE = SQL_DATASET+".coding"
PHENOTYPE_TABLE = SQL_DATASET+".phenotype"
(Diff for the remaining changed files not loaded in this view.)
