Nd bias bootstrap (#374)

Adds protected tensor maps, which are gathered during test set inference. Adds a bootstrapped performance comparison per class of the protected tensor maps.
broadinstitute · Jul 29, 2020 · 18839e7 · 18839e7
1 parent c3abae1
commit 18839e7
Show file tree

Hide file tree

Showing 8 changed files with 667 additions and 185 deletions.
diff --git a/ml4cvd/arguments.py b/ml4cvd/arguments.py
@@ -50,11 +50,13 @@ def parse_args():
     )
 
     # Tensor Map arguments
-    parser.add_argument('--input_tensors', default=[], nargs='+')
-    parser.add_argument('--output_tensors', default=[], nargs='+')
-    parser.add_argument('--sample_weight', default=None,  help='TensorMap key for sample weight in training.')
+    parser.add_argument('--input_tensors', default=[], nargs='*')
+    parser.add_argument('--output_tensors', default=[], nargs='*')
+    parser.add_argument('--protected_tensors', default=[], nargs='*')
+    parser.add_argument('--sample_weight', default=None, help='TensorMap key for sample weight in training.')
     parser.add_argument('--tensor_maps_in', default=[], help='Do not set this directly. Use input_tensors')
     parser.add_argument('--tensor_maps_out', default=[], help='Do not set this directly. Use output_tensors')
+    parser.add_argument('--tensor_maps_protected', default=[], help='Do not set this directly. Use protected_tensors')
 
     # Input and Output files and directories
     parser.add_argument(
@@ -414,6 +416,7 @@ def _process_args(args):
         )
     args.tensor_maps_out.extend([_get_tmap(ot, needed_tensor_maps) for ot in args.output_tensors])
     args.tensor_maps_out = parent_sort(args.tensor_maps_out)
+    args.tensor_maps_protected = [_get_tmap(it, needed_tensor_maps) for it in args.protected_tensors]
 
     args.bottleneck_type = BOTTLENECK_STR_TO_ENUM[args.bottleneck_type]
     if args.bottleneck_type == BottleneckType.NoBottleNeck:

diff --git a/ml4cvd/plots.py b/ml4cvd/plots.py
diff --git a/ml4cvd/recipes.py b/ml4cvd/recipes.py
diff --git a/ml4cvd/tensor_generators.py b/ml4cvd/tensor_generators.py
@@ -730,6 +730,7 @@ def get_train_valid_test_paths_split_by_csvs(
 def test_train_valid_tensor_generators(
     tensor_maps_in: List[TensorMap],
     tensor_maps_out: List[TensorMap],
+    tensor_maps_protected: List[TensorMap],
     tensors: str,
     batch_size: int,
     num_workers: int,
@@ -751,9 +752,10 @@ def test_train_valid_tensor_generators(
     **kwargs
 ) -> Tuple[TensorGenerator, TensorGenerator, TensorGenerator]:
     """ Get 3 tensor generator functions for training, validation and testing data.
-
     :param tensor_maps_in: list of TensorMaps that are input names to a model
     :param tensor_maps_out: list of TensorMaps that are output from a model
+    :param tensor_maps_protected: list of protected TensorMaps used to assess model bias;
+                                    only added to the outputs of the test set generator
     :param tensors: directory containing tensors
     :param batch_size: number of examples in each mini-batch
     :param num_workers: number of processes spun off for training and testing. Validation uses half as many workers
@@ -799,9 +801,18 @@ def test_train_valid_tensor_generators(
 
     num_train_workers = int(training_steps / (training_steps + validation_steps) * num_workers) or (1 if num_workers else 0)
     num_valid_workers = int(validation_steps / (training_steps + validation_steps) * num_workers) or (1 if num_workers else 0)
-    generate_train = TensorGenerator(batch_size, tensor_maps_in, tensor_maps_out, train_paths, num_train_workers, cache_size, weights, keep_paths, mixup_alpha, name='train_worker', siamese=siamese, augment=True, sample_weight=sample_weight)
-    generate_valid = TensorGenerator(batch_size, tensor_maps_in, tensor_maps_out, valid_paths, num_valid_workers, cache_size, weights, keep_paths, name='validation_worker', siamese=siamese, augment=False)
-    generate_test = TensorGenerator(batch_size, tensor_maps_in, tensor_maps_out, test_paths, num_workers, 0, weights, keep_paths or keep_paths_test, name='test_worker', siamese=siamese, augment=False)
+    generate_train = TensorGenerator(
+        batch_size, tensor_maps_in, tensor_maps_out, train_paths, num_train_workers, cache_size, weights,
+        keep_paths, mixup_alpha, name='train_worker', siamese=siamese, augment=True, sample_weight=sample_weight,
+    )
+    generate_valid = TensorGenerator(
+        batch_size, tensor_maps_in, tensor_maps_out, valid_paths, num_valid_workers, cache_size, weights,
+        keep_paths, name='validation_worker', siamese=siamese, augment=False,
+    )
+    generate_test = TensorGenerator(
+        batch_size, tensor_maps_in, tensor_maps_out+tensor_maps_protected, test_paths, num_workers, 0, weights,
+        keep_paths or keep_paths_test, name='test_worker', siamese=siamese, augment=False,
+    )
     return generate_train, generate_valid, generate_test
 
 

diff --git a/ml4cvd/tensor_maps_by_hand.py b/ml4cvd/tensor_maps_by_hand.py
@@ -653,10 +653,15 @@
     },
 )
 
-TMAPS['genetic_caucasian'] = TensorMap('Genetic-ethnic-grouping_Caucasian_0_0', Interpretation.CATEGORICAL, path_prefix='categorical', channel_map={'no_caucasian': 0, 'caucasian': 1})
+TMAPS['genetic_caucasian'] = TensorMap(
+    'Genetic-ethnic-grouping_Caucasian_0_0', Interpretation.CATEGORICAL,
+    storage_type=StorageType.CATEGORICAL_FLAG, path_prefix='categorical',
+    channel_map={'no_caucasian': 0, 'Genetic-ethnic-grouping_Caucasian_0_0': 1},
+)
 TMAPS['genetic_caucasian_weighted'] = TensorMap(
-    'Genetic-ethnic-grouping_Caucasian_0_0', Interpretation.CATEGORICAL, path_prefix='categorical',
-    channel_map={'no_caucasian': 0, 'caucasian': 1}, loss=weighted_crossentropy([10.0, 1.0], 'caucasian_loss'),
+    'Genetic-ethnic-grouping_Caucasian_0_0', Interpretation.CATEGORICAL, storage_type=StorageType.CATEGORICAL_FLAG,
+    path_prefix='categorical', channel_map={'no_caucasian': 0, 'Genetic-ethnic-grouping_Caucasian_0_0': 1},
+    loss=weighted_crossentropy([10.0, 1.0], 'caucasian_loss'),
 )
 
 TMAPS['mothers_age'] = TensorMap(

diff --git a/ml4cvd/tensor_writer_ukbb.py b/ml4cvd/tensor_writer_ukbb.py
@@ -54,7 +54,7 @@
     'cine_segmented_sax_b3', 'cine_segmented_sax_b4', 'cine_segmented_sax_b5', 'cine_segmented_sax_b6', 'cine_segmented_sax_b7',
     'cine_segmented_sax_b8', 'cine_segmented_sax_b9', 'cine_segmented_sax_b10', 'cine_segmented_sax_b11', 'cine_segmented_sax_b12',
     'cine_segmented_sax_b13', 'cine_segmented_sax_inlinevf', 'cine_segmented_lax_inlinevf', 'cine_segmented_ao_dist',
-    'cine_segmented_lvot', 'flow_250_tp_aov_bh_epat@c_p', 'flow_250_tp_aov_bh_epat@c', 'flow_250_tp_aov_bh_epat@c_mag'
+    'cine_segmented_lvot', 'flow_250_tp_aov_bh_epat@c_p', 'flow_250_tp_aov_bh_epat@c', 'flow_250_tp_aov_bh_epat@c_mag',
 ]
 MRI_CARDIAC_SERIES_SEGMENTED = [series+'_segmented' for series in MRI_CARDIAC_SERIES]
 MRI_BRAIN_SERIES = ['t1_p2_1mm_fov256_sag_ti_880', 't2_flair_sag_p2_1mm_fs_ellip_pf78']
@@ -503,7 +503,7 @@ def _tensorize_short_and_long_axis_segmented_cardiac_mri(
             series_segmented = f'{series}_segmented'
             series_zoom = f'{series}_zoom'
             series_zoom_segmented = f'{series}_zoom_segmented'
-            
+
             try:
                 overlay, mask, ventricle_pixels, _ = _get_overlay_from_dicom(slicer)
             except KeyError:

diff --git a/ml4cvd/test_utils.py b/ml4cvd/test_utils.py
@@ -25,14 +25,14 @@
         f'language_1hot_window', shape=(32, 26),
         interpretation=Interpretation.LANGUAGE,
         channel_map={f'c_{i}': i for i in range(26)},
-    )
+    ),
 ]
 LANGUAGE_TMAP_1HOT_SOFTMAX = [
     TensorMap(
         f'language_1hot_out', shape=(26,),
         interpretation=Interpretation.LANGUAGE,
         channel_map={f'c_{i}': i for i in range(26)},
-    )
+    ),
 ]
 
 TMAPS_UP_TO_4D = CONTINUOUS_TMAPS[:-1] + CATEGORICAL_TMAPS[:-1]