Skip to content

Commit

Permalink
Non-GVS bits required for GVS [VS-971] (#8362)
Browse files Browse the repository at this point in the history
  • Loading branch information
mcovarr authored Jun 27, 2023
1 parent 0cb0861 commit daeb3e2
Show file tree
Hide file tree
Showing 11 changed files with 66 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_D

echo "Running Filtering WDL through cromwell"

# Write a no-op monitoring script for the WDL tasks to source. It only prints a
# heartbeat every 10s; its purpose is to exercise the new optional
# `monitoring_script` plumbing in JointVcfFiltering.wdl, not to collect metrics.
# NOTE(review): path must match "JointVcfFiltering.monitoring_script" in the
# inputs JSON — confirm the two stay in sync if WORKING_DIR changes.
cat > ${WORKING_DIR}/src/test/resources/cromwell_monitoring_script.sh <<FIN
while true
do
echo 'dummy monitoring script running...'
sleep 10
done
FIN
cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@
"JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"],
"JointVcfFiltering.output_prefix": "test_10_samples",
"JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
"JointVcfFiltering.extract_extra_args": "-L chr21"
}
"JointVcfFiltering.extract_extra_args": "-L chr21",
"JointVcfFiltering.monitoring_script": "/home/runner/work/gatk/gatk/src/test/resources/cromwell_monitoring_script.sh"
}
36 changes: 33 additions & 3 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ workflow JointVcfFiltering {
RuntimeAttributes? extract_runtime_attributes
RuntimeAttributes? train_runtime_attributes
RuntimeAttributes? score_runtime_attributes

File? monitoring_script
}

parameter_meta {
Expand All @@ -68,7 +70,8 @@ workflow JointVcfFiltering {
extra_args = extract_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
runtime_attributes = extract_runtime_attributes
runtime_attributes = extract_runtime_attributes,
monitoring_script = monitoring_script
}
call TrainVariantAnnotationsModel {
Expand All @@ -82,7 +85,8 @@ workflow JointVcfFiltering {
extra_args = train_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
runtime_attributes = train_runtime_attributes
runtime_attributes = train_runtime_attributes,
monitoring_script = monitoring_script
}
scatter (i in range(length(input_vcfs))) {
Expand All @@ -100,7 +104,8 @@ workflow JointVcfFiltering {
extra_args = score_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
runtime_attributes = score_runtime_attributes
runtime_attributes = score_runtime_attributes,
monitoring_script = monitoring_script
}
}
Expand All @@ -116,6 +121,13 @@ workflow JointVcfFiltering {
Array[File] scored_vcf_idxs = ScoreVariantAnnotations.scored_vcf_idx
Array[File?] annotations_hdf5s = ScoreVariantAnnotations.annotations_hdf5
Array[File?] scores_hdf5s = ScoreVariantAnnotations.scores_hdf5

Array[File?] monitoring_logs = flatten(
[
[ExtractVariantAnnotations.monitoring_log],
[TrainVariantAnnotationsModel.monitoring_log],
ScoreVariantAnnotations.monitoring_log
])
}
}

Expand All @@ -127,6 +139,7 @@ task ExtractVariantAnnotations {
Array[String] annotations
String resource_args
String? extra_args
File? monitoring_script

String gatk_docker
File? gatk_override
Expand All @@ -143,6 +156,10 @@ task ExtractVariantAnnotations {
set -e
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

if [ -s ~{monitoring_script} ]; then
bash ~{monitoring_script} > monitoring.log &
fi

gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
ExtractVariantAnnotations \
-V ~{input_vcf} \
Expand All @@ -167,6 +184,7 @@ task ExtractVariantAnnotations {
File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5"
File extracted_vcf = "~{output_prefix}.extract.vcf.gz" # this line will break if extra_args includes the do-not-gzip-vcf-output argument
File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument
File? monitoring_log = "monitoring.log"
}
}

Expand All @@ -179,6 +197,7 @@ task TrainVariantAnnotationsModel {
File? hyperparameters_json
String output_prefix
String? extra_args
File? monitoring_script

String gatk_docker
File? gatk_override
Expand All @@ -190,6 +209,10 @@ task TrainVariantAnnotationsModel {
set -e
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

if [ -s ~{monitoring_script} ]; then
bash ~{monitoring_script} > monitoring.log &
fi

gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
TrainVariantAnnotationsModel \
--annotations-hdf5 ~{annotations_hdf5} \
Expand All @@ -213,6 +236,7 @@ task TrainVariantAnnotationsModel {
output {
Array[File] model_files = glob("~{output_prefix}.train.*")
File? monitoring_log = "monitoring.log"
}
}

Expand All @@ -228,6 +252,7 @@ task ScoreVariantAnnotations {
String model_prefix
Array[File] model_files
String? extra_args
File? monitoring_script

String gatk_docker
File? gatk_override
Expand All @@ -245,6 +270,10 @@ task ScoreVariantAnnotations {
command {
set -e

if [ -s ~{monitoring_script} ]; then
bash ~{monitoring_script} > monitoring.log &
fi

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

mkdir model-files
Expand Down Expand Up @@ -276,5 +305,6 @@ task ScoreVariantAnnotations {
File scored_vcf_idx = "~{output_prefix}.score.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument
File? annotations_hdf5 = "~{output_prefix}.score.annot.hdf5" # this file will only be produced if the number of sites scored is nonzero
File? scores_hdf5 = "~{output_prefix}.score.scores.hdf5" # this file will only be produced if the number of sites scored is nonzero
File? monitoring_log = "monitoring.log"
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.JobStatistics;
import com.google.cloud.bigquery.TableResult;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.*;
import io.grpc.StatusRuntimeException;
Expand Down Expand Up @@ -451,6 +451,19 @@ public static StorageAPIAvroReaderAndBigQueryStatistics executeQueryWithStorageA
}

/**
 * Returns true iff at least one row exists in {@code projectID.datasetName.tableName}
 * whose {@code columnName} equals the given String {@code value}.
 *
 * @throws GATKException if the COUNT(*) query unexpectedly returns no rows at all.
 */
public static boolean doRowsExistFor(String projectID, String datasetName, String tableName, String columnName, String value) {
// NOTE(review): query is assembled by String.format with the raw value inlined in
// single quotes — vulnerable to SQL injection / breakage if `value` contains a quote.
// Prefer BigQuery named query parameters if the executeQuery API supports them.
String template = "SELECT COUNT(*) FROM `%s.%s.%s` WHERE %s = '%s'";
String query = String.format(template, projectID, datasetName, tableName, columnName, value);

BigQueryResultAndStatistics resultAndStatistics = BigQueryUtils.executeQuery(projectID, query, true, null);
// A COUNT(*) query yields exactly one row; return on the first (only) row seen.
for (final FieldValueList row : resultAndStatistics.result.iterateAll()) {
final long count = row.get(0).getLongValue();
return count != 0;
}
// Defensive: reaching here means the result set was empty, which should be impossible
// for COUNT(*); surface it loudly rather than guessing an answer.
throw new GATKException(String.format("No rows returned from count of `%s.%s.%s` for %s = '%s'",
projectID, datasetName, tableName, columnName, value));
}

public static boolean doRowsExistFor(String projectID, String datasetName, String tableName, String columnName, Long value) {
String template = "SELECT COUNT(*) FROM `%s.%s.%s` WHERE %s = %s";
String query = String.format(template, projectID, datasetName, tableName, columnName, value);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import htsjdk.samtools.util.CloseableIterator;
import org.apache.avro.generic.GenericRecord;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.storage.v1.AvroRows;
import com.google.cloud.bigquery.storage.v1.BigQueryReadClient;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.JobStatistics;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.common.collect.ImmutableList;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,9 @@ their names (or descriptions) depend on some threshold. Those filters are not i
public static final String VQSR_FAILURE_PREFIX = "low_VQSLOD_";
public static final String VQSR_FAILURE_SNP = VQSR_FAILURE_PREFIX + SNP;
public static final String VQSR_FAILURE_INDEL = VQSR_FAILURE_PREFIX + INDEL;
public static final String VQS_SENS_FAILURE_PREFIX = "low_VQS_SENS_";
// Prefix for a site (SNP/INDEL) that failed calibration sensitivity cutoff. In this case, the site would be a
// failure if the sensitivity is greater than the threshold.
public static final String VQS_SENS_FAILURE_PREFIX = "high_VQS_SENS_";
public static final String VQS_SENS_FAILURE_SNP = VQS_SENS_FAILURE_PREFIX + SNP;
public static final String VQS_SENS_FAILURE_INDEL = VQS_SENS_FAILURE_PREFIX + INDEL;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.FieldValueList;
import org.apache.avro.generic.GenericRecord;
Expand Down

0 comments on commit daeb3e2

Please sign in to comment.