diff --git a/conf/modules.config b/conf/modules.config index 576e579e..40481af6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -381,7 +381,7 @@ process { path: { "${params.outdir}/assembly/polishing/intermediate/cluster/members" }, mode: params.publish_dir_mode, enabled: params.save_intermediate_polishing, - pattern: "*.fa.gz" + pattern: "*.fa" ] } @@ -391,7 +391,7 @@ process { path: { "${params.outdir}/assembly/polishing/intermediate/cluster/centroids" }, mode: params.publish_dir_mode, enabled: params.save_intermediate_polishing, - pattern: "*.fa.gz" + pattern: "*.fa" ] } diff --git a/modules/local/seqkit_grep_multi.nf b/modules/local/seqkit_grep_multi.nf new file mode 100644 index 00000000..081daba9 --- /dev/null +++ b/modules/local/seqkit_grep_multi.nf @@ -0,0 +1,63 @@ +process SEQKIT_GREP_MULTI { + tag "$meta.id" + label 'process_low' + + conda "bioconda::seqkit=2.4.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0': + 'biocontainers/seqkit:2.4.0--h9ee0642_0' }" + + // Modified from original nf-core file + input: + tuple val(meta), path(txt_pattern_files) + path sequence + + output: + tuple val(meta), path("*.{fa,fq}") , emit: filter + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // fasta or fastq. Exact pattern match .fasta or .fa suffix with optional .gz (gzip) suffix + def suffix = task.ext.suffix ?: "${sequence}" ==~ /(.*f[astn]*a(.gz)?$)/ ? "fa" : "fq" + + """ + for file in ${txt_pattern_files.join(' ')}; + do + prefix=\$( basename \${file} | sed 's/\\.[^.]*\$//' ) + echo "Processing file: \${prefix} .." + seqkit \\ + grep \\ + $args \\ + --threads $task.cpus \\ + -f \${file} \\ + ${sequence} \\ + -o \$prefix.${suffix} \\ + && echo "Done" || echo "Failed" + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit version | sed 's/seqkit v//' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // fasta or fastq. Exact pattern match .fasta or .fa suffix with optional .gz (gzip) suffix + def suffix = task.ext.suffix ?: "${sequence}" ==~ /(.*f[astn]*a(.gz)?$)/ ? "fa" : "fq" + + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit version | sed 's/seqkit v//' ) + END_VERSIONS + """ +} diff --git a/subworkflows/local/clust_seq_extract.nf b/subworkflows/local/clust_seq_extract.nf index 2b3af80d..21c6d695 100644 --- a/subworkflows/local/clust_seq_extract.nf +++ b/subworkflows/local/clust_seq_extract.nf @@ -1,10 +1,7 @@ // modules include { CLUSTER_EXTRACT } from '../../modules/local/cluster_extract' -include { SEQKIT_GREP as SEQKIT_GREP_MEMBERS } from '../../modules/nf-core/seqkit/grep/main' -include { SEQKIT_GREP as SEQKIT_GREP_CENTROIDS } from '../../modules/nf-core/seqkit/grep/main' -include { GUNZIP as GUNZIP_MEMBERS } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_CENTROIDS } from '../../modules/nf-core/gunzip/main' - +include { SEQKIT_GREP_MULTI as SEQKIT_GREP_MEMBERS } from '../../modules/local/seqkit_grep_multi' +include { SEQKIT_GREP_MULTI as SEQKIT_GREP_CENTROIDS } from '../../modules/local/seqkit_grep_multi' workflow CLUST_SEQ_EXTRACT { @@ -21,38 +18,24 @@ workflow CLUST_SEQ_EXTRACT { CLUSTER_EXTRACT(ch_clusters, cluster_method) ch_versions = ch_versions.mix(CLUSTER_EXTRACT.out.versions.first()) - // join with sequence data based on meta - transpose to a long format so we can split up the channels - CLUSTER_EXTRACT.out.members_centroids - .join(db_seq) - .transpose() //wide to long - .set { members_centroids_transposed } - - // Update the meta data to include the cluster ID - ch_members_centroids = members_centroids_transposed.map { create_member_ref_channel(it) } + members_centroids = CLUSTER_EXTRACT.out.members_centroids + // [sample_meta, [members], [centroids] ] - // split up members, centroids and db for downstream processing - ch_members_centroids - .map { [ it[1] ] } - .set { ch_members } - ch_members_centroids - .map { [ it[2] ] } - .set { ch_centroids } - ch_members_centroids - .map { [ it[0], it[3] ] } - .set { ch_db_seq_anno } + ch_centroids = members_centroids.map {meta, members, centroids -> [meta, centroids] } + ch_members = members_centroids.map {meta, members, centroids -> [meta, members] } - SEQKIT_GREP_MEMBERS(ch_db_seq_anno,ch_members ) - GUNZIP_MEMBERS(SEQKIT_GREP_MEMBERS.out.filter) - - SEQKIT_GREP_CENTROIDS(ch_db_seq_anno,ch_centroids ) - GUNZIP_CENTROIDS(SEQKIT_GREP_CENTROIDS.out.filter) + // extract members and centroids from db + SEQKIT_GREP_MEMBERS (ch_members, db_seq.map { it[1] } ) + ch_versions = ch_versions.mix(SEQKIT_GREP_MEMBERS.out.versions.first()) + SEQKIT_GREP_CENTROIDS( ch_centroids, db_seq.map { it[1] }) ch_versions = ch_versions.mix(SEQKIT_GREP_CENTROIDS.out.versions.first()) - ch_versions = ch_versions.mix(GUNZIP_CENTROIDS.out.versions.first()) - GUNZIP_CENTROIDS.out.gunzip - .join(GUNZIP_MEMBERS.out.gunzip, remainder: true) + SEQKIT_GREP_CENTROIDS.out.filter + .join(SEQKIT_GREP_MEMBERS.out.filter, remainder: true) + .transpose() //wide to long + .map { create_member_ref_channel(it) } .set { seq_centroids_members } emit: @@ -62,19 +45,19 @@ workflow CLUST_SEQ_EXTRACT { // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_member_ref_channel(ArrayList row) { - members = row[1] - centroids = row[2] - sequence = row[3] + meta = row[0] + centroids = row[1] + members = row[2] - sample = String.valueOf(row[0].id) // just to make sure we don't pass by reference - regex = (members =~ /.*\/.*_([0-9]+)_n([0-9]+)_members.txt/) // try and extract the correct cluster ID and size associated to the sample - cluster = regex[0][1] - cluster_size = regex[0][2] as int - id = "${sample}_${cluster}" + sample = String.valueOf(meta.id) // just to make sure we don't pass by reference + regex = (members =~ /.*\/.*_([0-9]+)_n([0-9]+)_members/) // try and extract the correct cluster ID and size associated to the sample + cluster = regex[0][1] + cluster_size = regex[0][2] as int + id = "${sample}_${cluster}" - new_meta = row[0] + [ id: id, cluster: cluster, cluster_size: cluster_size, sample: sample] + new_meta = meta + [ id: id, cluster: cluster, cluster_size: cluster_size, sample: sample] - def result = [ new_meta, members, centroids, sequence] + def result = [ new_meta, centroids, members] return result } diff --git a/subworkflows/local/fastq_spades_trinity_megahit.nf b/subworkflows/local/fastq_spades_trinity_megahit.nf index d131a26f..5f798c63 100644 --- a/subworkflows/local/fastq_spades_trinity_megahit.nf +++ b/subworkflows/local/fastq_spades_trinity_megahit.nf @@ -57,8 +57,6 @@ workflow FASTQ_SPADES_TRINITY_MEGAHIT { .groupTuple() .set{ch_scaffolds_combined} - ch_scaffolds_combined.view() - CAT_CAT(ch_scaffolds_combined) ch_versions = CAT_CAT.out.versions.first()