Skip to content

Commit

Permalink
seqkit grep multi
Browse files Browse the repository at this point in the history
  • Loading branch information
Joon-Klaps committed Sep 28, 2023
1 parent 12d2274 commit 7497270
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 45 deletions.
4 changes: 2 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ process {
path: { "${params.outdir}/assembly/polishing/intermediate/cluster/members" },
mode: params.publish_dir_mode,
enabled: params.save_intermediate_polishing,
pattern: "*.fa.gz"
pattern: "*.fa"
]
}

Expand All @@ -391,7 +391,7 @@ process {
path: { "${params.outdir}/assembly/polishing/intermediate/cluster/centroids" },
mode: params.publish_dir_mode,
enabled: params.save_intermediate_polishing,
pattern: "*.fa.gz"
pattern: "*.fa"
]
}

Expand Down
63 changes: 63 additions & 0 deletions modules/local/seqkit_grep_multi.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
process SEQKIT_GREP_MULTI {
    tag "$meta.id"
    label 'process_low'

    conda "bioconda::seqkit=2.4.0"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0':
        'biocontainers/seqkit:2.4.0--h9ee0642_0' }"

    // Modified from the original nf-core seqkit/grep module: runs `seqkit grep`
    // once per pattern file, producing one filtered sequence file per pattern file.
    input:
    tuple val(meta), path(txt_pattern_files)   // sample meta + one or more ID-list files
    path sequence                              // sequence database to extract from

    output:
    tuple val(meta), path("*.{fa,fq}") , emit: filter     // one file per pattern file
    path "versions.yml"                , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // fasta or fastq output extension: exact match on .fa/.fasta/.fna/... with an
    // optional .gz suffix (dot escaped so it only matches a literal ".gz").
    def suffix = task.ext.suffix ?: "${sequence}" ==~ /(.*f[astn]*a(\.gz)?$)/ ? "fa" : "fq"

    // NOTE: plain interpolation of txt_pattern_files works for both a single
    // staged path and a list of paths (a lone Path has no join() method).
    """
    for file in ${txt_pattern_files};
    do
        prefix=\$( basename \${file} | sed 's/\\.[^.]*\$//' )
        echo "Processing file: \${prefix} .."
        if seqkit \\
            grep \\
            $args \\
            --threads $task.cpus \\
            -f \${file} \\
            ${sequence} \\
            -o \$prefix.${suffix}
        then
            echo "Done"
        else
            # Propagate the failure so Nextflow marks the task as failed
            # instead of silently producing incomplete output.
            echo "seqkit grep failed for \${file}" >&2
            exit 1
        fi
    done
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        seqkit: \$( seqkit version | sed 's/seqkit v//' )
    END_VERSIONS
    """

    stub:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Same extension logic as the script block, kept in sync.
    def suffix = task.ext.suffix ?: "${sequence}" ==~ /(.*f[astn]*a(\.gz)?$)/ ? "fa" : "fq"

    """
    touch ${prefix}.${suffix}
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        seqkit: \$( seqkit version | sed 's/seqkit v//' )
    END_VERSIONS
    """
}
65 changes: 24 additions & 41 deletions subworkflows/local/clust_seq_extract.nf
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
// modules
include { CLUSTER_EXTRACT } from '../../modules/local/cluster_extract'
include { SEQKIT_GREP as SEQKIT_GREP_MEMBERS } from '../../modules/nf-core/seqkit/grep/main'
include { SEQKIT_GREP as SEQKIT_GREP_CENTROIDS } from '../../modules/nf-core/seqkit/grep/main'
include { GUNZIP as GUNZIP_MEMBERS } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_CENTROIDS } from '../../modules/nf-core/gunzip/main'

include { SEQKIT_GREP_MULTI as SEQKIT_GREP_MEMBERS } from '../../modules/local/seqkit_grep_multi'
include { SEQKIT_GREP_MULTI as SEQKIT_GREP_CENTROIDS } from '../../modules/local/seqkit_grep_multi'

workflow CLUST_SEQ_EXTRACT {

Expand All @@ -21,38 +18,24 @@ workflow CLUST_SEQ_EXTRACT {
CLUSTER_EXTRACT(ch_clusters, cluster_method)
ch_versions = ch_versions.mix(CLUSTER_EXTRACT.out.versions.first())

// join with sequence data based on meta - transpose to a long format so we can split up the channels
CLUSTER_EXTRACT.out.members_centroids
.join(db_seq)
.transpose() //wide to long
.set { members_centroids_transposed }

// Update the meta data to include the cluster ID
ch_members_centroids = members_centroids_transposed.map { create_member_ref_channel(it) }
members_centroids = CLUSTER_EXTRACT.out.members_centroids
// [sample_meta, [members], [centroids] ]

// split up members, centroids and db for downstream processing
ch_members_centroids
.map { [ it[1] ] }
.set { ch_members }
ch_members_centroids
.map { [ it[2] ] }
.set { ch_centroids }
ch_members_centroids
.map { [ it[0], it[3] ] }
.set { ch_db_seq_anno }
ch_centroids = members_centroids.map {meta, members, centroids -> [meta, centroids] }
ch_members = members_centroids.map {meta, members, centroids -> [meta, members] }

SEQKIT_GREP_MEMBERS(ch_db_seq_anno,ch_members )
GUNZIP_MEMBERS(SEQKIT_GREP_MEMBERS.out.filter)

SEQKIT_GREP_CENTROIDS(ch_db_seq_anno,ch_centroids )
GUNZIP_CENTROIDS(SEQKIT_GREP_CENTROIDS.out.filter)
// extract members and centroids from db

SEQKIT_GREP_MEMBERS (ch_members, db_seq.map { it[1] } )
ch_versions = ch_versions.mix(SEQKIT_GREP_MEMBERS.out.versions.first())

SEQKIT_GREP_CENTROIDS( ch_centroids, db_seq.map { it[1] })
ch_versions = ch_versions.mix(SEQKIT_GREP_CENTROIDS.out.versions.first())
ch_versions = ch_versions.mix(GUNZIP_CENTROIDS.out.versions.first())

GUNZIP_CENTROIDS.out.gunzip
.join(GUNZIP_MEMBERS.out.gunzip, remainder: true)
SEQKIT_GREP_CENTROIDS.out.filter
.join(SEQKIT_GREP_MEMBERS.out.filter, remainder: true)
.transpose() //wide to long
.map { create_member_ref_channel(it) }
.set { seq_centroids_members }

emit:
Expand All @@ -62,19 +45,19 @@ workflow CLUST_SEQ_EXTRACT {

// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def create_member_ref_channel(ArrayList row) {
members = row[1]
centroids = row[2]
sequence = row[3]
meta = row[0]
centroids = row[1]
members = row[2]

sample = String.valueOf(row[0].id) // just to make sure we don't pass by reference
regex = (members =~ /.*\/.*_([0-9]+)_n([0-9]+)_members.txt/) // try and extract the correct cluster ID and size associated to the sample
cluster = regex[0][1]
cluster_size = regex[0][2] as int
id = "${sample}_${cluster}"
sample = String.valueOf(meta.id) // just to make sure we don't pass by reference
regex = (members =~ /.*\/.*_([0-9]+)_n([0-9]+)_members/) // try and extract the correct cluster ID and size associated to the sample
cluster = regex[0][1]
cluster_size = regex[0][2] as int
id = "${sample}_${cluster}"

new_meta = row[0] + [ id: id, cluster: cluster, cluster_size: cluster_size, sample: sample]
new_meta = meta + [ id: id, cluster: cluster, cluster_size: cluster_size, sample: sample]

def result = [ new_meta, members, centroids, sequence]
def result = [ new_meta, centroids, members]
return result
}

2 changes: 0 additions & 2 deletions subworkflows/local/fastq_spades_trinity_megahit.nf
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ workflow FASTQ_SPADES_TRINITY_MEGAHIT {
.groupTuple()
.set{ch_scaffolds_combined}

ch_scaffolds_combined.view()

CAT_CAT(ch_scaffolds_combined)
ch_versions = CAT_CAT.out.versions.first()

Expand Down

0 comments on commit 7497270

Please sign in to comment.