Skip to content

Commit

Permalink
Merge branch 'master' into fix/combine_metaphlan2_humann2
Browse files Browse the repository at this point in the history
  • Loading branch information
bernt-matthias authored Jan 25, 2021
2 parents 9c26253 + dc55dc3 commit a5f349f
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 34,812 deletions.
103 changes: 47 additions & 56 deletions tools/combine_metaphlan2_humann2/combine_metaphlan2_humann2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import argparse


def extract_clade_abundance(metaphlan2_filepath):
clade_abundance = {}
with open(metaphlan2_filepath, 'r') as metaphlan2_file:
for line in metaphlan2_file.readlines():
def extract_clade_abundance(metaphlan2_fp):
clade_abund = {}
with open(metaphlan2_fp, 'r') as metaphlan2_f:
for line in metaphlan2_f.readlines():
if line.find('g__') == -1:
continue

Expand All @@ -18,23 +18,23 @@ def extract_clade_abundance(metaphlan2_filepath):
genus = taxo[(taxo.find('g__')+3):]
if genus.find('|') != -1:
genus = genus[:(genus.find('|'))]
clade_abundance.setdefault(genus, {'abundance': 0, 'species': {}})
clade_abund.setdefault(genus, {'abundance': 0, 'species': {}})
if taxo.find('t__') != -1:
continue
elif taxo.find('s__') != -1:
species = taxo[(taxo.find('s__')+3):]
clade_abundance[genus]['species'].setdefault(
clade_abund[genus]['species'].setdefault(
species,
abundance)
else:
clade_abundance[genus]['abundance'] = abundance
return clade_abundance
clade_abund[genus]['abundance'] = abundance
return clade_abund


def compute_overall_abundance(humann2_file):
def compute_overall_abundance(humann2_fp):
overall_abundance = 0
with open(args.humann2_file, 'r') as humann2_file:
for line in humann2_file.readlines():
with open(humann2_fp, 'r') as humann2_f:
for line in humann2_f.readlines():
if line.find('|') != -1 or line.startswith('#'):
continue
split_line = line[:-1].split('\t')
Expand All @@ -43,72 +43,63 @@ def compute_overall_abundance(humann2_file):


def format_characteristic_name(name):
format_name = name
format_name = format_name.replace('/', ' ')
format_name = format_name.replace('-', ' ')
format_name = format_name.replace("'", '')
if format_name.find('(') != -1 and format_name.find(')') != -1:
open_bracket = format_name.find('(')
close_bracket = format_name.find(')')+1
format_name = format_name[:open_bracket] + format_name[close_bracket:]
return format_name
formatted_n = name
formatted_n = formatted_n.replace('/', ' ')
formatted_n = formatted_n.replace('-', ' ')
formatted_n = formatted_n.replace("'", '')
if formatted_n.find('(') != -1 and formatted_n.find(')') != -1:
open_bracket = formatted_n.find('(')
close_bracket = formatted_n.find(')')+1
formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
return formatted_n


def combine_metaphlan2_humann2(args):
clade_abundance = extract_clade_abundance(args.metaphlan2_file)
overall_abundance = compute_overall_abundance(args.humann2_file)

with open(args.output_file, 'w') as output_file:
string = 'genus\t'
string += 'genus_abundance\t'
string += 'species\t'
string += 'species_abundance\t'
string += args.type + '_id\t'
string += args.type + '_name\t'
string += args.type + '_abundance\n'
output_file.write(string)
with open(args.humann2_file, 'r') as humann2_file:
for line in humann2_file.readlines():
clade_abund = extract_clade_abundance(args.metaphlan2_fp)
overall_abund = compute_overall_abundance(args.humann2_fp)

with open(args.output_fp, 'w') as output_f:
s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t'
s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type)
output_f.write(s)
with open(args.humann2_fp, 'r') as humann2_f:
for line in humann2_f.readlines():
if line.find('|') == -1:
continue

split_line = line[:-1].split('\t')
abundance = 100*float(split_line[1])/overall_abundance
abundance = 100*float(split_line[1])/overall_abund
annotation = split_line[0].split('|')
characteristic = annotation[0].split(':')
characteristic_id = characteristic[0]
charact = annotation[0].split(':')
charact_id = charact[0]
char_name = ''
if len(characteristic) > 1:
char_name = format_characteristic_name(characteristic[-1])
if len(charact) > 1:
char_name = format_characteristic_name(charact[-1])
taxo = annotation[1].split('.')

if taxo[0] == 'unclassified':
continue
genus = taxo[0][3:]
species = taxo[1][3:]

if genus not in clade_abundance:
print("no" + genus + "found in" + args.metaphlan2_file)
if genus not in clade_abund:
print("no %s found in %s" % (genus, args.metaphlan2_fp))
continue
if species not in clade_abundance[genus]['species']:
string = "no" + species + "found in" + args.metaphlan2_file
string += "for" + genus
print(string)
if species not in clade_abund[genus]['species']:
print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus))
continue
string = genus + '\t'
string += clade_abundance[genus]['abundance'] + '\t'
string += species + '\t'
string += clade_abundance[genus]['species'][species] + '\t'
string += characteristic_id + '\t'
string += char_name + '\t'
string += str(abundance) + '\n'
output_file.write(string)

s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance'])
s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species])
s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
output_f.write(s)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--humann2_file', required=True)
parser.add_argument('--metaphlan2_file', required=True)
parser.add_argument('--output_file', required=True)
parser.add_argument('--humann2_fp', required=True)
parser.add_argument('--metaphlan2_fp', required=True)
parser.add_argument('--output_fp', required=True)
parser.add_argument(
'--type',
required=True,
Expand Down
53 changes: 30 additions & 23 deletions tools/combine_metaphlan2_humann2/combine_metaphlan2_humann2.xml
Original file line number Diff line number Diff line change
@@ -1,41 +1,33 @@
<tool id="combine_metaphlan2_humann2" name="Combine MetaPhlAn2 and HUMAnN2 outputs" version="0.1.0">
<tool id="combine_metaphlan2_humann2" name="Combine MetaPhlAn2 and HUMAnN2 outputs" version="0.2.0">
<description>to relate genus/species abundances and gene families/pathways abundances</description>

<requirements>
<requirement type="package" version="2.7.18">python</requirement>
</requirements>

<stdio>
<exit_code range="1:" />
<exit_code range=":-1" />
</stdio>

<version_command></version_command>

<command><![CDATA[
python '$__tool_directory__/combine_metaphlan2_humann2.py'
--metaphlan2_file '$metaphlan2_file'
--humann2_file '$humann2_file'
--type $type
--metaphlan2_fp '$metaphlan2_file'
--humann2_fp '$humann2_file'
--type '$type'
#if str($type) == 'gene_families'
--output_file '$gene_families_output_file'
--output_fp '$gene_families_output_file'
#else
--output_file '$pathway_output_file'
--output_fp '$pathway_output_file'
#end if
]]></command>

<inputs>
<param name="metaphlan2_file" format="txt,tabular" type="data" label="Input file corresponding to MetaPhlAn2 output" help="The MetaPhlAn2 output file contains relative abundance of clades at different taxonomic levels (--metaphlan2_file)"/>

<param name="humann2_file" format="txt,tabular" type="data" label="Input file corresponding to HUMAnN2 output" help="The HUMAnN2 output file contains relative abundance of gene families or pathways with corresponding taxonomic stratification (--humann2_file)"/>

<param name='type' type="select" label="Type of characteristics in HUMAnN2 file" help="(--type)">
<option value="gene_families" selected="true">Gene families</option>
<option value="pathways">Pathways</option>
</param>
</inputs>

<outputs>
<data name="gene_families_output_file" format="tabular"
label="${tool.name} on ${on_string}: Gene family abundances related to genus/species abundances" >
Expand All @@ -46,30 +38,45 @@
<filter>type=="pathways"</filter>
</data>
</outputs>

<tests>
<test>
<param name="metaphlan2_file" value="metaphlan2_input.txt"/>
<param name="humann2_file" value="humann2_gene_families_input.tabular"/>
<param name='type' value="gene_families"/>
<output name="gene_families_output_file" file="gene_families_output.tabular"/>
<param name="metaphlan2_file" value="metaphlan2_input.txt"/>
<param name="humann2_file" value="humann2_gene_families_input.tabular"/>
<param name='type' value="gene_families"/>
<output name="gene_families_output_file">
<assert_contents>
<has_n_columns n="8"/>
<has_n_lines n="29434"/>
<has_text text="Staphylococcus_epidermidis"/>
<has_text text="Putative transposon Tn552 DNA invertase bin3"/>
<has_size value="3467947"/>
</assert_contents>
</output>
</test>
<test>
<param name="metaphlan2_file" value="metaphlan2_input.txt"/>
<param name="humann2_file" value="humann2_pathways_input.tabular"/>
<param name='type' value="pathways"/>
<output name="pathway_output_file" file="pathways_output.tabular"/>
<output name="pathway_output_file">
<assert_contents>
<has_n_columns n="8"/>
<has_n_lines n="1533"/>
<has_text text="Rhodobacter_sphaeroides"/>
<has_text text="superpathway of acetyl CoA biosynthesis"/>
<has_size value="186363"/>
</assert_contents>
</output>
</test>
</tests>

<help><![CDATA[
**What it does**
This tool combine MetaPhlAn2 outputs and HUMANnN2 outputs.
For each gene families/pathways and the corresponding taxonomic stratification, you get relative abundance of this gene family/pathway and the relative abundance of corresponding species and genus.
For each gene families/pathways and the corresponding taxonomic stratification,
you get relative abundance of this gene family/pathway and the relative abundance
of corresponding species and genus.
]]></help>

<citations>
</citations>
</tool>
Loading

0 comments on commit a5f349f

Please sign in to comment.