Merge branch 'master' into fix/combine_metaphlan2_humann2

ASaiM · Jan 25, 2021 · a5f349f · a5f349f
2 parents 9c26253 + dc55dc3
commit a5f349f
Show file tree

Hide file tree

Showing 10 changed files with 186 additions and 34,812 deletions.
diff --git a/tools/combine_metaphlan2_humann2/combine_metaphlan2_humann2.py b/tools/combine_metaphlan2_humann2/combine_metaphlan2_humann2.py
@@ -4,10 +4,10 @@
 import argparse
 
 
-def extract_clade_abundance(metaphlan2_filepath):
-    clade_abundance = {}
-    with open(metaphlan2_filepath, 'r') as metaphlan2_file:
-        for line in metaphlan2_file.readlines():
+def extract_clade_abundance(metaphlan2_fp):
+    clade_abund = {}
+    with open(metaphlan2_fp, 'r') as metaphlan2_f:
+        for line in metaphlan2_f.readlines():
             if line.find('g__') == -1:
                 continue
 
@@ -18,23 +18,23 @@ def extract_clade_abundance(metaphlan2_filepath):
             genus = taxo[(taxo.find('g__')+3):]
             if genus.find('|') != -1:
                 genus = genus[:(genus.find('|'))]
-            clade_abundance.setdefault(genus, {'abundance': 0, 'species': {}})
+            clade_abund.setdefault(genus, {'abundance': 0, 'species': {}})
             if taxo.find('t__') != -1:
                 continue
             elif taxo.find('s__') != -1:
                 species = taxo[(taxo.find('s__')+3):]
-                clade_abundance[genus]['species'].setdefault(
+                clade_abund[genus]['species'].setdefault(
                     species,
                     abundance)
             else:
-                clade_abundance[genus]['abundance'] = abundance
-    return clade_abundance
+                clade_abund[genus]['abundance'] = abundance
+    return clade_abund
 
 
-def compute_overall_abundance(humann2_file):
+def compute_overall_abundance(humann2_fp):
     overall_abundance = 0
-    with open(args.humann2_file, 'r') as humann2_file:
-        for line in humann2_file.readlines():
+    with open(humann2_fp, 'r') as humann2_f:
+        for line in humann2_f.readlines():
             if line.find('|') != -1 or line.startswith('#'):
                 continue
             split_line = line[:-1].split('\t')
@@ -43,72 +43,63 @@ def compute_overall_abundance(humann2_file):
 
 
 def format_characteristic_name(name):
-    format_name = name
-    format_name = format_name.replace('/', ' ')
-    format_name = format_name.replace('-', ' ')
-    format_name = format_name.replace("'", '')
-    if format_name.find('(') != -1 and format_name.find(')') != -1:
-        open_bracket = format_name.find('(')
-        close_bracket = format_name.find(')')+1
-        format_name = format_name[:open_bracket] + format_name[close_bracket:]
-    return format_name
+    formatted_n = name
+    formatted_n = formatted_n.replace('/', ' ')
+    formatted_n = formatted_n.replace('-', ' ')
+    formatted_n = formatted_n.replace("'", '')
+    if formatted_n.find('(') != -1 and formatted_n.find(')') != -1:
+        open_bracket = formatted_n.find('(')
+        close_bracket = formatted_n.find(')')+1
+        formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
+    return formatted_n
 
 
 def combine_metaphlan2_humann2(args):
-    clade_abundance = extract_clade_abundance(args.metaphlan2_file)
-    overall_abundance = compute_overall_abundance(args.humann2_file)
-
-    with open(args.output_file, 'w') as output_file:
-        string = 'genus\t'
-        string += 'genus_abundance\t'
-        string += 'species\t'
-        string += 'species_abundance\t'
-        string += args.type + '_id\t'
-        string += args.type + '_name\t'
-        string += args.type + '_abundance\n'
-        output_file.write(string)
-        with open(args.humann2_file, 'r') as humann2_file:
-            for line in humann2_file.readlines():
+    clade_abund = extract_clade_abundance(args.metaphlan2_fp)
+    overall_abund = compute_overall_abundance(args.humann2_fp)
+
+    with open(args.output_fp, 'w') as output_f:
+        s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t'
+        s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type)
+        output_f.write(s)
+        with open(args.humann2_fp, 'r') as humann2_f:
+            for line in humann2_f.readlines():
                 if line.find('|') == -1:
                     continue
 
                 split_line = line[:-1].split('\t')
-                abundance = 100*float(split_line[1])/overall_abundance
+                abundance = 100*float(split_line[1])/overall_abund
                 annotation = split_line[0].split('|')
-                characteristic = annotation[0].split(':')
-                characteristic_id = characteristic[0]
+                charact = annotation[0].split(':')
+                charact_id = charact[0]
                 char_name = ''
-                if len(characteristic) > 1:
-                    char_name = format_characteristic_name(characteristic[-1])
+                if len(charact) > 1:
+                    char_name = format_characteristic_name(charact[-1])
                 taxo = annotation[1].split('.')
 
                 if taxo[0] == 'unclassified':
                     continue
                 genus = taxo[0][3:]
                 species = taxo[1][3:]
 
-                if genus not in clade_abundance:
-                    print("no" + genus + "found in" + args.metaphlan2_file)
+                if genus not in clade_abund:
+                    print("no %s found in %s" % (genus, args.metaphlan2_fp))
                     continue
-                if species not in clade_abundance[genus]['species']:
-                    string = "no" + species + "found in" + args.metaphlan2_file
-                    string += "for" + genus
-                    print(string)
+                if species not in clade_abund[genus]['species']:
+                    print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus))
                     continue
-                string = genus + '\t'
-                string += clade_abundance[genus]['abundance'] + '\t'
-                string += species + '\t'
-                string += clade_abundance[genus]['species'][species] + '\t'
-                string += characteristic_id + '\t'
-                string += char_name + '\t'
-                string += str(abundance) + '\n'
-                output_file.write(string)
+
+                s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance'])
+                s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species])
+                s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
+                output_f.write(s)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--humann2_file', required=True)
-    parser.add_argument('--metaphlan2_file', required=True)
-    parser.add_argument('--output_file', required=True)
+    parser.add_argument('--humann2_fp', required=True)
+    parser.add_argument('--metaphlan2_fp', required=True)
+    parser.add_argument('--output_fp', required=True)
     parser.add_argument(
         '--type',
         required=True,

diff --git a/tools/combine_metaphlan2_humann2/combine_metaphlan2_humann2.xml b/tools/combine_metaphlan2_humann2/combine_metaphlan2_humann2.xml
@@ -1,41 +1,33 @@
-<tool id="combine_metaphlan2_humann2" name="Combine MetaPhlAn2 and HUMAnN2 outputs" version="0.1.0">
+<tool id="combine_metaphlan2_humann2" name="Combine MetaPhlAn2 and HUMAnN2 outputs" version="0.2.0">
     <description>to relate genus/species abundances and gene families/pathways abundances</description>
-
     <requirements>
         <requirement type="package" version="2.7.18">python</requirement>
     </requirements>
-
     <stdio>
         <exit_code range="1:" />
         <exit_code range=":-1" />
     </stdio>
-
     <version_command></version_command>
-
     <command><![CDATA[
         python '$__tool_directory__/combine_metaphlan2_humann2.py'
-            --metaphlan2_file '$metaphlan2_file'
-            --humann2_file '$humann2_file'
-            --type $type    
+            --metaphlan2_fp '$metaphlan2_file'
+            --humann2_fp '$humann2_file'
+            --type '$type'   
 
             #if str($type) == 'gene_families'
-                --output_file '$gene_families_output_file'
+                --output_fp '$gene_families_output_file'
             #else
-                --output_file '$pathway_output_file'
+                --output_fp '$pathway_output_file'
             #end if
     ]]></command>
-
     <inputs>
         <param name="metaphlan2_file" format="txt,tabular" type="data" label="Input file corresponding to MetaPhlAn2 output" help="The MetaPhlAn2 output file contains relative abundance of clades at different taxonomic levels (--metaphlan2_file)"/>
-
         <param name="humann2_file" format="txt,tabular" type="data" label="Input file corresponding to HUMAnN2 output" help="The HUMAnN2 output file contains relative abundance of gene families or pathways with corresponding taxonomic stratification (--humann2_file)"/>
-
         <param name='type' type="select" label="Type of characteristics in HUMAnN2 file" help="(--type)">
             <option value="gene_families" selected="true">Gene families</option>
             <option value="pathways">Pathways</option>
         </param>
     </inputs>
-
     <outputs>
         <data name="gene_families_output_file" format="tabular"
             label="${tool.name} on ${on_string}: Gene family abundances related to genus/species abundances" >
@@ -46,30 +38,45 @@
             <filter>type=="pathways"</filter>
         </data>
     </outputs>
-
     <tests>
     	<test>
-    		 <param name="metaphlan2_file" value="metaphlan2_input.txt"/>
-    		 <param name="humann2_file" value="humann2_gene_families_input.tabular"/>
-    		 <param name='type' value="gene_families"/>
-    		 <output name="gene_families_output_file" file="gene_families_output.tabular"/>
+    		<param name="metaphlan2_file" value="metaphlan2_input.txt"/>
+    		<param name="humann2_file" value="humann2_gene_families_input.tabular"/>
+    		<param name='type' value="gene_families"/>
+    		<output name="gene_families_output_file">
+                <assert_contents>
+                    <has_n_columns n="8"/>
+                    <has_n_lines n="29434"/>
+                    <has_text text="Staphylococcus_epidermidis"/>
+                    <has_text text="Putative transposon Tn552 DNA invertase bin3"/>
+                    <has_size value="3467947"/>
+                </assert_contents>
+            </output>
     	</test>
     	<test>
     		 <param name="metaphlan2_file" value="metaphlan2_input.txt"/>
     		 <param name="humann2_file" value="humann2_pathways_input.tabular"/>
     		 <param name='type' value="pathways"/>
-    		 <output name="pathway_output_file" file="pathways_output.tabular"/>
+             <output name="pathway_output_file">
+                <assert_contents>
+                    <has_n_columns n="8"/>
+                    <has_n_lines n="1533"/>
+                    <has_text text="Rhodobacter_sphaeroides"/>
+                    <has_text text="superpathway of acetyl CoA biosynthesis"/>
+                    <has_size value="186363"/>
+                </assert_contents>
+            </output>
     	</test>
     </tests>
-
     <help><![CDATA[
 **What it does**
 
 This tool combine MetaPhlAn2 outputs and HUMANnN2 outputs.
 
-For each gene families/pathways and the corresponding taxonomic stratification, you get relative abundance of this gene family/pathway and the relative abundance of corresponding species and genus.
+For each gene families/pathways and the corresponding taxonomic stratification, 
+you get relative abundance of this gene family/pathway and the relative abundance 
+of corresponding species and genus.
     ]]></help>
-
     <citations>
     </citations>
 </tool>