nf-core · Darcy220606 · Feb 4, 2025 · Feb 2, 2025 · Feb 2, 2025 · Feb 4, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts, updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606)
 - [#437](https://github.com/nf-core/funcscan/pull/437) Fixed file name error when supplying already preprocessed CARD database for ARG workflow. (by @jasmezz)
 - [#446](https://github.com/nf-core/funcscan/pull/446) Updated antiSMASH modules to fix apptainer execution. (by @jasmezz and @jfy133)
+- [#448](https://github.com/nf-core/funcscan/pull/448) Fixed taxonomy merge to work with output from GTDB/SILVA/KALAMARI. (by @darcy220606)
 
 ### `Dependencies`
 

diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py
@@ -68,16 +68,23 @@
 def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
     """_summary_
     Reformats the taxonomy files and joins them in a list to be passed on to the tools functions
+    Note: The number of columns in the MMseqs output differs between databases; only the first 4 and last 2 columns
+            are constant, and they are the most important ones.
 
     Args:
         mmseqs_taxonomy (tsv): mmseqs output file per sample
 
     Returns:
         data frame: reformatted tables
     """
-    mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
+    col_numbers = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, nrows=1).shape[1]
+    selected_cols_numbers = [0, 1, 2, 3, col_numbers - 1]
+    mmseqs2_df = pd.read_csv(mmseqs_taxonomy,
+                                sep='\t',
+                                header=None,
+                                usecols= selected_cols_numbers,
+                                names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'mmseqs_lineage_contig'])
     # remove the lineage column
-    mmseqs2_df.drop('lineage', axis=1, inplace=True)
     mmseqs2_df['mmseqs_lineage_contig'].unique()
     # convert any classification that has Eukaryota/root to NaN as funcscan targets bacteria ONLY **
     for i, row in mmseqs2_df.iterrows():