qiita-spots · antgonza · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/README.rst b/README.rst
@@ -1,4 +1,20 @@
 Job Output Folder Data Type Plugin
 ==================================
 
-This is the data type plugin for Qiita jobs that only create a folder and the command is responsible of generating all the files.
+The goal of this Qiita type plugin is to validate and summarize any kind of folder output.
+
+Note that `job-output-folder` expects a single folder and this will become an artifact that will live in
+`[qiita-base-path]/job-output-folder/[artifact-id]/[output-folder]` and this plugin will generate:
+
+- `summary.html`: a browser friendly file listing that will include all files at `[artifact-id]/[output-folder]` and
+  any `index.html` files in any subfolder. As a reminder, the Qiita nginx basic configuration allows to display/load any
+  html/JS available files; thus, able to display properly `index.html` files available
+- `MANIFEST.txt`: a comprehensive list of all available files in the folder.
+
+The two main plugins using this output are:
+
+- https://github.com/qiita-spots/qp-knight-lab-processing: which will generate an `[output-folder]` contaning all the logs,
+  files and summaries from BCL to clean FASTQ processing. Note that multiqc resoults are part of this and the outputs are
+  properly displayed in Qiita using this method.
+- https://github.com/qiita-spots/qp-pacbio: `PacBio processing`, the output are MAG, LCG and other output, which will be used
+  for dowstream analyses.
diff --git a/qtp_job_output_folder/summary.py b/qtp_job_output_folder/summary.py
@@ -6,36 +6,64 @@
 # The full license is in the file LICENSE, distributed with this software.
 # -----------------------------------------------------------------------------
 
-from os.path import exists, isdir, join, dirname
-from glob import glob
 from json import dumps
+from os import sep, walk
+from os.path import basename, dirname, exists, isdir, join
 
 
 def _folder_listing(folder):
-    results = []
-    for f in glob(f'{folder}/*'):
-        if isdir(f):
-            results.append(('folder', f))
-            results.extend(_folder_listing(f'{f}/*'))
-        else:
-            results.append(('file', f))
-    return results
+    index, manifest = [], []
+    # only adding main files on top directory
+    # and index.html at any level
+    separator = "|--"
+    for dpath, _, files in walk(folder):
+        # assuring same order, mainly for testing
+        files.sort()
+
+        # if we are at the top, we should add
+        # all files
+        if dpath == folder:
+            for f in files:
+                index.append(("file", f"{dpath}/{f}"))
+        # if we are not at the top, we should only add
+        # the index.html files
+        elif "index.html" in files:
+            index.append(("file", f"{dpath}/index.html"))
+
+        depth = dpath.replace(folder, "").count(sep)
+        space = separator * depth
+        manifest.append(f"{space} {basename(dpath)}/")
+        for filename in files:
+            manifest.append(f"{space}{separator} {filename}")
+
+    return index, manifest
 
 
 def _generate_html_summary(jid, folder, out_dir):
-    summary = f'<h3><b>{folder}</b> does not exist.</h3>'
+    summary = f"<h3><b>{folder}</b> does not exist.</h3>"
+    manifest_fp = join(folder, "MANIFEST.txt")
+    index_fp = join(out_dir, "summary.html")
 
     if exists(folder) and isdir(folder):
         # calculating the "trimming" for the fullpaths, +1 is to remove /
         tname = len(dirname(folder)) + 1
         tlink = len(dirname(dirname(folder)))
-        summary = '<br/>\n'.join([
-            f'<a href=".{f[tlink:]}" type="{ft}" target="_blank">'
-            f'{f[tname:]}</a>'
-            for ft, f in _folder_listing(folder)])
+        link = '<a href=".%s" type="%s" target="_blank">%s</a>'
+        index, manifest = _folder_listing(folder)
 
-    index_fp = join(out_dir, "summary.html")
-    with open(index_fp, 'w') as of:
+        with open(manifest_fp, "w") as of:
+            of.write("\n".join(manifest))
+
+        links = [link % (manifest_fp[tlink:], "file", manifest_fp[tname:])]
+        for ft, f in index:
+            # to avoid any duplication of lines:
+            _link = link % (f[tlink:], ft, f[tname:])
+            if _link not in links:
+                links.append(_link)
+
+        summary = "<br/>\n".join(links)
+
+    with open(index_fp, "w") as of:
         of.write(summary)
 
     # we could add a support folder for the summary
@@ -69,22 +97,22 @@ def generate_html_summary(qclient, job_id, parameters, out_dir):
     # we are going to use the "raw" code for retrieving artifact_info vs. the
     # qiita_client.artifact_and_preparation_files method because this only
     # expects a single filepath
-    artifact_id = parameters['input_data']
+    artifact_id = parameters["input_data"]
     qclient_url = "/qiita_db/artifacts/%s/" % artifact_id
     artifact_info = qclient.get(qclient_url)
 
     # [0] there is only one directory
-    folder = artifact_info['files']['directory'][0]['filepath']
+    folder = artifact_info["files"]["directory"][0]["filepath"]
 
     # 2. Generate summary
     index_fp, viz_fp = _generate_html_summary(job_id, folder, out_dir)
 
     # Step 3: add the new file to the artifact using REST api
     success = True
-    error_msg = ''
+    error_msg = ""
     try:
-        fps = dumps({'html': index_fp, 'dir': viz_fp})
-        qclient.patch(qclient_url, 'add', '/html_summary/', value=fps)
+        fps = dumps({"html": index_fp, "dir": viz_fp})
+        qclient.patch(qclient_url, "add", "/html_summary/", value=fps)
     except Exception as e:
         success = False
         error_msg = str(e)

diff --git a/qtp_job_output_folder/tests/test_data/folder_1/index.html b/qtp_job_output_folder/tests/test_data/folder_1/index.html
diff --git a/qtp_job_output_folder/tests/test_data/folder_a/folder_b/index.html b/qtp_job_output_folder/tests/test_data/folder_a/folder_b/index.html
diff --git a/qtp_job_output_folder/tests/test_summary.py b/qtp_job_output_folder/tests/test_summary.py
@@ -6,13 +6,13 @@
 # The full license is in the file LICENSE, distributed with this software.
 # -----------------------------------------------------------------------------
 
-from unittest import main
-from tempfile import mkdtemp
-from os import remove
-from os.path import exists, isdir, join, dirname, abspath
 from inspect import currentframe, getfile
-from shutil import rmtree, copytree
 from json import dumps
+from os import remove
+from os.path import abspath, dirname, exists, isdir, join
+from shutil import copytree, rmtree
+from tempfile import mkdtemp
+from unittest import main
 
 from qiita_client.testing import PluginTestCase
 
@@ -23,8 +23,8 @@
 class SummaryTests(PluginTestCase):
     def setUp(self):
         self.out_dir = mkdtemp()
-        self.source_dir = join(mkdtemp(), 'test_data')
-        source = join(dirname(abspath(getfile(currentframe()))), 'test_data')
+        self.source_dir = join(mkdtemp(), "test_data")
+        source = join(dirname(abspath(getfile(currentframe()))), "test_data")
         copytree(source, self.source_dir)
         self._clean_up_files = [self.out_dir]
 
@@ -37,21 +37,29 @@ def tearDown(self):
                     remove(fp)
 
     def test_summary(self):
-        files = [(self.source_dir, 'directory')]
-        data = {'filepaths': dumps(files), 'type': 'job-output-folder',
-                'name': "A name", 'data_type': 'Job Output Folder'}
-        aid = self.qclient.post('/apitest/artifact/', data=data)['artifact']
-        parameters = {'input_data': aid}
-        data = {'command': dumps(['qtp-job-output-folder', __version__,
-                                  'Generate HTML summary']),
-                'parameters': dumps(parameters),
-                'status': 'running'}
-        job_id = self.qclient.post(
-            '/apitest/processing_job/', data=data)['job']
+        files = [(self.source_dir, "directory")]
+        data = {
+            "filepaths": dumps(files),
+            "type": "job-output-folder",
+            "name": "A name",
+            "data_type": "Job Output Folder",
+        }
+        aid = self.qclient.post("/apitest/artifact/", data=data)["artifact"]
+        parameters = {"input_data": aid}
+        data = {
+            "command": dumps(
+                ["qtp-job-output-folder", __version__, "Generate HTML summary"]
+            ),
+            "parameters": dumps(parameters),
+            "status": "running",
+        }
+        url = "/apitest/processing_job/"
+        job_id = self.qclient.post(url, data=data)["job"]
 
         # Run the test
         obs_success, obs_ainfo, obs_error = generate_html_summary(
-            self.qclient, job_id, parameters, self.out_dir)
+            self.qclient, job_id, parameters, self.out_dir
+        )
 
         # asserting reply
         self.assertTrue(obs_success)
@@ -61,36 +69,58 @@ def test_summary(self):
         # asserting content of html
         res = self.qclient.get("/qiita_db/artifacts/%s/" % aid)
         # cleaning artifact files, to avoid errors
-        [self._clean_up_files.extend([ff['filepath']])
-         for f in res['files'].values() for ff in f]
-        html_fp = res['files']['html_summary'][0]['filepath']
+        [
+            self._clean_up_files.extend([ff["filepath"]])
+            for f in res["files"].values()
+            for ff in f
+        ]
+        html_fp = res["files"]["html_summary"][0]["filepath"]
         with open(html_fp) as html_f:
             html = html_f.read()
 
-        self.assertCountEqual(
-            sorted(html.replace('<br/>', '').split('\n')),
-            sorted(EXP_HTML.format(aid=aid).replace('<br/>', '').split('\n')))
+        print("-------------")
+        print("-------------")
+        print(html)
+        print("-------------")
+        print(EXP_HTML.format(aid=aid))
+        print("-------------")
+        print("-------------")
+        self.assertEqual(html, EXP_HTML.format(aid=aid))
+
+        # verifying the new MANIFEST.txt
+        mfp = join(res["files"]["directory"][0]["filepath"], "MANIFEST.txt")
+        self.assertTrue(exists(f"{mfp}"))
+        with open(mfp, "r") as f:
+            obs = f.readlines()
+        self.assertCountEqual(obs, EXP_MANIFEST)
 
 
 EXP_HTML = (
-    '<a href="./{aid}/test_data/folder_a" type="folder" target="_blank">'
-    'test_data/folder_a</a><br/>\n'
-    '<a href="./{aid}/test_data/folder_a/folder_b/folder_c" type="folder" '
-    'target="_blank">test_data/folder_a/folder_b/folder_c</a><br/>\n'
-    '<a href="./{aid}/test_data/file_2" type="file" target="_blank">'
-    'test_data/file_2</a><br/>\n'
+    '<a href="./{aid}/test_data/MANIFEST.txt" type="file" target="_blank">'
+    "test_data/MANIFEST.txt</a><br/>\n"
     '<a href="./{aid}/test_data/file_1" type="file" target="_blank">'
-    'test_data/file_1</a><br/>\n'
-    '<a href="./{aid}/test_data/test_data" type="folder" target="_blank">'
-    'test_data/test_data</a><br/>\n'
-    '<a href="./{aid}/test_data/test_data/folder_a/folder_b" type="folder" '
-    'target="_blank">test_data/test_data/folder_a/folder_b</a><br/>\n'
-    '<a href="./{aid}/test_data/test_data/folder_a/folder_b/folder_c/file_c" '
-    'type="file" target="_blank">test_data/test_data/folder_a/folder_b/'
-    'folder_c/file_c</a><br/>\n'
-    '<a href="./{aid}/test_data/test_data/folder_a/file_a" type="file" '
-    'target="_blank">test_data/test_data/folder_a/file_a</a>')
-
-
-if __name__ == '__main__':
+    "test_data/file_1</a><br/>\n"
+    '<a href="./{aid}/test_data/file_2" type="file" target="_blank">'
+    "test_data/file_2</a><br/>\n"
+    '<a href="./{aid}/test_data/folder_a/folder_b/index.html" type="file" '
+    'target="_blank">test_data/folder_a/folder_b/index.html</a><br/>\n'
+    '<a href="./{aid}/test_data/folder_1/index.html" type="file" '
+    'target="_blank">test_data/folder_1/index.html</a>'
+)
+EXP_MANIFEST = [
+    " test_data/\n",
+    "|-- file_1\n",
+    "|-- file_2\n",
+    "|-- folder_a/\n",
+    "|--|-- file_a\n",
+    "|--|-- folder_b/\n",
+    "|--|--|-- index.html\n",
+    "|--|--|-- folder_c/\n",
+    "|--|--|--|-- file_c\n",
+    "|-- folder_1/\n",
+    "|--|-- index.html",
+]
+
+
+if __name__ == "__main__":
     main()