Skip to content

Commit 162c2f1

Browse files
josenavasantgonza
authored andcommitted
Adding @wasade's parallel to per sample fasta (#4)
* Adding @wasade's parallel to per sample fasta * Fixing docs * Adding FASTQ output to per sample files and reduce code duplication * fq -> fastq * Trying to fix coveralls
1 parent 1683ddc commit 162c2f1

File tree

5 files changed

+144
-33
lines changed

5 files changed

+144
-33
lines changed

.coveragerc

Lines changed: 0 additions & 19 deletions
This file was deleted.

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ install:
1515
- source activate env_name
1616
- travis_retry pip install .
1717
script:
18-
- nosetests --with-doctest --with-coverage
18+
- nosetests --with-doctest --with-coverage --cover-package=qiita_files
1919
- flake8 qiita_files setup.py
2020
after_success:
2121
- coveralls

qiita_files/demux.py

Lines changed: 70 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
from re import search
5555

5656
import numpy as np
57+
import joblib
5758
from future.utils import viewitems, viewvalues
5859
from future.builtins import zip
5960

@@ -389,6 +390,20 @@ def to_hdf5(fp, h5file, max_barcode_length=12):
389390
buffers[pjoin(dset_paths['qual'])].write(qual)
390391

391392

393+
def _to_ascii(demux, samples, formatter):
394+
"""Aux function to change from hdf5 to ascii"""
395+
id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
396+
b"bc_diffs=%(bc_diff)d")
397+
398+
for samp, idx, seq, qual, bc_ori, bc_cor, bc_err in fetch(demux, samples):
399+
seq_id = id_fmt % {b'sample': samp, b'idx': idx, b'bc_ori': bc_ori,
400+
b'bc_cor': bc_cor, b'bc_diff': bc_err}
401+
if qual != []:
402+
qual = qual.astype(np.uint8)
403+
404+
yield formatter(seq_id, seq, qual)
405+
406+
392407
def to_ascii(demux, samples=None):
393408
"""Consume a demuxed HDF5 file and yield sequence records
394409
@@ -412,19 +427,10 @@ def to_ascii(demux, samples=None):
412427
else:
413428
formatter = format_fasta_record
414429

415-
id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
416-
b"bc_diffs=%(bc_diff)d")
417-
418430
if samples is None:
419431
samples = demux.keys()
420432

421-
for samp, idx, seq, qual, bc_ori, bc_cor, bc_err in fetch(demux, samples):
422-
seq_id = id_fmt % {b'sample': samp, b'idx': idx, b'bc_ori': bc_ori,
423-
b'bc_cor': bc_cor, b'bc_diff': bc_err}
424-
if qual != []:
425-
qual = qual.astype(np.uint8)
426-
427-
yield formatter(seq_id, seq, qual)
433+
return _to_ascii(demux, samples, formatter)
428434

429435

430436
def to_per_sample_ascii(demux, samples=None):
@@ -455,6 +461,60 @@ def to_per_sample_ascii(demux, samples=None):
455461
yield samp, to_ascii(demux, samples=[samp])
456462

457463

464+
def _to_file(demux_fp, sample, fp, formatter):
465+
with open_file(demux_fp, 'r+') as demux:
466+
with open(fp, 'wb') as out:
467+
for rec in _to_ascii(demux, [sample], formatter):
468+
out.write(rec)
469+
470+
471+
def to_per_sample_files(demux_fp, samples=None, out_dir='./', n_jobs=1,
472+
out_format='fastq'):
473+
"""Writes per sample files
474+
475+
Parameters
476+
----------
477+
demux_fp : str
478+
The demux file path
479+
samples : list of str, optional
480+
Samples to pull out. If None, then all samples will be examined.
481+
Defaults to None.
482+
out_dir : str, optional
483+
Path to output directory to store the per sample fasta.
484+
Defaults to current directory
485+
n_jobs : int, optional
486+
Number of jobs to run in parallel. Defaults to 1
487+
out_format : {'fastq', 'fasta'}
488+
The format in which the output files should be written.
489+
"""
490+
if out_format == 'fastq':
491+
formatter = format_fastq_record
492+
file_name_fmt = "%s.fastq"
493+
elif out_format == 'fasta':
494+
formatter = format_fasta_record
495+
file_name_fmt = "%s.fna"
496+
else:
497+
raise ValueError("'out_format' should be either 'fastq' or 'fasta', "
498+
"found: %s" % out_format)
499+
if samples is None:
500+
with open_file(demux_fp, 'r') as demux:
501+
# We need to call list because demux.keys() is a KeysView object
502+
# from the file, and the file will be closed once we exit the
503+
# context manager
504+
samples = list(demux.keys())
505+
506+
if out_dir is None:
507+
out_dir = './'
508+
509+
path_builder = partial(os.path.join, out_dir)
510+
samples_and_paths = [(s.encode(), path_builder(file_name_fmt % s))
511+
for s in samples]
512+
513+
with joblib.Parallel(n_jobs=n_jobs) as par:
514+
par(joblib.delayed(_to_file)(demux_fp, sample, s_fp, formatter)
515+
for sample, s_fp in samples_and_paths)
516+
517+
458518
def fetch(demux, samples=None, k=None):
459519
"""Fetch sequences from a HDF5 demux file
460520

qiita_files/tests/test_demux.py

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import os
1212
import tempfile
1313
from unittest import TestCase, main
14+
from functools import partial
15+
from shutil import rmtree
1416

1517
import h5py
1618
import numpy as np
@@ -19,7 +21,8 @@
1921
from qiita_files.demux import (buffer1d, buffer2d, _has_qual,
2022
_per_sample_lengths, _summarize_lengths,
2123
_set_attr_stats, _construct_datasets, to_hdf5,
22-
to_ascii, stat, to_per_sample_ascii)
24+
to_ascii, stat, to_per_sample_ascii,
25+
to_per_sample_files)
2326

2427

2528
class BufferTests(TestCase):
@@ -126,7 +129,11 @@ def setUp(self):
126129
def tearDown(self):
127130
self.hdf5_file.close()
128131
for f in self.to_remove:
129-
os.remove(f)
132+
if os.path.exists(f):
133+
if os.path.isdir(f):
134+
rmtree(f)
135+
else:
136+
os.remove(f)
130137

131138
def test_has_qual(self):
132139
with tempfile.NamedTemporaryFile('r+', suffix='.fna') as f:
@@ -321,6 +328,69 @@ def test_to_per_sample_ascii(self):
321328
obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
322329
self.assertEqual(obs, exp)
323330

331+
def test_to_files(self):
332+
# implicitly tested with test_to_per_sample_fasta
333+
pass
334+
335+
def test_to_per_sample_files(self):
336+
with tempfile.NamedTemporaryFile('r+', suffix='.fq',
337+
delete=False) as f:
338+
f.write(fqdata_variable_length)
339+
340+
self.to_remove.append(f.name)
341+
342+
with tempfile.NamedTemporaryFile('r+', suffix='.demux',
343+
delete=False) as demux_f:
344+
pass
345+
346+
self.to_remove.append(demux_f.name)
347+
348+
with h5py.File(demux_f.name, 'w') as demux:
349+
to_hdf5(f.name, demux)
350+
351+
tmp_dir = tempfile.mkdtemp()
352+
self.to_remove.append(tmp_dir)
353+
path_builder = partial(os.path.join, tmp_dir)
354+
355+
# Test to fastq
356+
to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=1,
357+
out_format='fastq')
358+
sample_a_path = path_builder("a.fastq")
359+
sample_b_path = path_builder("b.fastq")
360+
self.assertTrue(os.path.exists(sample_a_path))
361+
self.assertTrue(os.path.exists(sample_b_path))
362+
363+
with open(sample_a_path, 'rb') as af:
364+
obs = af.read()
365+
self.assertEqual(
366+
obs, b'@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n')
367+
368+
with open(sample_b_path, 'rb') as bf:
369+
obs = bf.read()
370+
self.assertEqual(
371+
obs, b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
372+
b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')
373+
374+
# Test to fasta and parallel
375+
to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=2,
376+
out_format='fasta')
377+
378+
sample_a_path = path_builder("a.fna")
379+
sample_b_path = path_builder("b.fna")
380+
self.assertTrue(os.path.exists(sample_a_path))
381+
self.assertTrue(os.path.exists(sample_b_path))
382+
383+
with open(sample_a_path, 'rb') as af:
384+
obs = af.read()
385+
self.assertEqual(
386+
obs, b'>a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n')
387+
388+
with open(sample_b_path, 'rb') as bf:
389+
obs = bf.read()
390+
self.assertEqual(
391+
obs, b'>b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n'
392+
b'>b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n')
393+
324394
def test_fetch(self):
325395
# implicitly tested with test_to_ascii
326396
pass

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,6 @@
4343
'qiita_files/format',
4444
'qiita_files/parse'],
4545
extras_require={'test': ["nose >= 0.10.1", "pep8"]},
46-
install_requires=['future', 'numpy', 'six', 'h5py'],
46+
install_requires=['future', 'numpy', 'six', 'h5py', 'joblib'],
4747
classifiers=classifiers
4848
)

0 commit comments

Comments
 (0)