5454from re import search
5555
5656import numpy as np
57+ import joblib
5758from future .utils import viewitems , viewvalues
5859from future .builtins import zip
5960
@@ -389,6 +390,20 @@ def to_hdf5(fp, h5file, max_barcode_length=12):
389390 buffers [pjoin (dset_paths ['qual' ])].write (qual )
390391
391392
def _to_ascii(demux, samples, formatter):
    """Yield one formatted record per sequence fetched from a demux file

    Parameters
    ----------
    demux : object
        The open demux file; it is handed to ``fetch`` untouched.
        NOTE(review): presumably an h5py-like file object - confirm.
    samples : iterable
        The sample ids to pull out; handed to ``fetch`` untouched.
    formatter : callable
        Called as ``formatter(seq_id, seq, qual)`` for every fetched
        sequence.

    Yields
    ------
    object
        Whatever `formatter` returns for each sequence.
    """
    id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
              b"bc_diffs=%(bc_diff)d")

    for samp, idx, seq, qual, bc_ori, bc_cor, bc_err in fetch(demux, samples):
        seq_id = id_fmt % {b'sample': samp, b'idx': idx, b'bc_ori': bc_ori,
                           b'bc_cor': bc_cor, b'bc_diff': bc_err}
        # Check the length instead of comparing against ``[]``: ``!=`` between
        # a numpy array and an empty list is an elementwise comparison, which
        # raises on mismatched shapes in recent numpy releases and yields an
        # empty (falsy) result for single-element arrays.
        if len(qual) > 0:
            qual = qual.astype(np.uint8)

        yield formatter(seq_id, seq, qual)
405+
406+
392407def to_ascii (demux , samples = None ):
393408 """Consume a demuxed HDF5 file and yield sequence records
394409
@@ -412,19 +427,10 @@ def to_ascii(demux, samples=None):
412427 else :
413428 formatter = format_fasta_record
414429
415- id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
416- b"bc_diffs=%(bc_diff)d" )
417-
418430 if samples is None :
419431 samples = demux .keys ()
420432
421- for samp , idx , seq , qual , bc_ori , bc_cor , bc_err in fetch (demux , samples ):
422- seq_id = id_fmt % {b'sample' : samp , b'idx' : idx , b'bc_ori' : bc_ori ,
423- b'bc_cor' : bc_cor , b'bc_diff' : bc_err }
424- if qual != []:
425- qual = qual .astype (np .uint8 )
426-
427- yield formatter (seq_id , seq , qual )
433+ return _to_ascii (demux , samples , formatter )
428434
429435
430436def to_per_sample_ascii (demux , samples = None ):
@@ -455,6 +461,60 @@ def to_per_sample_ascii(demux, samples=None):
455461 yield samp , to_ascii (demux , samples = [samp ])
456462
457463
def _to_file(demux_fp, sample, fp, formatter):
    """Write every record of a single sample to its own file

    Parameters
    ----------
    demux_fp : str
        The demux file path
    sample : object
        The id of the sample to pull out; wrapped in a one-element list and
        handed to ``_to_ascii``.
    fp : str
        The path of the output file. It is opened in binary write mode, so
        any existing content is overwritten.
    formatter : callable
        The record formatter handed to ``_to_ascii``; the records it produces
        are written to `fp` as they are.
    """
    # The demux file is only read here, so open it read-only: several workers
    # may run this function against the same file at the same time, and
    # read/write handles can collide on the HDF5 file lock.
    with open_file(demux_fp, 'r') as demux, open(fp, 'wb') as out:
        for rec in _to_ascii(demux, [sample], formatter):
            out.write(rec)
469+
470+
def to_per_sample_files(demux_fp, samples=None, out_dir='./', n_jobs=1,
                        out_format='fastq'):
    """Write one sequence file per sample found in a demux file

    Parameters
    ----------
    demux_fp : str
        The demux file path
    samples : list of str, optional
        Samples to pull out. If None, every sample listed in the demux file
        is written. Defaults to None.
    out_dir : str, optional
        Directory in which the per sample files are stored. None is treated
        as the current directory. Defaults to the current directory.
    n_jobs : int, optional
        Number of jobs to run in parallel. Defaults to 1
    out_format : {'fastq', 'fasta'}, optional
        The format in which the output files should be written. 'fastq'
        files are named ``<sample>.fastq`` and 'fasta' files
        ``<sample>.fna``. Defaults to 'fastq'.

    Raises
    ------
    ValueError
        If `out_format` is neither 'fastq' nor 'fasta'.
    """
    if out_format not in ('fastq', 'fasta'):
        raise ValueError("'out_format' should be either 'fastq' or 'fasta', "
                         "found: %s" % out_format)

    if out_format == 'fastq':
        formatter, file_name_fmt = format_fastq_record, "%s.fastq"
    else:
        formatter, file_name_fmt = format_fasta_record, "%s.fna"

    if samples is None:
        with open_file(demux_fp, 'r') as demux:
            # Materialize the keys while the file is still open: the view
            # returned by keys() is tied to the file and is unusable once
            # the context manager closes it
            samples = list(demux.keys())

    target_dir = './' if out_dir is None else out_dir

    # Pair every (encoded) sample id with the path of its output file
    work = []
    for name in samples:
        encoded = name.encode()
        work.append((encoded, os.path.join(target_dir, file_name_fmt % name)))

    with joblib.Parallel(n_jobs=n_jobs) as par:
        par(joblib.delayed(_to_file)(demux_fp, encoded, out_fp, formatter)
            for encoded, out_fp in work)
516+
517+
458518def fetch (demux , samples = None , k = None ):
459519 """Fetch sequences from a HDF5 demux file
460520
0 commit comments