#cython: language_level=3
# TODO cimport generates errors about argument numbers, but without it some variables can't be declared

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.dms import DMS   # TODO cimport doesn't work
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS   # TODO cimport doesn't work
from obitools3.dms.column.column cimport Column

from obitools3.utils cimport tobytes, \
                             get_obitype, \
                             update_obitype

from obitools3.dms.capi.obitypes cimport obitype_t, \
                                         OBI_VOID

from obitools3.dms.capi.obierrno cimport obi_errno

import time

import pickle


__title__="Imports sequences from different formats into a DMS"


# Default values of the 'import' configuration section.
default_config = {   'destview'     : None,
                     'skip'         : 0,
                     'only'         : None,
                     'skiperror'    : False,
                     'seqinformat'  : None,
                     'moltype'      : 'nuc',
                     'filename'     : None
                 }


def addOptions(parser):
    """Register the command-line arguments of the import command on `parser`.

    `parser` must provide the argparse interface (`add_argument`,
    `add_argument_group`). Every destination is named `section:key`
    (for example `import:skip`).
    NOTE(review): presumably the obitools3 option machinery maps these onto
    config[section][key] -- confirm against the caller.
    """

    # Optional positional argument: the sequence file to import.
    parser.add_argument(dest='import:filename',
                        metavar='<FILENAME>',
                        nargs='?',
                        default=None,
                        help='Name of the sequence file to import' )

    group=parser.add_argument_group('obi import specific options')

    group.add_argument('--default-dms','-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data")

    # The help text used to be a copy of the --default-dms one.
    group.add_argument('--destination-view','-v',
                       action="store", dest="import:destview",
                       metavar='<VIEW NAME>',
                       default=None,
                       type=str,
                       required=True,
                       help="Name of the view in which the imported sequences are stored")

    group.add_argument('--skip',
                       action="store", dest="import:skip",
                       metavar='<N>',
                       default=0,
                       type=int,
                       help="Skip the N first sequences")

    group.add_argument('--only',
                       action="store", dest="import:only",
                       metavar='<N>',
                       default=None,
                       type=int,
                       help="Treat only N sequences")

    group.add_argument('--skip-on-error',
                       action="store_true", dest="import:skiperror",
                       default=None,
                       help="Skip sequence entries with parse error")

    # --fasta and --fastq share one destination: the last one given wins.
    group.add_argument('--fasta',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fasta',
                       help="Input file is in fasta nucleic format (including obitools fasta extentions)")

    group.add_argument('--fastq',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fastq',
                       help="Input file is in sanger fastq nucleic format (standard fastq)")

    # --nuc and --prot share one destination as well.
    group.add_argument('--nuc',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='nuc',
                       help="Input file contains nucleic sequences")

    group.add_argument('--prot',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='pep',
                       help="Input file contains protein sequences")


# TODO: Handling of NA values. Check None. Specify in doc? None or NA? Possibility to specify in option?
# look in R read.table option to specify NA value
def run(config):
    """Import a fasta or fastq sequence file into a view of a DMS.

    Values read from `config`:
      - config['import']['seqinformat']: 'fasta' or 'fastq'
      - config['import']['filename']: input handed to uopen()
      - config['obi']['defaultdms']: DMS to open, created if opening fails
      - config['import']['destview']: name of the view to create
      - config['import']['skip']: number of leading records to ignore
      - config['import']['only']: maximum number of records to import
        (None means no limit)

    Each record provides 'id', 'definition', 'sequence', 'tags' and, for
    fastq input, 'quality'. A column is created for a tag the first time the
    tag is met, and rewritten later if a value does not fit in it.

    Raises RuntimeError if the sequence format is neither 'fasta' nor 'fastq'.
    """

    cdef int i
    cdef type value_type
    cdef obitype_t value_obitype
    cdef obitype_t old_type
    cdef obitype_t new_type
    cdef bint get_quality
    cdef bint NUC_SEQS_view
    cdef int nb_elts
    cdef object d
    cdef View view
    cdef object iseq
    cdef object seq
    cdef object inputs
    cdef Column id_col
    cdef Column def_col
    cdef Column seq_col
    cdef Column qual_col
    cdef Column old_column
    cdef bint rewrite
    cdef dict dcols
    cdef int skipping
    cdef str tag
    cdef object value
    cdef list elt_names
    cdef int old_nb_elements_per_line
    cdef int new_nb_elements_per_line
    cdef list old_elements_names
    cdef list new_elements_names
    cdef ProgressBar pb
    cdef object seq_format
    cdef object to_skip
    cdef object only
    global obi_errno

    # Reject an unsupported format before the input is opened and before a
    # DMS is opened or created.
    seq_format = config['import']['seqinformat']
    if seq_format != 'fasta' and seq_format != 'fastq':
        raise RuntimeError('File format not handled')

    pb = ProgressBar(1000000, config, seconde=5)    # TODO should be number of records in file

    inputs = uopen(config['import']['filename'])

    # Create or open DMS: any error raised by test_open is taken to mean
    # that the DMS has to be created.
    try:
        d = DMS.test_open(config['obi']['defaultdms'])
    except Exception:
        d = DMS.new(config['obi']['defaultdms'])

    # Both supported formats are stored in a NUC_SEQS view; only fastq
    # records carry a quality.
    NUC_SEQS_view = True
    if seq_format == 'fasta':
        get_quality = False
        iseq = fastaIterator(inputs)
    else:
        get_quality = True
        iseq = fastqIterator(inputs)
    view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)

    # Save basic columns in variables for optimization
    if NUC_SEQS_view :
        id_col = view["ID"]
        def_col = view["DEFINITION"]
        seq_col = view["NUC_SEQ"]
        if get_quality :
            qual_col = view["QUALITY"]

    # tag -> (column, obitype of the column)
    dcols = {}

    # Loop-invariant configuration values.
    to_skip = config['import']['skip']
    only = config['import']['only']

    skipping = 0
    i = 0    # index of the next line to write in the view
    for seq in iseq :

        if skipping < to_skip :    # TODO not efficient because sequences are parsed
            skipping+=1
            continue

        if i == only :
            break

        pb(i)

        if NUC_SEQS_view :
            id_col[i] = seq['id']
            def_col[i] = seq['definition']
            seq_col[i] = seq['sequence']
            if get_quality :
                qual_col[i] = seq['quality']

        for tag in seq['tags'] :

            value = seq['tags'][tag]

            if tag not in dcols :
                # No column for this tag yet: derive its shape and type from
                # this value.
                value_type = type(value)

                if value_type == dict or value_type == list :
                    nb_elts = len(value)
                    elt_names = list(value)
                else :
                    nb_elts = 1
                    elt_names = None

                value_obitype = get_obitype(value)

                if value_obitype != OBI_VOID :
                    dcols[tag] = (Column.new_column(view, tag, value_obitype,
                                                    nb_elements_per_line=nb_elts,
                                                    elements_names=elt_names),
                                  value_obitype)

                    # Fill value
                    dcols[tag][0][i] = value

                # TODO else log error?

                continue

            # The column already exists: check type adequation
            old_type = dcols[tag][1]
            new_type = update_obitype(old_type, value)
            rewrite = old_type != new_type

            try:
                # Fill value
                dcols[tag][0][i] = value

            except IndexError :
                # The value was rejected by the existing column: work out
                # which attributes of the column have to change.
                value_type = type(value)
                old_column = dcols[tag][0]
                old_nb_elements_per_line = old_column.nb_elements_per_line
                new_nb_elements_per_line = 0
                old_elements_names = old_column.elements_names
                new_elements_names = None

                # Check the length and keys of column lines if needed
                if value_type == dict :    # Check dictionary keys
                    for k in value :
                        if k not in old_elements_names :
                            new_elements_names = list(value)
                            rewrite = True
                            break

                elif value_type == list or value_type == tuple :    # Check vector length
                    if old_nb_elements_per_line < len(value) :
                        new_nb_elements_per_line = len(value)
                        rewrite = True

                if rewrite :
                    if new_nb_elements_per_line == 0 and new_elements_names is not None :
                        new_nb_elements_per_line = len(new_elements_names)

                    # Keep the new type next to the rewritten column, so that
                    # later values of this tag are compared with the type the
                    # column really has.
                    dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
                                                                           new_data_type=new_type,
                                                                           new_nb_elements_per_line=new_nb_elements_per_line,
                                                                           new_elements_names=new_elements_names),
                                  new_type)

                    # Reset obierrno
                    obi_errno = 0

                    # Fill value
                    dcols[tag][0][i] = value

        i+=1

    print("\n")
    print(view.__repr__())

    d.close()