diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx
index 1a61166..7443149 100644
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
 from obitools3.parsers.fastq import fastqIterator
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
 from obitools3.dms.view.view cimport View
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS  # TODO cimport doesn't work
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
-
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 from obitools3.utils cimport tobytes, \
                              get_obitype, \
@@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
 from obitools3.dms.capi.obierrno cimport obi_errno
 
 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+
 from obitools3.uri.decode import open_uri
 
 from obitools3.apps.config import logger
@@ -50,6 +49,8 @@ def addOptions(parser):
 
 def run(config):
 
+    cdef tuple input
+    cdef tuple output
     cdef int i
     cdef type value_type
     cdef obitype_t value_obitype
@@ -62,7 +63,6 @@ def run(config):
     cdef View view
     cdef object iseq
     cdef object seq
-    cdef object inputs
     cdef Column id_col
     cdef Column def_col
     cdef Column seq_col
@@ -71,7 +71,7 @@ def run(config):
     cdef bint rewrite
     cdef dict dcols
     cdef int skipping
-    cdef str tag
+    cdef bytes tag
     cdef object value
     cdef list elt_names
     cdef int old_nb_elements_per_line
@@ -84,163 +84,157 @@ def run(config):
 
     logger("info", "obi import : imports file into a DMS")
 
-    inputs = open_uri(config['obi']['inputURI'])
+    input = open_uri(config['obi']['inputURI'])
 
-    if inputs[2]==Nuc_Seq:
+    if input[2]==Nuc_Seq:
         v = View_NUC_SEQS
     else:
-        v= View
-
+        v = View
+
     output = open_uri(config['obi']['outputURI'],
                       input=False,
                       newviewtype=v)
 
-    print(input)
-    print(output)
+    #print(input)
+    #print(output)
 
-    sys.exit()
+    pb = ProgressBar(1000000, config, seconde=5)    # TODO should be number of records in file
 
-#     pb = ProgressBar(1000000, config, seconde=5)    # TODO should be number of records in file
-#
-#     inputs = uopen(config['import']['filename'])
-#
-#     # Create or open DMS
-#     d = DMS.open_or_new(config['obi']['defaultdms'])
-#
-#     get_quality = False
-#     NUC_SEQS_view = False
-#     if config['import']['seqinformat']=='fasta':
-#         get_quality = False
-#         NUC_SEQS_view = True
-#         iseq = fastaIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     elif config['import']['seqinformat']=='fastq':
-#         get_quality = True
-#         NUC_SEQS_view = True
-#         iseq = fastqIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     else:
-#         raise RuntimeError('File format not handled')
-#
-#     # Save basic columns in variables for optimization
-#     if NUC_SEQS_view :
-#         id_col = view["ID"]
-#         def_col = view["DEFINITION"]
-#         seq_col = view["NUC_SEQ"]
-#         if get_quality :
-#             qual_col = view["QUALITY"]
-#
-#     dcols = {}
-#
-#     i = 0
-#     for seq in iseq :
-#         if i == config['import']['only'] :
-#             break
-#         else :
-#             pb(i)
-#             if NUC_SEQS_view :
-#                 id_col[i] = seq['id']
-#                 def_col[i] = seq['definition']
-#                 seq_col[i] = seq['sequence']
-#                 if get_quality :
-#                     qual_col[i] = seq['quality']
-#
-#             for tag in seq['tags'] :
-#
-#                 value = seq['tags'][tag]
-#
-#                 # Check NA value
-#                 if value == config['import']['NA'] :
-#                     value = None
-#
-#                 if tag not in dcols :
-#
-#                     value_type = type(value)
-#                     nb_elts = 1
-#                     value_obitype = OBI_VOID
-#
-#                     if value_type == dict or value_type == list :
-#                         nb_elts = len(value)
-#                         elt_names = list(value)
-#                     else :
-#                         nb_elts = 1
-#                         elt_names = None
-#
-#                     value_obitype = get_obitype(value)
-#
-#                     if value_obitype != OBI_VOID :
-#                         dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-#
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#
-#                     # TODO else log error?
-#
-#                 else :
-#
-#                     rewrite = False
-#
-#                     # Check type adequation
-#                     old_type = dcols[tag][1]
-#                     new_type = OBI_VOID
-#                     new_type = update_obitype(old_type, value)
-#                     if old_type != new_type :
-#                         rewrite = True
-#
-#                     try:
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#
-#                     except IndexError :
-#
-#                         value_type = type(value)
-#                         old_column = dcols[tag][0]
-#                         old_nb_elements_per_line = old_column.nb_elements_per_line
-#                         new_nb_elements_per_line = 0
-#                         old_elements_names = old_column.elements_names
-#                         new_elements_names = None
-#
-#                         #####################################################################
-#
-#                         # Check the length and keys of column lines if needed
-#                         if value_type == dict :    # Check dictionary keys
-#                             for k in value :
-#                                 if k not in old_elements_names :
-#                                     new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
-#                                     rewrite = True
-#                                     break
-#
-#                         elif value_type == list or value_type == tuple :    # Check vector length
-#                             if old_nb_elements_per_line < len(value) :
-#                                 new_nb_elements_per_line = len(value)
-#                                 rewrite = True
-#
-#                         #####################################################################
-#
-#                         if rewrite :
-#                             if new_nb_elements_per_line == 0 and new_elements_names is not None :
-#                                 new_nb_elements_per_line = len(new_elements_names)
-#
-#                             # Reset obierrno
-#                             obi_errno = 0
-#
-#                             dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
-#                                                                                    new_data_type=new_type,
-#                                                                                    new_nb_elements_per_line=new_nb_elements_per_line,
-#                                                                                    new_elements_names=new_elements_names),
-#                                           value_obitype)
-#
-#                             # Update the dictionary:
-#                             for t in dcols :
-#                                 dcols[t] = (view[t], dcols[t][1])
-#
-#                             # Fill value
-#                             dcols[tag][0][i] = value
-#
-#         i+=1
-#
-#     print("\n")
-#     print(view.__repr__())
-#
-#     d.close()
+    iseq = input[1]
+
+    get_quality = False
+    NUC_SEQS_view = False
+    if isinstance(output[1], View) :
+        view = output[1]
+        if output[2] == View_NUC_SEQS :
+            NUC_SEQS_view = True
+            if "QUALITY" in view :  # TODO
+                get_quality = True
+    else:
+        raise NotImplementedError()
+
+    # Save basic columns in variables for optimization
+    if NUC_SEQS_view :
+        id_col = view[b"ID"]
+        def_col = view[b"DEFINITION"]
+        seq_col = view[b"NUC_SEQ"]
+        if get_quality :
+            qual_col = view[b"QUALITY"]
+
+    dcols = {}
+
+    i = 0
+    for seq in iseq :
+
+        pb(i)
+
+        if NUC_SEQS_view :
+            id_col[i] = seq.id
+            def_col[i] = seq.definition
+            seq_col[i] = seq.seq
+
+            if get_quality :
+                qual_col[i] = seq.quality
+
+        for tag in seq :
+
+            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm...
+
+                value = seq[tag]
+
+                # Check NA value
+                if value == config['obi']['nastring'] :
+                    value = None
+
+                if tag not in dcols :
+
+                    value_type = type(value)
+                    nb_elts = 1
+                    value_obitype = OBI_VOID
+
+                    if value_type == dict or value_type == list :
+                        nb_elts = len(value)
+                        elt_names = list(value)
+                    else :
+                        nb_elts = 1
+                        elt_names = None
+
+                    value_obitype = get_obitype(value)
+
+                    if value_obitype != OBI_VOID :
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    # TODO else log error?
+
+                else :
+
+                    rewrite = False
+
+                    # Check type compatibility
+                    old_type = dcols[tag][1]
+                    new_type = OBI_VOID
+                    new_type = update_obitype(old_type, value)
+                    if old_type != new_type :
+                        rewrite = True
+
+                    try:
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    except IndexError :
+
+                        value_type = type(value)
+                        old_column = dcols[tag][0]
+                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        new_nb_elements_per_line = 0
+                        old_elements_names = old_column.elements_names
+                        new_elements_names = None
+
+                        #####################################################################
+
+                        # Check the length and keys of column lines if needed
+                        if value_type == dict :    # Check dictionary keys
+                            for k in value :
+                                if k not in old_elements_names :
+                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+                                    rewrite = True
+                                    break
+
+                        elif value_type == list or value_type == tuple :    # Check vector length
+                            if old_nb_elements_per_line < len(value) :
+                                new_nb_elements_per_line = len(value)
+                                rewrite = True
+
+                        #####################################################################
+
+                        if rewrite :
+                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+                                new_nb_elements_per_line = len(new_elements_names)
+
+                            # Reset obierrno
+                            obi_errno = 0
+
+                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
+                                                                                   new_data_type=new_type,
+                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+                                                                                   new_elements_names=new_elements_names),
+                                          value_obitype)
+
+                            # Update the dictionary:
+                            for t in dcols :
+                                dcols[t] = (view[t], dcols[t][1])
+
+                            # Fill value
+                            dcols[tag][0][i] = value
+
+        i+=1
+
+    print("\n")
+    print(view.__repr__())
+
+    input[0].close()  # TODO
+    output[0].close()
diff --git a/python/obitools3/commands/uniq.pxd b/python/obitools3/commands/uniq.pxd
new file mode 100644
index 0000000..3452e17
--- /dev/null
+++ b/python/obitools3/commands/uniq.pxd
@@ -0,0 +1,8 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+
+
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx
index 82d1884..37ff7a5 100644
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@@ -1,59 +1,29 @@
 #cython: language_level=3
 
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
-from obitools3.dms.view.view import View  # TODO cimport doesn't work
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
-from obitools3.dms.obiseq cimport Nuc_Seq
-from obitools3.dms.column.column cimport Column
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT
-from obitools3.utils cimport tostr
+from obitools3.dms.dms cimport DMS
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.view cimport View, Line
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+from obitools3.dms.column.column cimport Column, Column_line
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
+from obitools3.apps.config import logger
 
 # TODO silence non-implemented options
 
 __title__="Groups records together"
 
-default_config = { 'inputview' : None,
-                   'outputview' : None
-                 }
+
 
 def addOptions(parser):
 
-    # TODO put this common group somewhere else but I don't know where
-    group=parser.add_argument_group('DMS and view options')
+    addSequenceInputOption(parser)
+    addMinimalOutputOption(parser)
 
-    group.add_argument('--default-dms','-d',
-                       action="store", dest="obi:defaultdms",
-                       metavar='',
-                       default=None,
-                       type=str,
-                       help="Name of the default DMS for reading and writing data.")
-
-    group.add_argument('--input-view','-i',
-                       action="store", dest="obi:inputview",
-                       metavar='',
-                       default=None,
-                       type=str,
-                       help="Name of the input view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--output-view','-o',
-                       action="store", dest="obi:outputview",
-                       metavar='',
-                       default=None,
-                       type=str,
-                       help="Name of the output view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--taxo','-t',
-                       action="store", dest="obi:taxo",
-                       metavar='',
-                       default='',  # TODO not None because if it's None, the option is not entered in the option dictionary.
-                       type=str,
-                       help="Name of the taxonomy to use.")
 
     group = parser.add_argument_group('obi uniq specific options')
 
@@ -89,152 +59,198 @@ def addOptions(parser):
 
     # TODO taxonomy
 
-# TODO
-COUNT_COLUMN_str = tostr(COUNT_COLUMN)
-
-
-def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
 
+    cdef int i
+    cdef int o_idx
+    cdef int u_idx
+    cdef int u_id
+    cdef int i_count
+    cdef set mergedKeys
+    cdef dict uniques
+    cdef dict merged_infos
+    cdef object iter_view
+    cdef Line i_seq
+    cdef Line o_seq
+    cdef str key
+    cdef bytes key_b
+    cdef str mkey
+    cdef str merged_col_name
+    cdef Column i_col
+    cdef Column seq_col
+    cdef object to_merge
+    cdef Column_line mcol
+    cdef Column_line i_mcol
+
     uniques = {}
 
     if categories is None:
         categories=[]
 
-    if mergedKey is not None:
-        mergedKey=set(mergedKey)
+    if mergedKeys_list is not None:
+        mergedKeys=set(mergedKeys_list)
     else:
-        mergedKey=set()
+        mergedKeys=set()
 
-    if taxonomy is not None:
-        mergedKey.add('taxid')
+#    if taxonomy is not None:
+#        mergedKeys.add('taxid')
 
-    # Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure
-
-    o_idx = 0
+    # Go through the columns to merge a first time, to create the merged columns with the right number of elements per line and the element names
+    #logger("info", "obi uniq", "First browsing through the input")
+    merged_infos = {}
     i = 0
-    seq_col = view[NUC_SEQUENCE_COLUMN]
     iter_view = iter(view)
-    for i_seq in iter_view :
-        pass
+    for i_seq in iter_view:
+        pb(i)
+        for key in mergedKeys:
+            mkey = "merged_%s" % key
+            if key in i_seq:  # TODO what if mkey already in i_seq?
+                if mkey not in merged_infos:
+                    merged_infos[mkey] = {}
+                    mkey_infos = merged_infos[mkey]
+                    mkey_infos['nb_elts'] = 1
+                    mkey_infos['elt_names'] = [i_seq[key]]
+                else:
+                    mkey_infos = merged_infos[mkey]
+                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
+                        mkey_infos['elt_names'].append(i_seq[key])
+                        mkey_infos['nb_elts'] += 1
+        i+=1
+
+    for key in mergedKeys:
+        merged_col_name = "merged_%s" % key
+        i_col = view[key]
+        Column.new_column(o_view,
+                          merged_col_name,
+                          OBI_INT,
+                          nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
+                          elements_names=merged_infos[merged_col_name]['elt_names'],
+                          comments=i_col.comments,
+                          alias=merged_col_name  # TODO what if it already exists
+                          )
+    del(merged_infos)
+
+    #logger("info", "obi uniq", "Second browsing through the input")
+    i = 0
+    o_idx = 0
+    seq_col = view[NUC_SEQUENCE_COLUMN]
+    iter_view = iter(view)
     for i_seq in iter_view :
         pb(i)
-        # utiliser l'index des AVLs, faire l'API
-        #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
+
+        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
         u_id = seq_col.get_line_idx(i)
+
         if u_id in uniques:
-            u_seq = uniques[u_id]
-            o_seq = o_view[u_seq['idx']]
-            if COUNT_COLUMN_str in i_seq:
-                o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
+
+            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
+                i_count = 1
             else:
-                o_seq[COUNT_COLUMN_str] += 1
-#                seq['COUNT']=1
+                i_count = i_seq[COUNT_COLUMN]
+
+            u_idx = uniques[u_id]
+            o_seq = o_view[u_idx]
+            o_seq[COUNT_COLUMN] += i_count
+
 #            if taxonomy is not None and 'taxid' in seq:
 #                s['merged_taxid'][seq['taxid']]=
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:  # TODO
-                    if 'taxid_dist' in i_seq:
-                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
-                    if 'taxid' in i_seq:
-                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
-
-                mkey = "merged_%s" % key
+            for key in mergedKeys:
+#                if key=='taxid' and mergeIds:  # TODO
+#                    if 'taxid_dist' in i_seq:
+#                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
+#                    if 'taxid' in i_seq:
+#                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+
+                mkey = "merged_%s" % key
                 # Case where the merged_keys are updated but the incoming sequence has no merged_keys
                 if key in i_seq:
-                    u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
+                    to_merge = i_seq[key]
+                    mcol = o_seq[mkey]
+                    if mcol[to_merge] is None:
+                        mcol[to_merge] = i_count
+                    else:
+                        mcol[to_merge] = mcol[to_merge] + i_count
                 # Case where the merged_keys already exist
-                else:
+                else:  # TODO is this a good else
                     if mkey in i_seq:
-                        for skey in i_seq[mkey]:
-                            u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
-
-            for key in i_seq.keys():
+                        mcol = o_seq[mkey]
+                        i_mcol = i_seq[mkey]
+                        for key_b in i_mcol:
+                            if mcol[key_b] is None:
+                                mcol[key_b] = i_mcol[key_b]
+                            else:
+                                mcol[key_b] = mcol[key_b] + i_mcol[key_b]
+
+            for key_b in i_seq.keys():
                 # Properly merge the 'merged' attribute if it exists
-                if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str :  #and key[0:7]!='merged_' and key!='merged': TODO check this
-                    o_seq[key] = None
-
-            if mergeIds:
-                u_seq['merged'].append(i_seq.id)
-
-        else:
+                # TODO make special columns list?
+                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
+                    o_seq[key_b] = None
+
+            # if mergeIds:  # TODO
+            #     u_seq['merged'].append(i_seq.id)
+        else:
             o_view[o_idx] = i_seq
             o_seq = o_view[o_idx]
-            uniques[u_id] = {'idx':o_idx}
-            u_seq = uniques[u_id]
+            uniques[u_id] = o_idx
             o_idx += 1
 
-            if COUNT_COLUMN_str not in o_seq:
-                o_seq[COUNT_COLUMN_str] = 1
-
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:
-                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
-                        u_seq["taxid_dist"] = {}
-                    else :
-                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
-                    if 'taxid' in o_seq:
-                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
-                mkey = "merged_%s" % key
-                if mkey not in o_seq:
-                    u_seq[mkey]={}
-                else :
-                    u_seq[mkey] = o_seq[mkey]
+            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
+                o_seq[COUNT_COLUMN] = 1
+
+            for key in mergedKeys:
+#                if key=='taxid' and mergeIds:
+#                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+#                        u_seq["taxid_dist"] = {}
+#                    else :
+#                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
+#                    if 'taxid' in o_seq:
+#                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
+                mkey = "merged_%s" % key
                 if key in o_seq:
-                    u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
-                    o_seq[key] = None
-
-            if mergeIds:
-                u_seq['merged']=[o_seq.id]
-
+                    to_merge = o_seq[key]
+                    mcol = o_seq[mkey]
+                    if to_merge in mcol and mcol[to_merge] is not None:
+                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
+                    else:
+                        mcol[to_merge] = o_seq[COUNT_COLUMN]
+                    o_seq[key] = None  # TODO delete column eventually -> make C function?
+
+#            if mergeIds:
+#                u_seq['merged']=[o_seq.id]
 
         i+=1
-
+
     #TODO
     #if taxonomy is not None:
     #    mergeTaxonomyClassification(uniqSeq, taxonomy)
 
-    # Get informations to build the columns with merged attributes
-    merged_infos = {}
-    for u_id in uniques :
-        u_seq = uniques[u_id]
-        for mkey in u_seq :
-            if mkey != 'idx' :
-                mkey_dict = u_seq[mkey]
-                if mkey not in merged_infos :
-                    merged_infos[mkey] = {}
-                    mkey_infos = merged_infos[mkey]
-                    mkey_infos['nb_elts'] = len(mkey_dict.keys())
-                    mkey_infos['elt_names'] = [k for k in mkey_dict]
-                else :
-                    mkey_infos = merged_infos[mkey]
-                    for k in mkey_dict :
-                        if k not in mkey_infos['elt_names'] :
-                            mkey_infos['elt_names'].append(k)
-                            mkey_infos['nb_elts'] += 1
-
-    keys_to_del = []
-    for k in merged_infos :
-        if merged_infos[k]['nb_elts'] == 0:
-            keys_to_del.append(k)
-    for k in keys_to_del :
-        del merged_infos[k]
-
-    return (uniques, merged_infos)
 
 
 def run(config):
-
-    # TODO declare variables
+
+    cdef tuple input
+    cdef tuple output
+    cdef View_NUC_SEQS entries
+    cdef View_NUC_SEQS o_view
+    cdef ProgressBar pb
+
+    logger("info","obi uniq")
 
-    # Open DMS
-    d = DMS.open(config['obi']['defaultdms'])
-
-    # Open input view
-    entries = View.open(d, config['obi']['inputview'])
-
+    input = open_uri(config['obi']['inputURI'])
+
+    if input[2] != View_NUC_SEQS:
+        raise NotImplementedError('obi uniq only works on NUC_SEQS views')
+
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      newviewtype=View_NUC_SEQS)
+
+    entries = input[1]
+    o_view = output[1]
+
     # Initialize the progress bar
     pb = ProgressBar(len(entries), config, seconde=5)
 
@@ -243,55 +259,20 @@ def run(config):
 #        usm = uniqPrefixSequence
 #    else:
     usm = uniqSequence
-
-    # Create output view
-    view_class = View.get_view_class(entries.type)
-    if view_class == View_NUC_SEQS :
-        get_quality = tostr(QUALITY_COLUMN) in entries  # TODO
-        o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
-    else :
-        o_view = view_class.new(d, config['obi']['outputview'])
 
-    (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
+    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
 
-    if 'merge' in config['uniq'] :
-        merged_keys=set(config['uniq']['merge'])
-    else:
-        merged_keys=set()
-
-    if 'taxo' in config['obi'] :
-        merged_keys.add('taxid')
-
-    # TODO gotta handle special merged columns
-    for k in merged_keys:
-        merged_col_name = "merged_%s" % k
-        if merged_col_name in merged_infos :
-            i_col = entries[k]
-            Column.new_column(o_view,
-                              merged_col_name,
-                              OBI_INT,
-                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
-                              elements_names=merged_infos[merged_col_name]['elt_names'],
-                              comments=i_col.comments,
-                              alias=merged_col_name
-                              )
-
-    for u_id in uniques:
-        u_dict = uniques[u_id]
-        for merged_k in u_dict :
-            if merged_k in merged_infos :  # TODO don't enter irrelevant keys to begin with, instead
-                o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
+#    if 'merge' in config['uniq'] :
+#        merged_keys=set(config['uniq']['merge'])
+#    else:
+#        merged_keys=set()
+#
+#    if 'taxo' in config['obi'] :
+#        merged_keys.add('taxid')
 
     print("\n")
     print(repr(o_view))
-
-    d.close()
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
+
+    input[0].close()
+    output[0].close()
+
diff --git a/python/obitools3/dms/column/column.pyx b/python/obitools3/dms/column/column.pyx
index 9d64b22..65f37a3 100644
--- a/python/obitools3/dms/column/column.pyx
+++ b/python/obitools3/dms/column/column.pyx
@@ -371,8 +371,18 @@ cdef class Column_line :
         self._column.set_item(self._index, elt_id, value)
 
 
-    def __contains__(self, object element_name):
-        return (tobytes(element_name) in self._column.elements_names)
+    def get(self, object elt_id, object default=None):  # TODO returns default if None???
+        if elt_id in self:
+            return self._column.get_item(self._index, elt_id)
+        else:
+            return default
+
+
+    def __contains__(self, object elt_id):
+        if type(elt_id) == int:
+            return elt_id < self._column.nb_elements_per_line
+        else:
+            return (tobytes(elt_id) in self._column.elements_names)
 
 
     def __repr__(self) :
diff --git a/python/obitools3/dms/obiseq.pyx b/python/obitools3/dms/obiseq.pyx
index 20ac4d2..5ee1d89 100644
--- a/python/obitools3/dms/obiseq.pyx
+++ b/python/obitools3/dms/obiseq.pyx
@@ -21,7 +21,7 @@ cdef class Seq(dict) :
         if tags is not None :
             for k in tags:
                 k_b = tobytes(k)
-                self[k_b] = tags[k_b]
+                self[k_b] = tags[k]
 
 
     def __contains__(self, object key):
@@ -70,11 +70,10 @@ cdef class Nuc_Seq(Seq) :
     # nuc sequence property getter and setter
     @property
     def seq(self):
-        return self._seq
+        return self[NUC_SEQUENCE_COLUMN]
 
     @seq.setter
    def seq(self, object new_seq):  # @DuplicatedSignature
-        self._seq = new_seq
         self[NUC_SEQUENCE_COLUMN] = tobytes(new_seq)
 
     # sequence quality property getter and setter
diff --git a/python/obitools3/dms/view/typed_view/view_NUC_SEQS.pyx b/python/obitools3/dms/view/typed_view/view_NUC_SEQS.pyx
index bc0f89a..508bc6f 100644
--- a/python/obitools3/dms/view/typed_view/view_NUC_SEQS.pyx
+++ b/python/obitools3/dms/view/typed_view/view_NUC_SEQS.pyx
@@ -58,12 +58,25 @@ cdef class View_NUC_SEQS(View):
 
         return view
 
-# TODO
+    # TODO test time gain without
+    @OBIWrapper.checkIsActive
     def __getitem__(self, object item) :
         if type(item) == int :
             return Nuc_Seq_Stored(self, item)
         else :  # TODO assume str or bytes for optimization?
             return self.get_column(item)  # TODO very slow in practice
+
+
+    @OBIWrapper.checkIsActive
+    def __iter__(self):
+        # Iteration on each line of all columns
+
+        # Declarations
+        cdef index_t line_nb
+
+        # Yield each line
+        for line_nb in range(self.line_count) :
+            yield Nuc_Seq_Stored(self, line_nb)  # TODO? test if efficiency gain
diff --git a/python/obitools3/parsers/fasta.pyx b/python/obitools3/parsers/fasta.pyx
index b2f0b43..c619d03 100644
--- a/python/obitools3/parsers/fasta.pyx
+++ b/python/obitools3/parsers/fasta.pyx
@@ -6,7 +6,9 @@ Created on 30 mars 2016
 
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+import types
+
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 
 def fastaIterator(lineiterator,
@@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
 
     while True:
 
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :
@@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
 #                      definition,
 #                      tags=tags,
 #                     )
-
+        # TODO
         yield { "id" : ident,
                 "definition" : definition,
                 "sequence" : sequence,
@@ -105,65 +107,65 @@ def fastaNucIterator(lineiterator,
     cdef list s
     cdef bytes sequence
     cdef int lines_to_skip, ionly, read
-#    cdef OBI_Seq seq
+    cdef Nuc_Seq seq
 
     if only is None:
-        ionly=-1
+        ionly = -1
    else:
-        ionly=int(only)
+        ionly = int(only)
 
-    if isinstance(lineiterator,(str,bytes)):
+    if isinstance(lineiterator, (str, bytes)):
         lineiterator=uopen(lineiterator)
+    if isinstance(lineiterator, types.GeneratorType):
+        iterator = lineiterator
     if isinstance(lineiterator, LineBuffer):
-        lb=lineiterator
+        iterator = iter(lineiterator)
     else:
-        lb=LineBuffer(lineiterator,buffersize)
-
+        iterator = iter(LineBuffer(lineiterator, buffersize))
 
     skipped = 0
     read = 0
-    i = iter(lb)
 
     if firstline is None:
-        line = next(i)
+        line = next(iterator)
     else:
-        line = firstline
-
+        line = firstline
+
     while True:
-
-        if read >= ionly:
+
+        if ionly >= 0 and read >= ionly:
             break
-
+
         while skipped < skip :
-            line = next(i)
+            line = next(iterator)
             try:
                 while line[0]!='>':
-                    line = next(i)
+                    line = next(iterator)
             except StopIteration:
                 pass
             skipped += 1
 
         ident,tags,definition = parseHeader(line)
         s = []
-        line = next(i)
-
+        line = next(iterator)
+
         try:
             while line[0]!='>':
                 s.append(str2bytes(line)[0:-1])
-                line = next(i)
+                line = next(iterator)
         except StopIteration:
             pass
 
         sequence = b"".join(s)
 
-#        seq =
         seq = Nuc_Seq(ident, sequence,
-                      definition,
-                      None,-1,
-                      tags)
-
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
+
         yield seq
 
 #        yield { "id" : ident,
diff --git a/python/obitools3/parsers/fastq.pyx b/python/obitools3/parsers/fastq.pyx
index 18bc2c8..83ad7c4 100644
--- a/python/obitools3/parsers/fastq.pyx
+++ b/python/obitools3/parsers/fastq.pyx
@@ -6,7 +6,7 @@ Created on 30 mars 2016
 
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 
 def fastqIterator(lineiterator,
@@ -74,12 +74,11 @@ def fastqWithQualityIterator(lineiterator,
     else:
         hline = firstline
 
-
     for line in i:
-        if read >= ionly:
+
+        if ionly >= 0 and read >= ionly:
             break
-
+
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
@@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      quality,qualityoffset,
-                      tags)
+                      definition=definition,
+                      quality=quality,
+                      offset=qualityoffset,
+                      tags=tags)
 
         yield seq
 
@@ -149,22 +149,23 @@ def fastqWithoutQualityIterator(lineiterator,
         hline = next(i)
     else:
         hline = firstline
-
+
     for line in i:
-
-        if read >= ionly:
+
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
         next(i)
-
+
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
         yield seq
diff --git a/python/obitools3/parsers/universal.pyx b/python/obitools3/parsers/universal.pyx
index 8d3d0ab..2aa0bb5 100644
--- a/python/obitools3/parsers/universal.pyx
+++ b/python/obitools3/parsers/universal.pyx
@@ -41,11 +41,11 @@ def entryIteratorFactory(lineiterator,
     if isinstance(lineiterator, LineBuffer):
         lb=lineiterator
     else:
-        lb=LineBuffer(lineiterator,buffersize)
-
+        lb=LineBuffer(lineiterator, buffersize)
+
     i = iter(lb)
 
-    first=next(i)
+    first=next(i)
 
     format=b"tabular"
 
@@ -61,26 +61,29 @@ def entryIteratorFactory(lineiterator,
         format=b"ecopcrfile"
     elif is_ngsfilter_line(first):
         format=b"ngsfilter"
-
+
+    # TODO Temporary fix
+    first=None
+    lineiterator.seek(0)
+
     if format==b'fasta':
         if seqtype == b'nuc':
             return (fastaNucIterator(lineiterator,
-                                     skip,only,
-                                     first),
+                                     skip=skip,only=only,
+                                     firstline=first,
+                                     buffersize=buffersize),
                    Nuc_Seq)
         else:
            raise NotImplementedError()
     elif format==b'fastq':
         return (fastqIterator(lineiterator,
-                              skip,only,
-                              qualityoffset,
-                              first),
+                              skip=skip,only=only,
+                              qualityoffset=qualityoffset,
+                              noquality=noquality,
+                              firstline=first,
+                              buffersize=buffersize),
                 Nuc_Seq)
 
     raise NotImplementedError('File format not yet implemented')
-
-
-
-
diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx
index b3b254a..4a3d9cf 100644
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@@ -364,19 +364,22 @@ def open_uri(uri,
             if qualifiers[b"seqtype"]==b"nuc":
                 objclass = Nuc_Seq
                 if format==b"fasta":
-                    iseq = fastaNucIterator(file,skip,only)
+                    iseq = fastaNucIterator(file,
+                                            skip=skip,
+                                            only=only)
                 elif format==b"fastq":
                     iseq = fastqIterator(file,
-                                         skip,only,
-                                         offset,
-                                         noquality)
+                                         skip=skip,
+                                         only=only,
+                                         offset=offset,
+                                         noquality=noquality)
                 else:
                     raise NotImplementedError('Sequence file format not implemented')
             elif qualifiers[b"seqtype"]==b"prot":
                 raise NotImplementedError()
         else:
             iseq,objclass = entryIteratorFactory(file,
-                                                 skip,only,
+                                                 skip, only,
                                                  seqtype,
                                                  offset,
                                                  noquality,
@@ -388,13 +391,12 @@ def open_uri(uri,
                                                  stripwhite,
                                                  blanklineskip,
                                                  commentchar)
-
-    tmpdms = get_temp_dms()
-
-    return (file,iseq,objclass,urib)
-
-
+    #tmpdms = get_temp_dms()
+
+    return (file, iseq, objclass, urib)
+
+
+
+
-
-
diff --git a/src/bloom.h b/src/bloom.h
index 651e2e5..74f64ca 100755
--- a/src/bloom.h
+++ b/src/bloom.h
@@ -136,7 +136,7 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
 /** ***************************************************************************
  * Check if the given element is in the bloom filter. Remember this may
- * return false positive if a collision occured.
+ * return false positive if a collision occurred.
  *
  * Parameters:
  * -----------
diff --git a/src/obiavl.c b/src/obiavl.c
index dc1b58d..3fff360 100644
--- a/src/obiavl.c
+++ b/src/obiavl.c
@@ -2463,7 +2463,7 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Check if the AVL group is writable
 	if (!(avl_group->writable))
 	{
-		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);
+		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);	// Note: this error is read by the calling functions to clone the AVL group if needed
 		return -1;
 	}
 
@@ -2476,6 +2476,9 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Add in the current AVL
 	index_in_avl = (int32_t) obi_avl_add((avl_group->sub_avls)[avl_group->last_avl_idx], value);
 
+	if (index_in_avl < 0)
+		return -1;
+
 	bloom_add(&((((avl_group->sub_avls)[avl_group->last_avl_idx])->header)->bloom_filter), value, obi_blob_sizeof(value));
 
 	// Build the index containing the AVL index
diff --git a/src/obiblob.c b/src/obiblob.c
index 1547770..66a5c9c 100644
--- a/src/obiblob.c
+++ b/src/obiblob.c
@@ -32,7 +32,7 @@ Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_
 	Obi_blob_p blob;
 
 	// Allocate the memory for the blob structure
-	blob = (Obi_blob_p) malloc(sizeof(Obi_blob_t) + length_encoded_value);
+	blob = (Obi_blob_p) calloc(sizeof(Obi_blob_t) + length_encoded_value, sizeof(byte_t));
 	if (blob == NULL)
 	{
 		obi_set_errno(OBI_MALLOC_ERROR);
diff --git a/src/obidms.c b/src/obidms.c
index 6304f49..8db9592 100644
--- a/src/obidms.c
+++ b/src/obidms.c
@@ -240,7 +240,6 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	char* directory_name;
 	DIR*  dms_dir;
 	int   dms_file_descriptor;
-	size_t i, j;
 
 	// Build and check the directory name
 	directory_name = build_directory_name(dms_path);
@@ -318,7 +317,7 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	 */
 
 	// Create the informations file
-	if (create_dms_infos_file(dms_file_descriptor, basename(dms_path)) < 0)
+	if (create_dms_infos_file(dms_file_descriptor, basename((char*)dms_path)) < 0)
 		return NULL;
 
 	return obi_open_dms(dms_path);
@@ -333,7 +332,6 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 	int  infos_file_descriptor;
 	bool little_endian_dms;
 	bool little_endian_platform;
-	size_t i, j;
 
 	dms = NULL;
 
@@ -356,7 +354,7 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 		i++;
 	}
 	*/
-	strcpy(dms->dms_name, basename(dms_path));
+	strcpy(dms->dms_name, basename((char*)dms_path));
 
 	// Build and check the directory name including the relative path
 	complete_dms_path = build_directory_name(dms_path);
diff --git a/src/obidmscolumn.c b/src/obidmscolumn.c
index e3fdbc4..085c335 100644
--- a/src/obidmscolumn.c
+++ b/src/obidmscolumn.c
@@ -703,7 +703,7 @@ static int get_formatted_elt_names_length(const char* elements_names)
 
 static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line)
 {
-	return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line);
+	return getpagesize() / obi_sizeof(data_type) / nb_elements_per_line;
 }
 
@@ -919,6 +919,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	// The initial line count should be between the minimum (corresponding to the page size) and the maximum allowed
 	minimum_line_count = get_line_count_per_page(stored_data_type, nb_elements_per_line);
+	if (minimum_line_count == 0)	// Happens if high number of elements per line
+		minimum_line_count = 1;
 
 	if (nb_lines > MAXIMUM_LINE_COUNT)
 	{
 		obidebug(1, "\nCan't create column because of line count greater than the maximum allowed (%d)", MAXIMUM_LINE_COUNT);
@@ -1023,7 +1025,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	if (new_column->data == MAP_FAILED)
 	{
 		obi_set_errno(OBICOL_UNKNOWN_ERROR);
-		obidebug(1, "\nError mmapping the data of a column");
+		obidebug(1, "\nError mmapping the data of a column.\nArguments: data_size=%lu, column_file_descriptor=%d, header_size=%lu",
+				data_size, column_file_descriptor, header_size);
 		munmap(new_column->header, header_size);
 		close(column_file_descriptor);
 		free(new_column);
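
Notes on the patch (illustrative sketches)

The new import loop in commands/import.pyx infers an OBIType for each tag column from the first value seen (get_obitype) and, when a later line brings a wider type, more vector elements, or new dict keys, widens the type (update_obitype) or rewrites the column. Below is a minimal pure-Python sketch of that widen-or-rewrite strategy; every name in it (infer_type, widen, ColumnSketch, WIDENING) is invented for the example and is not part of the obitools3 API.

    # Illustrative only: mirrors the widen-or-rewrite strategy of `obi import`.
    WIDENING = {(bool, int): int, (int, float): float, (bool, float): float}

    def infer_type(value):
        # Coarse stand-in for get_obitype(): map a value to a column type.
        for t in (bool, int, float, str):
            if isinstance(value, t):
                return t
        return None

    def widen(old, new):
        # Stand-in for update_obitype(): narrowest type holding both, else str.
        if new is None or old is new:
            return old
        return WIDENING.get((old, new), str)

    class ColumnSketch:
        # A list-backed "column" that rewrites itself when its type widens,
        # playing the role of rewrite_column_with_diff_attributes() in the patch.
        def __init__(self, typ):
            self.typ = typ
            self.data = []

        def append(self, value):
            new_typ = widen(self.typ, infer_type(value))
            if new_typ is not self.typ:
                self.data = [None if v is None else new_typ(v) for v in self.data]
                self.typ = new_typ
            self.data.append(None if value is None else self.typ(value))

    col = ColumnSketch(int)
    for v in (1, 2, None, 3.5, "x"):
        col.append(v)
    print(col.typ, col.data)   # <class 'str'> ['1.0', '2.0', None, '3.5', 'x']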
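The rewritten uniqSequence() in commands/uniq.pyx is two-pass: the first pass sizes each merged_<key> column (its element names and their number), the second pass groups lines by their sequence (via seq_col.get_line_idx) and accumulates COUNT and the per-element tallies. The sketch below reproduces that two-pass shape on plain dicts; the record layout and key names are invented, and the sequence string stands in for the AVL line index used by the real code.

    # Illustrative two-pass dereplication over plain dict records.
    records = [
        {"seq": "AAC", "count": 2, "sample": "s1"},
        {"seq": "AAC", "count": 1, "sample": "s2"},
        {"seq": "GGT", "sample": "s1"},          # missing count: treated as 1
    ]
    merged_keys = ["sample"]

    # Pass 1: collect the element names each merged_<key> column will need.
    merged_infos = {}
    for rec in records:
        for key in merged_keys:
            if key in rec:
                merged_infos.setdefault("merged_%s" % key, set()).add(rec[key])

    # Pass 2: group by sequence, summing counts and per-element tallies.
    uniques = {}
    for rec in records:
        count = rec.get("count", 1)
        out = uniques.setdefault(
            rec["seq"],
            {"seq": rec["seq"], "count": 0,
             **{m: dict.fromkeys(sorted(names), 0)
                for m, names in merged_infos.items()}})
        out["count"] += count
        for key in merged_keys:
            if key in rec:
                out["merged_%s" % key][rec[key]] += count

    for rec in uniques.values():
        print(rec)
    # {'seq': 'AAC', 'count': 3, 'merged_sample': {'s1': 2, 's2': 1}}
    # {'seq': 'GGT', 'count': 1, 'merged_sample': {'s1': 1, 's2': 0}}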
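Several parser hunks replace "read >= ionly" with "ionly >= 0 and read >= ionly", so that only=None (encoded as the sentinel -1) means "no limit" instead of stopping immediately. Here is the skip/only pattern in isolation, as a small self-contained generator with invented names:

    def limited(entries, skip=0, only=None):
        # Yield entries after skipping `skip` of them; only=None means no
        # limit, mirroring the ionly = -1 sentinel used by the parsers.
        ionly = -1 if only is None else int(only)
        read = 0
        for i, entry in enumerate(entries):
            if i < skip:
                continue
            if ionly >= 0 and read >= ionly:
                break
            yield entry
            read += 1

    print(list(limited("abcdef", skip=1, only=3)))   # ['b', 'c', 'd']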
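Column_line (dms/column/column.pyx) gains a dict-like get() built on the new two-way __contains__, which accepts either an integer element index or an element name. The following plain-Python stand-in only mimics the semantics of the patched methods; LineSketch and its fields are invented:

    class LineSketch:
        # Membership accepts an element index or an element name, like the
        # patched Column_line.__contains__; get() then mirrors dict.get().
        def __init__(self, names, values):
            self.names = list(names)
            self.values = list(values)
        def __contains__(self, elt_id):
            if isinstance(elt_id, int):
                return elt_id < len(self.values)
            return elt_id in self.names
        def __getitem__(self, elt_id):
            idx = elt_id if isinstance(elt_id, int) else self.names.index(elt_id)
            return self.values[idx]
        def get(self, elt_id, default=None):
            return self[elt_id] if elt_id in self else default

    line = LineSketch([b"s1", b"s2"], [3, 7])
    print(line.get(b"s1"), line.get(b"missing", 0), 1 in line)   # 3 0 True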
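In src/obidmscolumn.c, get_line_count_per_page() now chains two integer divisions instead of dividing by the product (for positive operands the result is identical, and the intermediate product that could overflow for very wide lines disappears, though the commit itself does not state the motivation), and obi_create_column() clamps the result to at least one line per page. The same arithmetic as a Python stand-in, with assumed element sizes:

    import mmap

    def lines_per_page(elt_size, nb_elements_per_line):
        # Chained floor division, as in the fixed C expression, then clamp
        # to 1 so a line wider than a page still yields a usable count.
        count = mmap.PAGESIZE // elt_size // nb_elements_per_line
        return max(count, 1)

    print(lines_per_page(8, 1))      # 512 on a 4 KiB page
    print(lines_per_page(8, 10000))  # 1 (clamped: one line is wider than a page)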