Merge branch 'master' of git@git.metabarcoding.org:obitools/obitools3.git

Conflicts:
	python/obitools3/commands/import.pyx
2017-09-05 08:59:45 +02:00
15 changed files with 450 additions and 433 deletions

View File

@@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
 from obitools3.parsers.fastq import fastqIterator
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
 from obitools3.dms.view.view cimport View
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS  # TODO cimport doesn't work
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
-from obitools3.dms.obiseq cimport Nuc_Seq
+from obitools3.dms.obiseq import Nuc_Seq
 from obitools3.utils cimport tobytes, \
                              get_obitype, \
@@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
 from obitools3.dms.capi.obierrno cimport obi_errno
 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
@@ -50,6 +49,8 @@ def addOptions(parser):
 def run(config):
+    cdef tuple input
+    cdef tuple output
     cdef int i
     cdef type value_type
     cdef obitype_t value_obitype
@@ -62,7 +63,6 @@ def run(config):
     cdef View view
     cdef object iseq
     cdef object seq
-    cdef object inputs
     cdef Column id_col
     cdef Column def_col
     cdef Column seq_col
@@ -71,7 +71,7 @@ def run(config):
     cdef bint rewrite
     cdef dict dcols
     cdef int skipping
-    cdef str tag
+    cdef bytes tag
     cdef object value
     cdef list elt_names
     cdef int old_nb_elements_per_line
@@ -84,163 +84,157 @@ def run(config):
     logger("info","obi import : imports file into an DMS")
 
-    inputs = open_uri(config['obi']['inputURI'])
-    if inputs[2]==Nuc_Seq:
-        v = View_NUC_SEQS
-    else:
-        v= View
-    output = open_uri(config['obi']['outputURI'],
-                      input=False,
-                      newviewtype=v)
-    print(input)
-    print(output)
-    sys.exit()
-
-#     pb = ProgressBar(1000000, config, seconde=5)     # TODO should be number of records in file
-#
-#     inputs = uopen(config['import']['filename'])
-#
-#     # Create or open DMS
-#     d = DMS.open_or_new(config['obi']['defaultdms'])
-#
-#     get_quality = False
-#     NUC_SEQS_view = False
-#     if config['import']['seqinformat']=='fasta':
-#         get_quality = False
-#         NUC_SEQS_view = True
-#         iseq = fastaIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     elif config['import']['seqinformat']=='fastq':
-#         get_quality = True
-#         NUC_SEQS_view = True
-#         iseq = fastqIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     else:
-#         raise RuntimeError('File format not handled')
-#
-#     # Save basic columns in variables for optimization
-#     if NUC_SEQS_view :
-#         id_col = view["ID"]
-#         def_col = view["DEFINITION"]
-#         seq_col = view["NUC_SEQ"]
-#         if get_quality :
-#             qual_col = view["QUALITY"]
-#
-#     dcols = {}
-#
-#     i = 0
-#     for seq in iseq :
-#         if i == config['import']['only'] :
-#             break
-#         else :
-#             pb(i)
-#             if NUC_SEQS_view :
-#                 id_col[i] = seq['id']
-#                 def_col[i] = seq['definition']
-#                 seq_col[i] = seq['sequence']
-#                 if get_quality :
-#                     qual_col[i] = seq['quality']
-#
-#             for tag in seq['tags'] :
-#
-#                 value = seq['tags'][tag]
-#
-#                 # Check NA value
-#                 if value == config['import']['NA'] :
-#                     value = None
-#
-#                 if tag not in dcols :
-#
-#                     value_type = type(value)
-#                     nb_elts = 1
-#                     value_obitype = OBI_VOID
-#
-#                     if value_type == dict or value_type == list :
-#                         nb_elts = len(value)
-#                         elt_names = list(value)
-#                     else :
-#                         nb_elts = 1
-#                         elt_names = None
-#
-#                     value_obitype = get_obitype(value)
-#
-#                     if value_obitype != OBI_VOID :
-#                         dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-#
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#
-#                     # TODO else log error?
-#
-#                 else :
-#
-#                     rewrite = False
-#
-#                     # Check type adequation
-#                     old_type = dcols[tag][1]
-#                     new_type = OBI_VOID
-#                     new_type = update_obitype(old_type, value)
-#                     if old_type != new_type :
-#                         rewrite = True
-#
-#                     try:
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#
-#                     except IndexError :
-#
-#                         value_type = type(value)
-#                         old_column = dcols[tag][0]
-#                         old_nb_elements_per_line = old_column.nb_elements_per_line
-#                         new_nb_elements_per_line = 0
-#                         old_elements_names = old_column.elements_names
-#                         new_elements_names = None
-#
-#                         #####################################################################
-#
-#                         # Check the length and keys of column lines if needed
-#                         if value_type == dict :    # Check dictionary keys
-#                             for k in value :
-#                                 if k not in old_elements_names :
-#                                     new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
-#                                     rewrite = True
-#                                     break
-#
-#                         elif value_type == list or value_type == tuple :    # Check vector length
-#                             if old_nb_elements_per_line < len(value) :
-#                                 new_nb_elements_per_line = len(value)
-#                                 rewrite = True
-#
-#                         #####################################################################
-#
-#                         if rewrite :
-#                             if new_nb_elements_per_line == 0 and new_elements_names is not None :
-#                                 new_nb_elements_per_line = len(new_elements_names)
-#
-#                             # Reset obierrno
-#                             obi_errno = 0
-#
-#                             dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
-#                                                                                    new_data_type=new_type,
-#                                                                                    new_nb_elements_per_line=new_nb_elements_per_line,
-#                                                                                    new_elements_names=new_elements_names),
-#                                           value_obitype)
-#
-#                             # Update the dictionary:
-#                             for t in dcols :
-#                                 dcols[t] = (view[t], dcols[t][1])
-#
-#                             # Fill value
-#                             dcols[tag][0][i] = value
-#
-#             i+=1
-#
-#     print("\n")
-#     print(view.__repr__())
-#
-#     d.close()
+    input = open_uri(config['obi']['inputURI'])
+    if input[2]==Nuc_Seq:
+        v = View_NUC_SEQS
+    else:
+        v = View
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      newviewtype=v)
+    #print(input)
+    #print(output)
+
+    pb = ProgressBar(1000000, config, seconde=5)     # TODO should be number of records in file
+    iseq = input[1]
+
+    get_quality = False
+    NUC_SEQS_view = False
+    if isinstance(output[1], View) :
+        view = output[1]
+        if output[2] == View_NUC_SEQS :
+            NUC_SEQS_view = True
+            if "QUALITY" in view :  # TODO
+                get_quality = True
+    else:
+        raise NotImplementedError()
+
+    # Save basic columns in variables for optimization
+    if NUC_SEQS_view :
+        id_col = view[b"ID"]
+        def_col = view[b"DEFINITION"]
+        seq_col = view[b"NUC_SEQ"]
+        if get_quality :
+            qual_col = view[b"QUALITY"]
+
+    dcols = {}
+
+    i = 0
+    for seq in iseq :
+        pb(i)
+        if NUC_SEQS_view :
+            id_col[i] = seq.id
+            def_col[i] = seq.definition
+            seq_col[i] = seq.seq
+            if get_quality :
+                qual_col[i] = seq.quality
+
+        for tag in seq :
+            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm...
+
+                value = seq[tag]
+
+                # Check NA value
+                if value == config['obi']['nastring'] :
+                    value = None
+
+                if tag not in dcols :
+
+                    value_type = type(value)
+                    nb_elts = 1
+                    value_obitype = OBI_VOID
+
+                    if value_type == dict or value_type == list :
+                        nb_elts = len(value)
+                        elt_names = list(value)
+                    else :
+                        nb_elts = 1
+                        elt_names = None
+
+                    value_obitype = get_obitype(value)
+
+                    if value_obitype != OBI_VOID :
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    # TODO else log error?
+
+                else :
+
+                    rewrite = False
+
+                    # Check type adequation
+                    old_type = dcols[tag][1]
+                    new_type = OBI_VOID
+                    new_type = update_obitype(old_type, value)
+                    if old_type != new_type :
+                        rewrite = True
+
+                    try:
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    except IndexError :
+
+                        value_type = type(value)
+                        old_column = dcols[tag][0]
+                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        new_nb_elements_per_line = 0
+                        old_elements_names = old_column.elements_names
+                        new_elements_names = None
+
+                        #####################################################################
+
+                        # Check the length and keys of column lines if needed
+                        if value_type == dict :    # Check dictionary keys
+                            for k in value :
+                                if k not in old_elements_names :
+                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+                                    rewrite = True
+                                    break
+
+                        elif value_type == list or value_type == tuple :    # Check vector length
+                            if old_nb_elements_per_line < len(value) :
+                                new_nb_elements_per_line = len(value)
+                                rewrite = True
+
+                        #####################################################################
+
+                        if rewrite :
+                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+                                new_nb_elements_per_line = len(new_elements_names)
+
+                            # Reset obierrno
+                            obi_errno = 0
+
+                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
+                                                                                   new_data_type=new_type,
+                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+                                                                                   new_elements_names=new_elements_names),
+                                          value_obitype)
+
+                            # Update the dictionary:
+                            for t in dcols :
+                                dcols[t] = (view[t], dcols[t][1])
+
+                            # Fill value
+                            dcols[tag][0][i] = value
+
+        i+=1
+
+    print("\n")
+    print(view.__repr__())
+
+    input[0].close()  # TODO
+    output[0].close()
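
The column-typing logic in this hunk is easier to read outside the diff. Below is a minimal plain-Python sketch of the pattern; the get_obitype/update_obitype stand-ins are hypothetical simplifications of the helpers in obitools3.utils, not the real implementations. The idea: the first value seen for a tag fixes the column's type, and a later value that does not fit triggers a rewrite to a wider type.

    OBI_VOID, OBI_INT, OBI_FLOAT, OBI_STR = "OBI_VOID", "OBI_INT", "OBI_FLOAT", "OBI_STR"

    def get_obitype(value):
        # Hypothetical stand-in: map a Python value to an OBIType tag
        if value is None:
            return OBI_VOID
        if isinstance(value, int):
            return OBI_INT
        if isinstance(value, float):
            return OBI_FLOAT
        return OBI_STR

    def update_obitype(old_type, value):
        # Hypothetical stand-in: promote OBI_INT to OBI_FLOAT, otherwise fall back to OBI_STR
        new_type = get_obitype(value)
        if old_type == new_type or new_type == OBI_VOID:
            return old_type
        if {old_type, new_type} == {OBI_INT, OBI_FLOAT}:
            return OBI_FLOAT
        return OBI_STR

    dcols = {}
    for i, record in enumerate([{b"count": 1}, {b"count": 2.5}]):
        for tag, value in record.items():
            if tag not in dcols:
                dcols[tag] = ([None] * 10, get_obitype(value))   # create the column
            else:
                new_type = update_obitype(dcols[tag][1], value)
                if new_type != dcols[tag][1]:                    # would rewrite the column on disk
                    dcols[tag] = (dcols[tag][0], new_type)
            dcols[tag][0][i] = value

    print(dcols[b"count"][1])   # OBI_FLOAT: the int column was promoted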

View File

@@ -0,0 +1,8 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+
+
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
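
A note on the `=*` markers, since they look odd next to the `.pyx` code: in a Cython `.pxd` declaration, `=*` only records that the argument has a default; the actual default value is supplied by the matching definition in the `.pyx` file. A sketch of the pairing, with hypothetical abbreviated signatures:

    # uniq.pxd -- declaration side: '=*' means "this argument has a default"
    # cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb,
    #                   Taxonomy taxonomy=*, bint mergeIds=*)

    # uniq.pyx -- definition side: the concrete default values live here
    # cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb,
    #                   Taxonomy taxonomy=None, bint mergeIds=False):
    #     ...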

View File

@@ -1,59 +1,29 @@
 #cython: language_level=3
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
-from obitools3.dms.view.view import View  # TODO cimport doesn't work
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
-from obitools3.dms.obiseq cimport Nuc_Seq
-from obitools3.dms.column.column cimport Column
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT
-from obitools3.utils cimport tostr
+from obitools3.dms.dms cimport DMS
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.view cimport View, Line
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+from obitools3.dms.column.column cimport Column, Column_line
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
+from obitools3.apps.config import logger
 
 # TODO silence non-implemented options
 
 __title__="Groups records together"
 
-default_config = { 'inputview' : None,
-                   'outputview' : None
-                 }
-
 def addOptions(parser):
-    # TODO put this common group somewhere else but I don't know where
-    group=parser.add_argument_group('DMS and view options')
-
-    group.add_argument('--default-dms','-d',
-                       action="store", dest="obi:defaultdms",
-                       metavar='<DMS NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the default DMS for reading and writing data.")
-
-    group.add_argument('--input-view','-i',
-                       action="store", dest="obi:inputview",
-                       metavar='<INPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the input view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--output-view','-o',
-                       action="store", dest="obi:outputview",
-                       metavar='<OUTPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the output view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--taxo','-t',
-                       action="store", dest="obi:taxo",
-                       metavar='<TAXONOMY NAME>',
-                       default='',  # TODO not None because if it's None, the option is not entered in the option dictionary.
-                       type=str,
-                       help="Name of the taxonomy to use.")
+    addSequenceInputOption(parser)
+    addMinimalOutputOption(parser)
 
     group = parser.add_argument_group('obi uniq specific options')
@@ -89,152 +59,198 @@ def addOptions(parser):
 # TODO taxonomy
-# TODO
-COUNT_COLUMN_str = tostr(COUNT_COLUMN)
-
-def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
+
+    cdef int i
+    cdef int o_idx
+    cdef int u_idx
+    cdef int u_id
+    cdef int i_count
+    cdef set mergedKeys
+    cdef dict uniques
+    cdef dict merged_infos
+    cdef object iter_view
+    cdef Line i_seq
+    cdef Line o_seq
+    cdef str key
+    cdef bytes key_b
+    cdef str mkey
+    cdef str merged_col_name
+    cdef Column i_col
+    cdef Column seq_col
+    cdef object to_merge
+    cdef Column_line mcol
+    cdef Column_line i_mcol
 
     uniques = {}
 
     if categories is None:
         categories=[]
 
-    if mergedKey is not None:
-        mergedKey=set(mergedKey)
+    if mergedKeys_list is not None:
+        mergedKeys=set(mergedKeys_list)
     else:
-        mergedKey=set()
+        mergedKeys=set()
 
-    if taxonomy is not None:
-        mergedKey.add('taxid')
+#    if taxonomy is not None:
+#        mergedKeys.add('taxid')
 
-    # Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure
+    # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
+    #logger("info", "obi uniq", "First browsing through the input")
-    o_idx = 0
+    merged_infos = {}
     i = 0
-    seq_col = view[NUC_SEQUENCE_COLUMN]
     iter_view = iter(view)
-    for i_seq in iter_view :
-        pass
+    for i_seq in iter_view:
+        pb(i)
+        for key in mergedKeys:
+            mkey = "merged_%s" % key
+            if key in i_seq:  # TODO what if mkey already in i_seq?
+                if mkey not in merged_infos:
+                    merged_infos[mkey] = {}
+                    mkey_infos = merged_infos[mkey]
+                    mkey_infos['nb_elts'] = 1
+                    mkey_infos['elt_names'] = [i_seq[key]]
+                else:
+                    mkey_infos = merged_infos[mkey]
+                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
+                        mkey_infos['elt_names'].append(i_seq[key])
+                        mkey_infos['nb_elts'] += 1
+        i+=1
+
+    for key in mergedKeys:
+        merged_col_name = "merged_%s" % key
+        i_col = view[key]
+        Column.new_column(o_view,
+                          merged_col_name,
+                          OBI_INT,
+                          nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
+                          elements_names=merged_infos[merged_col_name]['elt_names'],
+                          comments=i_col.comments,
+                          alias=merged_col_name  # TODO what if it already exists
+                          )
+
+    del(merged_infos)
+
+    #logger("info", "obi uniq", "Second browsing through the input")
+    i = 0
+    o_idx = 0
+    seq_col = view[NUC_SEQUENCE_COLUMN]
     iter_view = iter(view)
     for i_seq in iter_view :
         pb(i)
-        # utiliser l'index des AVLs, faire l'API
-        #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
+        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
         u_id = seq_col.get_line_idx(i)
 
         if u_id in uniques:
-            u_seq = uniques[u_id]
-            o_seq = o_view[u_seq['idx']]
-            if COUNT_COLUMN_str in i_seq:
-                o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
+            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
+                i_count = 1
             else:
-                o_seq[COUNT_COLUMN_str] += 1
-                # seq['COUNT']=1
+                i_count = i_seq[COUNT_COLUMN]
+
+            u_idx = uniques[u_id]
+            o_seq = o_view[u_idx]
+            o_seq[COUNT_COLUMN] += i_count
 
 #            if taxonomy is not None and 'taxid' in seq:
 #                s['merged_taxid'][seq['taxid']]=
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:  # TODO
-                    if 'taxid_dist' in i_seq:
-                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
-                    if 'taxid' in i_seq:
-                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+            for key in mergedKeys:
+#                if key=='taxid' and mergeIds:  # TODO
+#                    if 'taxid_dist' in i_seq:
+#                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
+#                    if 'taxid' in i_seq:
+#                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
 
                 mkey = "merged_%s" % key
                 #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
                 if key in i_seq:
-                    u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
+                    to_merge = i_seq[key]
+                    mcol = o_seq[mkey]
+                    if mcol[to_merge] is None:
+                        mcol[to_merge] = i_count
+                    else:
+                        mcol[to_merge] = mcol[to_merge] + i_count
                 #cas ou merged_keys existe deja
-                else:
+                else:  # TODO is this a good else
                     if mkey in i_seq:
-                        for skey in i_seq[mkey]:
-                            u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
-
-            for key in i_seq.keys():
+                        mcol = o_seq[mkey]
+                        i_mcol = i_seq[mkey]
+                        for key_b in i_mcol:
+                            if mcol[key_b] is None:
+                                mcol[key_b] = i_mcol[key_b]
+                            else:
+                                mcol[key_b] = mcol[key_b] + i_mcol[key_b]
+
+            for key_b in i_seq.keys():
                 # Merger proprement l'attribut merged s'il existe
-                if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str :  #and key[0:7]!='merged_' and key!='merged': TODO check this
-                    o_seq[key] = None
-
-            if mergeIds:
-                u_seq['merged'].append(i_seq.id)
+                # TODO make special columns list?
+                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
+                    o_seq[key_b] = None
+
+#            if mergeIds:  # TODO
+#                u_seq['merged'].append(i_seq.id)
 
         else:
             o_view[o_idx] = i_seq
             o_seq = o_view[o_idx]
-            uniques[u_id] = {'idx':o_idx}
-            u_seq = uniques[u_id]
+            uniques[u_id] = o_idx
             o_idx += 1
 
-            if COUNT_COLUMN_str not in o_seq:
-                o_seq[COUNT_COLUMN_str] = 1
+            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
+                o_seq[COUNT_COLUMN] = 1
 
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:
-                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
-                        u_seq["taxid_dist"] = {}
-                    else :
-                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
-                    if 'taxid' in o_seq:
-                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
+            for key in mergedKeys:
+#                if key=='taxid' and mergeIds:
+#                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+#                        u_seq["taxid_dist"] = {}
+#                    else :
+#                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
+#                    if 'taxid' in o_seq:
+#                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
 
                 mkey = "merged_%s" % key
-                if mkey not in o_seq:
-                    u_seq[mkey]={}
-                else :
-                    u_seq[mkey] = o_seq[mkey]
                 if key in o_seq:
-                    u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
-                    o_seq[key] = None
-
-            if mergeIds:
-                u_seq['merged']=[o_seq.id]
+                    to_merge = o_seq[key]
+                    mcol = o_seq[mkey]
+                    if to_merge in mcol and mcol[to_merge] is not None:
+                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
+                    else:
+                        mcol[to_merge] = o_seq[COUNT_COLUMN]
+                    o_seq[key] = None  # TODO delete column eventually -> make C function?
+
+#            if mergeIds:
+#                u_seq['merged']=[o_seq.id]
 
         i+=1
 
     #TODO
     #if taxonomy is not None:
     #    mergeTaxonomyClassification(uniqSeq, taxonomy)
 
-    # Get informations to build the columns with merged attributes
-    merged_infos = {}
-    for u_id in uniques :
-        u_seq = uniques[u_id]
-        for mkey in u_seq :
-            if mkey != 'idx' :
-                mkey_dict = u_seq[mkey]
-                if mkey not in merged_infos :
-                    merged_infos[mkey] = {}
-                    mkey_infos = merged_infos[mkey]
-                    mkey_infos['nb_elts'] = len(mkey_dict.keys())
-                    mkey_infos['elt_names'] = [k for k in mkey_dict]
-                else :
-                    mkey_infos = merged_infos[mkey]
-                    for k in mkey_dict :
-                        if k not in mkey_infos['elt_names'] :
-                            mkey_infos['elt_names'].append(k)
-                            mkey_infos['nb_elts'] += 1
-
-    keys_to_del = []
-    for k in merged_infos :
-        if merged_infos[k]['nb_elts'] == 0:
-            keys_to_del.append(k)
-    for k in keys_to_del :
-        del merged_infos[k]
-
-    return (uniques, merged_infos)
-
 def run(config):
-    # TODO declare variables
+    cdef tuple input
+    cdef tuple output
+    cdef View_NUC_SEQS entries
+    cdef View_NUC_SEQS o_view
+    cdef ProgressBar pb
 
-    # Open DMS
-    d = DMS.open(config['obi']['defaultdms'])
+    logger("info","obi uniq")
 
-    # Open input view
-    entries = View.open(d, config['obi']['inputview'])
+    input = open_uri(config['obi']['inputURI'])
+    if input[2] != View_NUC_SEQS:
+        raise NotImplementedError('obi uniq only works on NUC_SEQS views')
+
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      newviewtype=View_NUC_SEQS)
+
+    entries = input[1]
+    o_view = output[1]
 
     # Initialize the progress bar
     pb = ProgressBar(len(entries), config, seconde=5)
@@ -243,55 +259,20 @@ def run(config):
     # usm = uniqPrefixSequence
     # else:
     usm = uniqSequence
 
-    # Create output view
-    view_class = View.get_view_class(entries.type)
-    if view_class == View_NUC_SEQS :
-        get_quality = tostr(QUALITY_COLUMN) in entries  # TODO
-        o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
-    else :
-        o_view = view_class.new(d, config['obi']['outputview'])
-
-    (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
-
-    if 'merge' in config['uniq'] :
-        merged_keys=set(config['uniq']['merge'])
-    else:
-        merged_keys=set()
-
-    if 'taxo' in config['obi'] :
-        merged_keys.add('taxid')
-
-    # TODO gotta handle special merged columns
-    for k in merged_keys:
-        merged_col_name = "merged_%s" % k
-        if merged_col_name in merged_infos :
-            i_col = entries[k]
-            Column.new_column(o_view,
-                              merged_col_name,
-                              OBI_INT,
-                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
-                              elements_names=merged_infos[merged_col_name]['elt_names'],
-                              comments=i_col.comments,
-                              alias=merged_col_name
-                              )
-
-    for u_id in uniques:
-        u_dict = uniques[u_id]
-        for merged_k in u_dict :
-            if merged_k in merged_infos :  # TODO don't enter irrelevant keys to begin with, instead
-                o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
+    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
+
+#    if 'merge' in config['uniq'] :
+#        merged_keys=set(config['uniq']['merge'])
+#    else:
+#        merged_keys=set()
+#
+#    if 'taxo' in config['obi'] :
+#        merged_keys.add('taxid')
 
     print("\n")
     print(repr(o_view))
 
-    d.close()
+    input[0].close()
+    output[0].close()
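
The two-pass structure of the new uniqSequence is easier to see on toy data. A plain-Python sketch with hypothetical records (not the real Line/Column_line API): pass 1 collects the distinct values of each merged key so the merged_* columns can be created with the right element names, pass 2 folds duplicate sequences together and accumulates counts per element.

    records = [
        {"seq": b"AACC", "COUNT": 2, "sample": "A"},
        {"seq": b"AACC", "COUNT": 1, "sample": "B"},
        {"seq": b"GGTT", "COUNT": 1, "sample": "A"},
    ]

    # Pass 1: element names for the merged_sample column
    elt_names = []
    for r in records:
        if r["sample"] not in elt_names:
            elt_names.append(r["sample"])          # -> ['A', 'B']

    # Pass 2: one output line per distinct sequence, counts accumulated per sample
    uniques = {}   # sequence -> output line
    for r in records:
        o = uniques.setdefault(r["seq"], {"COUNT": 0,
                                          "merged_sample": dict.fromkeys(elt_names, 0)})
        o["COUNT"] += r["COUNT"]
        o["merged_sample"][r["sample"]] += r["COUNT"]

    print(uniques[b"AACC"])   # {'COUNT': 3, 'merged_sample': {'A': 2, 'B': 1}}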

View File

@@ -371,8 +371,18 @@ cdef class Column_line :
         self._column.set_item(self._index, elt_id, value)
 
-    def __contains__(self, object element_name):
-        return (tobytes(element_name) in self._column.elements_names)
+    def get(self, object elt_id, object default=None):  # TODO returns default if None???
+        if elt_id in self:
+            return self._column.get_item(self._index, elt_id)
+        else:
+            return default
+
+    def __contains__(self, object elt_id):
+        if type(elt_id) == int:
+            return elt_id < self._column.nb_elements_per_line
+        else:
+            return (tobytes(elt_id) in self._column.elements_names)
 
     def __repr__(self) :
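
The new methods give Column_line a dict-like read API. A sketch of the intended semantics, with a plain dict standing in for a line and hypothetical values; note the TODO in the diff: because get() tests membership rather than the stored value, an element explicitly set to None comes back as None, not as the default.

    line = {b"sample": 3, b"flag": None}      # stand-in for a Column_line

    print(line.get(b"sample", 0))             # 3
    print(line.get(b"missing", 0))            # 0 -- absent element falls back to the default
    print(line.get(b"flag", 0))               # None -- present-but-None is NOT replaced

    # __contains__ also accepts an element index: for a real Column_line,
    # `2 in line` asks whether index 2 < nb_elements_per_line.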

View File

@@ -21,7 +21,7 @@ cdef class Seq(dict) :
         if tags is not None :
             for k in tags:
                 k_b = tobytes(k)
-                self[k_b] = tags[k_b]
+                self[k_b] = tags[k]
 
     def __contains__(self, object key):
@@ -70,11 +70,10 @@ cdef class Nuc_Seq(Seq) :
     # nuc sequence property getter and setter
     @property
     def seq(self):
-        return self._seq
+        return self[NUC_SEQUENCE_COLUMN]
 
     @seq.setter
     def seq(self, object new_seq):  # @DuplicatedSignature
-        self._seq = new_seq
         self[NUC_SEQUENCE_COLUMN] = tobytes(new_seq)
 
     # sequence quality property getter and setter
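
This hunk removes a duplicated storage location: the sequence used to be kept both in a private _seq attribute and under the NUC_SEQ key, with the getter reading the attribute while the dict held the canonical value. A minimal sketch of the new single-storage behaviour (plain Python, with a hypothetical key name and a simplified tobytes):

    NUC_SEQUENCE_COLUMN = b"NUC_SEQ"

    class Seq(dict):
        @property
        def seq(self):
            return self[NUC_SEQUENCE_COLUMN]          # read through the dict key

        @seq.setter
        def seq(self, new_seq):
            # tobytes() stand-in: store everything as bytes
            self[NUC_SEQUENCE_COLUMN] = new_seq if isinstance(new_seq, bytes) else new_seq.encode()

    s = Seq()
    s.seq = "aacct"
    print(s.seq is s[NUC_SEQUENCE_COLUMN])   # True -- one storage location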

View File

@@ -58,12 +58,25 @@ cdef class View_NUC_SEQS(View):
 
         return view
 
-    # TODO
+    # TODO test time gain without
+    @OBIWrapper.checkIsActive
     def __getitem__(self, object item) :
         if type(item) == int :
             return Nuc_Seq_Stored(self, item)
         else :  # TODO assume str or bytes for optimization?
             return self.get_column(item)  # TODO hyper lent dans la pratique
 
+    @OBIWrapper.checkIsActive
+    def __iter__(self):
+        # Iteration on each line of all columns
+        # Declarations
+        cdef index_t line_nb
+        # Yield each line
+        for line_nb in range(self.line_count) :
+            yield Nuc_Seq_Stored(self, line_nb)
+
     # TODO? test if efficiency gain
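
The added __iter__ makes a view directly iterable: it yields one Nuc_Seq_Stored wrapper per line index rather than materializing lines up front. A reduced sketch of the pattern with stand-in classes:

    class LineProxy:                     # stand-in for Nuc_Seq_Stored
        def __init__(self, view, idx):
            self.view, self.idx = view, idx

    class ViewSketch:                    # stand-in for View_NUC_SEQS
        def __init__(self, line_count):
            self.line_count = line_count
        def __iter__(self):
            for line_nb in range(self.line_count):
                yield LineProxy(self, line_nb)   # one lazy wrapper per line

    print([line.idx for line in ViewSketch(3)])   # [0, 1, 2]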

View File

@@ -6,7 +6,9 @@ Created on 30 mars 2016
 
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+import types
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 def fastaIterator(lineiterator,
@@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
     while True:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :
@@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
 #                     definition,
 #                     tags=tags,
 #                    )
-
+        # TODO
         yield { "id" : ident,
                 "definition" : definition,
                 "sequence" : sequence,
@@ -105,65 +107,65 @@ def fastaNucIterator(lineiterator,
     cdef list s
     cdef bytes sequence
     cdef int lines_to_skip, ionly, read
-#    cdef OBI_Seq seq
+    cdef Nuc_Seq seq
 
     if only is None:
-        ionly=-1
+        ionly = -1
     else:
-        ionly=int(only)
+        ionly = int(only)
 
-    if isinstance(lineiterator,(str,bytes)):
+    if isinstance(lineiterator, (str, bytes)):
         lineiterator=uopen(lineiterator)
+
+    if isinstance(lineiterator, types.GeneratorType):
+        iterator = lineiterator
     if isinstance(lineiterator, LineBuffer):
-        lb=lineiterator
+        iterator = iter(lineiterator)
     else:
-        lb=LineBuffer(lineiterator,buffersize)
+        iterator = iter(LineBuffer(lineiterator, buffersize))
 
     skipped = 0
     read = 0
-    i = iter(lb)
 
     if firstline is None:
-        line = next(i)
+        line = next(iterator)
     else:
         line = firstline
 
     while True:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :
-            line = next(i)
+            line = next(iterator)
             try:
                 while line[0]!='>':
-                    line = next(i)
+                    line = next(iterator)
             except StopIteration:
                 pass
             skipped += 1
 
         ident,tags,definition = parseHeader(line)
         s = []
-        line = next(i)
+        line = next(iterator)
 
         try:
             while line[0]!='>':
                 s.append(str2bytes(line)[0:-1])
-                line = next(i)
+                line = next(iterator)
         except StopIteration:
             pass
 
         sequence = b"".join(s)
 
-#        seq =
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
         yield seq
 
 #    yield { "id" : ident,
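
The repeated `ionly >= 0 and read >= ionly` change across these parsers fixes a real bug: with only=None the iterators use ionly = -1 as a sentinel, and the old test `read >= ionly` was already true on the first record (0 >= -1), so nothing was ever yielded. A minimal reproduction of the fix:

    def take(items, only=None):
        ionly = -1 if only is None else int(only)
        read = 0
        for item in items:
            if ionly >= 0 and read >= ionly:   # old guard was just: read >= ionly
                break
            yield item
            read += 1

    print(list(take("abc")))           # ['a', 'b', 'c'] -- the old guard yielded []
    print(list(take("abc", only=2)))   # ['a', 'b']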

View File

@@ -6,7 +6,7 @@ Created on 30 mars 2016
 
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 def fastqIterator(lineiterator,
@@ -74,12 +74,11 @@ def fastqWithQualityIterator(lineiterator,
     else:
         hline = firstline
 
     for line in i:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
@@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      quality,qualityoffset,
-                      tags)
+                      definition=definition,
+                      quality=quality,
+                      offset=qualityoffset,
+                      tags=tags)
 
         yield seq
@@ -149,22 +149,23 @@ def fastqWithoutQualityIterator(lineiterator,
         hline = next(i)
     else:
         hline = firstline
 
     for line in i:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
         next(i)
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
         yield seq

View File

@@ -41,11 +41,11 @@ def entryIteratorFactory(lineiterator,
     if isinstance(lineiterator, LineBuffer):
         lb=lineiterator
     else:
-        lb=LineBuffer(lineiterator,buffersize)
+        lb=LineBuffer(lineiterator, buffersize)
 
     i = iter(lb)
 
     first=next(i)
 
     format=b"tabular"
@@ -61,26 +61,29 @@ def entryIteratorFactory(lineiterator,
         format=b"ecopcrfile"
     elif is_ngsfilter_line(first):
         format=b"ngsfilter"
 
+    # TODO Temporary fix
+    first=None
+    lineiterator.seek(0)
+
     if format==b'fasta':
         if seqtype == b'nuc':
             return (fastaNucIterator(lineiterator,
-                                     skip,only,
-                                     first),
+                                     skip=skip,only=only,
+                                     firstline=first,
+                                     buffersize=buffersize),
                     Nuc_Seq)
         else:
             raise NotImplementedError()
     elif format==b'fastq':
         return (fastqIterator(lineiterator,
-                              skip,only,
-                              qualityoffset,
-                              first),
+                              skip=skip,only=only,
+                              qualityoffset=qualityoffset,
+                              noquality=noquality,
+                              firstline=first,
+                              buffersize=buffersize),
                 Nuc_Seq)
 
     raise NotImplementedError('File format not yet implemented')
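
The "temporary fix" consumes the first line to sniff the format, then rewinds the stream and passes firstline=None so the parser re-reads from the start. A sketch of the pattern with a hypothetical sniffing rule; the caveat, and presumably why it is marked temporary, is that seek(0) only works on seekable inputs, not on pipes or generators:

    import io

    def entry_iterator(f):
        first = f.readline()                     # peek at the first line
        if first.startswith(">"):
            fmt = "fasta"
        elif first.startswith("@"):
            fmt = "fastq"
        else:
            fmt = "unknown"
        f.seek(0)                                # rewind: the parser re-reads line 1
        return fmt, iter(f)

    fmt, it = entry_iterator(io.StringIO(">seq1\nAACC\n"))
    print(fmt, next(it).strip())                 # fasta >seq1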

View File

@@ -364,19 +364,22 @@ def open_uri(uri,
             if qualifiers[b"seqtype"]==b"nuc":
                 objclass = Nuc_Seq
                 if format==b"fasta":
-                    iseq = fastaNucIterator(file,skip,only)
+                    iseq = fastaNucIterator(file,
+                                            skip=skip,
+                                            only=only)
                 elif format==b"fastq":
                     iseq = fastqIterator(file,
-                                         skip,only,
-                                         offset,
-                                         noquality)
+                                         skip=skip,
+                                         only=only,
+                                         offset=offset,
+                                         noquality=noquality)
                 else:
                     raise NotImplementedError('Sequence file format not implemented')
             elif qualifiers[b"seqtype"]==b"prot":
                 raise NotImplementedError()
         else:
             iseq,objclass = entryIteratorFactory(file,
-                                                 skip,only,
+                                                 skip, only,
                                                  seqtype,
                                                  offset,
                                                  noquality,
@@ -388,13 +391,12 @@ def open_uri(uri,
                                                  stripwhite,
                                                  blanklineskip,
                                                  commentchar)
 
-    tmpdms = get_temp_dms()
+    #tmpdms = get_temp_dms()
 
-    return (file,iseq,objclass,urib)
+    return (file, iseq, objclass, urib)

View File

@@ -136,7 +136,7 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
 /** ***************************************************************************
  * Check if the given element is in the bloom filter. Remember this may
- * return false positive if a collision occured.
+ * return false positive if a collision occurred.
  *
  * Parameters:
  * -----------
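
For context on why a collision only costs time, not correctness: a Bloom filter never returns a false negative, so a "not present" answer lets the caller skip the AVL lookup entirely, while a "maybe present" answer still has to be verified against the tree. A toy Python filter illustrating the contract (hypothetical hashing scheme):

    class BloomSketch:
        def __init__(self, m=1024, k=3):
            self.m, self.k, self.bits = m, k, 0

        def _positions(self, value):
            return [hash((i, value)) % self.m for i in range(self.k)]

        def add(self, value):
            for h in self._positions(value):
                self.bits |= 1 << h

        def __contains__(self, value):
            # True may be a collision (false positive); False is definitive
            return all(self.bits >> h & 1 for h in self._positions(value))

    bf = BloomSketch()
    bf.add(b"ACGT")
    print(b"ACGT" in bf)   # True -> still verify in the AVL
    print(b"TTTT" in bf)   # almost certainly False -> skip the AVL lookup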

View File

@@ -2463,7 +2463,7 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Check if the AVL group is writable
 	if (!(avl_group->writable))
 	{
-		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);
+		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);	// Note: this error is read by the calling functions to clone the AVL group if needed
 		return -1;
 	}
@@ -2476,6 +2476,9 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Add in the current AVL
 	index_in_avl = (int32_t) obi_avl_add((avl_group->sub_avls)[avl_group->last_avl_idx], value);
+	if (index_in_avl < 0)
+		return -1;
+
 	bloom_add(&((((avl_group->sub_avls)[avl_group->last_avl_idx])->header)->bloom_filter), value, obi_blob_sizeof(value));
 
 	// Build the index containing the AVL index

View File

@@ -32,7 +32,7 @@ Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_
 	Obi_blob_p blob;
 
 	// Allocate the memory for the blob structure
-	blob = (Obi_blob_p) malloc(sizeof(Obi_blob_t) + length_encoded_value);
+	blob = (Obi_blob_p) calloc(sizeof(Obi_blob_t) + length_encoded_value, sizeof(byte_t));
 	if (blob == NULL)
 	{
 		obi_set_errno(OBI_MALLOC_ERROR);

View File

@@ -240,7 +240,6 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	char* directory_name;
 	DIR*  dms_dir;
 	int   dms_file_descriptor;
-	size_t i, j;
 
 	// Build and check the directory name
 	directory_name = build_directory_name(dms_path);
@@ -318,7 +317,7 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	*/
 
 	// Create the informations file
-	if (create_dms_infos_file(dms_file_descriptor, basename(dms_path)) < 0)
+	if (create_dms_infos_file(dms_file_descriptor, basename((char*)dms_path)) < 0)
 		return NULL;
 
 	return obi_open_dms(dms_path);
@@ -333,7 +332,6 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 	int   infos_file_descriptor;
 	bool  little_endian_dms;
 	bool  little_endian_platform;
-	size_t i, j;
 
 	dms = NULL;
@@ -356,7 +354,7 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 		i++;
 	}
 	*/
-	strcpy(dms->dms_name, basename(dms_path));
+	strcpy(dms->dms_name, basename((char*)dms_path));
 
 	// Build and check the directory name including the relative path
 	complete_dms_path = build_directory_name(dms_path);

View File

@@ -703,7 +703,7 @@ static int get_formatted_elt_names_length(const char* elements_names)
 static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line)
 {
-	return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line);
+	return getpagesize() / obi_sizeof(data_type) / nb_elements_per_line;
 }
@@ -919,6 +919,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	// The initial line count should be between the minimum (corresponding to the page size) and the maximum allowed
 	minimum_line_count = get_line_count_per_page(stored_data_type, nb_elements_per_line);
+	if (minimum_line_count == 0)	// Happens if high number of elements per line
+		minimum_line_count = 1;
 	if (nb_lines > MAXIMUM_LINE_COUNT)
 	{
 		obidebug(1, "\nCan't create column because of line count greater than the maximum allowed (%d)", MAXIMUM_LINE_COUNT);
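
Why the clamp is needed: get_line_count_per_page floors to 0 as soon as one line is larger than a page, and a zero minimum would size the column mapping to nothing. The reordered division in the earlier hunk computes the same floor for positive integers but avoids forming the intermediate product obi_sizeof(data_type) * nb_elements_per_line (a plausible overflow concern, though the commit does not say so). Sketch with hypothetical sizes:

    PAGE_SIZE = 4096   # stand-in for getpagesize()

    def line_count_per_page(elt_size, nb_elements_per_line):
        count = PAGE_SIZE // elt_size // nb_elements_per_line
        return max(count, 1)       # clamp: one line may span several pages

    print(line_count_per_page(4, 8))       # 128 lines fit in a page
    print(line_count_per_page(4, 10000))   # floors to 0 -> clamped to 1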
@@ -1023,7 +1025,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	if (new_column->data == MAP_FAILED)
 	{
 		obi_set_errno(OBICOL_UNKNOWN_ERROR);
-		obidebug(1, "\nError mmapping the data of a column");
+		obidebug(1, "\nError mmapping the data of a column.\nArguments: data_size=%lu, column_file_descriptor=%d, header_size=%lu",
+				 data_size, column_file_descriptor, header_size);
 		munmap(new_column->header, header_size);
 		close(column_file_descriptor);
 		free(new_column);