Merge branch 'master' of git@git.metabarcoding.org:obitools/obitools3.git

Conflicts:
	python/obitools3/commands/import.pyx
2017-09-05 08:59:45 +02:00
15 changed files with 450 additions and 433 deletions

View File

@@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.utils cimport tobytes, \
get_obitype, \
@@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
@@ -50,6 +49,8 @@ def addOptions(parser):
def run(config):
cdef tuple input
cdef tuple output
cdef int i
cdef type value_type
cdef obitype_t value_obitype
@@ -62,7 +63,6 @@ def run(config):
cdef View view
cdef object iseq
cdef object seq
cdef object inputs
cdef Column id_col
cdef Column def_col
cdef Column seq_col
@@ -71,7 +71,7 @@ def run(config):
cdef bint rewrite
cdef dict dcols
cdef int skipping
cdef str tag
cdef bytes tag
cdef object value
cdef list elt_names
cdef int old_nb_elements_per_line
@@ -84,163 +84,157 @@ def run(config):
logger("info","obi import : imports file into an DMS")
inputs = open_uri(config['obi']['inputURI'])
input = open_uri(config['obi']['inputURI'])
if inputs[2]==Nuc_Seq:
if input[2]==Nuc_Seq:
v = View_NUC_SEQS
else:
v= View
v = View
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=v)
print(input)
print(output)
#print(input)
#print(output)
sys.exit()
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
#
# inputs = uopen(config['import']['filename'])
#
# # Create or open DMS
# d = DMS.open_or_new(config['obi']['defaultdms'])
#
# get_quality = False
# NUC_SEQS_view = False
# if config['import']['seqinformat']=='fasta':
# get_quality = False
# NUC_SEQS_view = True
# iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# elif config['import']['seqinformat']=='fastq':
# get_quality = True
# NUC_SEQS_view = True
# iseq = fastqIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# else:
# raise RuntimeError('File format not handled')
#
# # Save basic columns in variables for optimization
# if NUC_SEQS_view :
# id_col = view["ID"]
# def_col = view["DEFINITION"]
# seq_col = view["NUC_SEQ"]
# if get_quality :
# qual_col = view["QUALITY"]
#
# dcols = {}
#
# i = 0
# for seq in iseq :
# if i == config['import']['only'] :
# break
# else :
# pb(i)
# if NUC_SEQS_view :
# id_col[i] = seq['id']
# def_col[i] = seq['definition']
# seq_col[i] = seq['sequence']
# if get_quality :
# qual_col[i] = seq['quality']
#
# for tag in seq['tags'] :
#
# value = seq['tags'][tag]
#
# # Check NA value
# if value == config['import']['NA'] :
# value = None
#
# if tag not in dcols :
#
# value_type = type(value)
# nb_elts = 1
# value_obitype = OBI_VOID
#
# if value_type == dict or value_type == list :
# nb_elts = len(value)
# elt_names = list(value)
# else :
# nb_elts = 1
# elt_names = None
#
# value_obitype = get_obitype(value)
#
# if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
#
# # Fill value
# dcols[tag][0][i] = value
#
# # TODO else log error?
#
# else :
#
# rewrite = False
#
# # Check type adequation
# old_type = dcols[tag][1]
# new_type = OBI_VOID
# new_type = update_obitype(old_type, value)
# if old_type != new_type :
# rewrite = True
#
# try:
# # Fill value
# dcols[tag][0][i] = value
#
# except IndexError :
#
# value_type = type(value)
# old_column = dcols[tag][0]
# old_nb_elements_per_line = old_column.nb_elements_per_line
# new_nb_elements_per_line = 0
# old_elements_names = old_column.elements_names
# new_elements_names = None
#
# #####################################################################
#
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys
# for k in value :
# if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# rewrite = True
# break
#
# elif value_type == list or value_type == tuple : # Check vector length
# if old_nb_elements_per_line < len(value) :
# new_nb_elements_per_line = len(value)
# rewrite = True
#
# #####################################################################
#
# if rewrite :
# if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(new_elements_names)
#
# # Reset obierrno
# obi_errno = 0
#
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# new_data_type=new_type,
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# value_obitype)
#
# # Update the dictionary:
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1])
#
# # Fill value
# dcols[tag][0][i] = value
#
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()
iseq = input[1]
get_quality = False
NUC_SEQS_view = False
if isinstance(output[1], View) :
view = output[1]
if output[2] == View_NUC_SEQS :
NUC_SEQS_view = True
if "QUALITY" in view : # TODO
get_quality = True
else:
raise NotImplementedError()
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view[b"ID"]
def_col = view[b"DEFINITION"]
seq_col = view[b"NUC_SEQ"]
if get_quality :
qual_col = view[b"QUALITY"]
dcols = {}
i = 0
for seq in iseq :
pb(i)
if NUC_SEQS_view :
id_col[i] = seq.id
def_col[i] = seq.definition
seq_col[i] = seq.seq
if get_quality :
qual_col[i] = seq.quality
for tag in seq :
if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
value = seq[tag]
# Check NA value
if value == config['obi']['nastring'] :
value = None
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Fill value
dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names),
value_obitype)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
i+=1
print("\n")
print(view.__repr__())
input[0].close() # TODO
output[0].close()
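The import loop above infers an OBIType for each tag with get_obitype(), widens it with update_obitype() when later values disagree, and rewrites the column when a value no longer fits its type or width. A minimal sketch of that widening pattern in plain Python (widen, store and columns are illustrative names, not the obitools3 API):

def widen(old_type, value):
    # stand-in for update_obitype(): int widens to float, anything else to str
    new_type = type(value)
    if old_type is new_type:
        return old_type
    if {old_type, new_type} == {int, float}:
        return float
    return str

columns = {}  # tag -> (list of stored values, current type)

def store(tag, i, value):
    if tag not in columns:
        columns[tag] = ([], type(value))
    values, old_type = columns[tag]
    new_type = widen(old_type, value)
    if new_type is not old_type:
        # "rewrite the column": convert already-stored values to the wider type
        values = [None if v is None else new_type(v) for v in values]
    while len(values) <= i:
        values.append(None)
    values[i] = None if value is None else new_type(value)
    columns[tag] = (values, new_type)

store("count", 0, 3)
store("count", 1, 2.5)    # widens the whole column from int to float
print(columns["count"])   # ([3.0, 2.5], <class 'float'>)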

View File

@@ -0,0 +1,8 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)

View File

@@ -1,58 +1,28 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view import View # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.column.column cimport Column
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT
from obitools3.utils cimport tostr
from obitools3.dms.dms cimport DMS
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.view cimport View, Line
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
# TODO silence non-implemented options
__title__="Groups records together"
default_config = { 'inputview' : None,
'outputview' : None
}
def addOptions(parser):
# TODO put this common group somewhere else but I don't know where
group=parser.add_argument_group('DMS and view options')
group.add_argument('--default-dms','-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data.")
group.add_argument('--input-view','-i',
action="store", dest="obi:inputview",
metavar='<INPUT VIEW NAME>',
default=None,
type=str,
help="Name of the input view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group.add_argument('--output-view','-o',
action="store", dest="obi:outputview",
metavar='<OUTPUT VIEW NAME>',
default=None,
type=str,
help="Name of the output view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group.add_argument('--taxo','-t',
action="store", dest="obi:taxo",
metavar='<TAXONOMY NAME>',
default='', # TODO not None because if it's None, the option is not entered in the option dictionary.
type=str,
help="Name of the taxonomy to use.")
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
group = parser.add_argument_group('obi uniq specific options')
@@ -89,151 +59,197 @@ def addOptions(parser):
# TODO taxonomy
# TODO
COUNT_COLUMN_str = tostr(COUNT_COLUMN)
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
cdef int i
cdef int o_idx
cdef int u_idx
cdef int u_id
cdef int i_count
cdef set mergedKeys
cdef dict uniques
cdef dict merged_infos
cdef object iter_view
cdef Line i_seq
cdef Line o_seq
cdef str key
cdef bytes key_b
cdef str mkey
cdef str merged_col_name
cdef Column i_col
cdef Column seq_col
cdef object to_merge
cdef Column_line mcol
cdef Column_line i_mcol
uniques = {}
if categories is None:
categories=[]
if mergedKey is not None:
mergedKey=set(mergedKey)
if mergedKeys_list is not None:
mergedKeys=set(mergedKeys_list)
else:
mergedKey=set()
mergedKeys=set()
if taxonomy is not None:
mergedKey.add('taxid')
# if taxonomy is not None:
# mergedKeys.add('taxid')
# First walk through the view's columns to be merged, so the merged_col columns can be created beforehand and filled as we go
o_idx = 0
# Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and the right element names
#logger("info", "obi uniq", "First browsing through the input")
merged_infos = {}
i = 0
iter_view = iter(view)
for i_seq in iter_view:
pb(i)
for key in mergedKeys:
mkey = "merged_%s" % key
if key in i_seq: # TODO what if mkey already in i_seq?
if mkey not in merged_infos:
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = 1
mkey_infos['elt_names'] = [i_seq[key]]
else:
mkey_infos = merged_infos[mkey]
if i_seq[key] not in mkey_infos['elt_names']: # TODO make faster? but how?
mkey_infos['elt_names'].append(i_seq[key])
mkey_infos['nb_elts'] += 1
i+=1
for key in mergedKeys:
merged_col_name = "merged_%s" % key
i_col = view[key]
Column.new_column(o_view,
merged_col_name,
OBI_INT,
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=merged_infos[merged_col_name]['elt_names'],
comments=i_col.comments,
alias=merged_col_name # TODO what if it already exists
)
del(merged_infos)
#logger("info", "obi uniq", "Second browsing through the input")
i = 0
o_idx = 0
seq_col = view[NUC_SEQUENCE_COLUMN]
iter_view = iter(view)
for i_seq in iter_view :
pass
iter_view = iter(view)
for i_seq in iter_view :
pb(i)
# use the AVL index, build the API for it
#u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
u_id = seq_col.get_line_idx(i)
if u_id in uniques:
u_seq = uniques[u_id]
o_seq = o_view[u_seq['idx']]
if COUNT_COLUMN_str in i_seq:
o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
i_count = 1
else:
o_seq[COUNT_COLUMN_str] += 1
# seq['COUNT']=1
i_count = i_seq[COUNT_COLUMN]
u_idx = uniques[u_id]
o_seq = o_view[u_idx]
o_seq[COUNT_COLUMN] += i_count
# if taxonomy is not None and 'taxid' in seq:
# s['merged_taxid'][seq['taxid']]=
for key in mergedKey:
if key=='taxid' and mergeIds: # TODO
if 'taxid_dist' in i_seq:
u_seq["taxid_dist"].update(i_seq["taxid_dist"])
if 'taxid' in i_seq:
u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
for key in mergedKeys:
# if key=='taxid' and mergeIds: # TODO
# if 'taxid_dist' in i_seq:
# u_seq["taxid_dist"].update(i_seq["taxid_dist"])
# if 'taxid' in i_seq:
# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
mkey = "merged_%s" % key
# case where the merged keys are updated but the incoming sequence has no merged keys
if key in i_seq:
u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
to_merge = i_seq[key]
mcol = o_seq[mkey]
if mcol[to_merge] is None:
mcol[to_merge] = i_count
else:
mcol[to_merge] = mcol[to_merge] + i_count
# case where merged_keys already exists
else:
else: # TODO is this a good else
if mkey in i_seq:
for skey in i_seq[mkey]:
u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
mcol = o_seq[mkey]
i_mcol = i_seq[mkey]
for key_b in i_mcol:
if mcol[key_b] is None:
mcol[key_b] = i_mcol[key_b]
else:
mcol[key_b] = mcol[key_b] + i_mcol[key_b]
for key in i_seq.keys():
for key_b in i_seq.keys():
# Properly merge the 'merged' attribute if it exists
if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this
o_seq[key] = None
# TODO make special columns list?
if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
o_seq[key_b] = None
if mergeIds:
u_seq['merged'].append(i_seq.id)
# if mergeIds: # TODO
# u_seq['merged'].append(i_seq.id)
else:
o_view[o_idx] = i_seq
o_seq = o_view[o_idx]
uniques[u_id] = {'idx':o_idx}
u_seq = uniques[u_id]
uniques[u_id] = o_idx
o_idx += 1
if COUNT_COLUMN_str not in o_seq:
o_seq[COUNT_COLUMN_str] = 1
if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
o_seq[COUNT_COLUMN] = 1
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
u_seq["taxid_dist"] = {}
else :
u_seq["taxid_dist"] = o_seq["taxid_dist"]
if 'taxid' in o_seq:
u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
for key in mergedKeys:
# if key=='taxid' and mergeIds:
# if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
# u_seq["taxid_dist"] = {}
# else :
# u_seq["taxid_dist"] = o_seq["taxid_dist"]
# if 'taxid' in o_seq:
# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
mkey = "merged_%s" % key
if mkey not in o_seq:
u_seq[mkey]={}
else :
u_seq[mkey] = o_seq[mkey]
if key in o_seq:
u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
o_seq[key] = None
if mergeIds:
u_seq['merged']=[o_seq.id]
to_merge = o_seq[key]
mcol = o_seq[mkey]
if to_merge in mcol and mcol[to_merge] is not None:
mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
else:
mcol[to_merge] = o_seq[COUNT_COLUMN]
o_seq[key] = None # TODO delete column eventually -> make C function?
# if mergeIds:
# u_seq['merged']=[o_seq.id]
i+=1
#TODO
#if taxonomy is not None:
# mergeTaxonomyClassification(uniqSeq, taxonomy)
# Get informations to build the columns with merged attributes
merged_infos = {}
for u_id in uniques :
u_seq = uniques[u_id]
for mkey in u_seq :
if mkey != 'idx' :
mkey_dict = u_seq[mkey]
if mkey not in merged_infos :
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = len(mkey_dict.keys())
mkey_infos['elt_names'] = [k for k in mkey_dict]
else :
mkey_infos = merged_infos[mkey]
for k in mkey_dict :
if k not in mkey_infos['elt_names'] :
mkey_infos['elt_names'].append(k)
mkey_infos['nb_elts'] += 1
keys_to_del = []
for k in merged_infos :
if merged_infos[k]['nb_elts'] == 0:
keys_to_del.append(k)
for k in keys_to_del :
del merged_infos[k]
return (uniques, merged_infos)
def run(config):
# TODO declare variables
cdef tuple input
cdef tuple output
cdef View_NUC_SEQS entries
cdef View_NUC_SEQS o_view
cdef ProgressBar pb
# Open DMS
d = DMS.open(config['obi']['defaultdms'])
logger("info","obi uniq")
# Open input view
entries = View.open(d, config['obi']['inputview'])
input = open_uri(config['obi']['inputURI'])
if input[2] != View_NUC_SEQS:
raise NotImplementedError('obi uniq only works on NUC_SEQS views')
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=View_NUC_SEQS)
entries = input[1]
o_view = output[1]
# Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5)
@@ -244,54 +260,19 @@ def run(config):
# else:
usm = uniqSequence
# Create output view
view_class = View.get_view_class(entries.type)
if view_class == View_NUC_SEQS :
get_quality = tostr(QUALITY_COLUMN) in entries # TODO
o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
else :
o_view = view_class.new(d, config['obi']['outputview'])
usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
(uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
if 'merge' in config['uniq'] :
merged_keys=set(config['uniq']['merge'])
else:
merged_keys=set()
if 'taxo' in config['obi'] :
merged_keys.add('taxid')
# TODO gotta handle special merged columns
for k in merged_keys:
merged_col_name = "merged_%s" % k
if merged_col_name in merged_infos :
i_col = entries[k]
Column.new_column(o_view,
merged_col_name,
OBI_INT,
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=merged_infos[merged_col_name]['elt_names'],
comments=i_col.comments,
alias=merged_col_name
)
for u_id in uniques:
u_dict = uniques[u_id]
for merged_k in u_dict :
if merged_k in merged_infos : # TODO don't enter irrelevant keys to begin with, instead
o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
# if 'merge' in config['uniq'] :
# merged_keys=set(config['uniq']['merge'])
# else:
# merged_keys=set()
#
# if 'taxo' in config['obi'] :
# merged_keys.add('taxid')
print("\n")
print(repr(o_view))
d.close()
input[0].close()
output[0].close()
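uniqSequence() above works in two passes over the input view: a first pass sizes the merged_* columns (number of elements per line and their names), a second pass groups records by sequence, sums COUNT, and accumulates one {value: count} distribution per merged key. The bookkeeping reduces to the following compact sketch with plain dicts (the record layout is hypothetical):

def uniq(records, merged_keys=("sample",)):
    uniques = {}  # sequence -> merged output record
    for rec in records:
        count = rec.get("count", 1)
        out = uniques.setdefault(rec["seq"], {"seq": rec["seq"], "count": 0})
        out["count"] += count
        for key in merged_keys:
            dist = out.setdefault("merged_%s" % key, {})
            if key in rec:
                dist[rec[key]] = dist.get(rec[key], 0) + count
    return list(uniques.values())

recs = [{"seq": "acgt", "sample": "A"},
        {"seq": "acgt", "sample": "B", "count": 3},
        {"seq": "tttt", "sample": "A"}]
for r in uniq(recs):
    print(r)
# {'seq': 'acgt', 'count': 4, 'merged_sample': {'A': 1, 'B': 3}}
# {'seq': 'tttt', 'count': 1, 'merged_sample': {'A': 1}}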

View File

@@ -371,8 +371,18 @@ cdef class Column_line :
self._column.set_item(self._index, elt_id, value)
def __contains__(self, object element_name):
return (tobytes(element_name) in self._column.elements_names)
def get(self, object elt_id, object default=None): # TODO returns default if None???
if elt_id in self:
return self._column.get_item(self._index, elt_id)
else:
return default
def __contains__(self, object elt_id):
if type(elt_id) == int:
return elt_id < self._column.nb_elements_per_line
else:
return (tobytes(elt_id) in self._column.elements_names)
def __repr__(self) :
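With the get() method and the int-or-name __contains__ added above, Column_line behaves like a small mapping over one line's elements. A runnable stand-in showing the same protocol (LineSketch is hypothetical, not the real class):

class LineSketch:
    def __init__(self, names, values):
        self._names = names      # the column's element names
        self._values = values    # this line's values
    def __contains__(self, elt_id):
        if isinstance(elt_id, int):
            return elt_id < len(self._values)
        return elt_id in self._names
    def __getitem__(self, elt_id):
        if isinstance(elt_id, int):
            return self._values[elt_id]
        return self._values[self._names.index(elt_id)]
    def get(self, elt_id, default=None):
        return self[elt_id] if elt_id in self else default

line = LineSketch([b"sample", b"count"], [b"A", 4])
print(b"sample" in line, 1 in line)  # True True
print(line.get(b"taxid", -1))        # -1: missing element falls back to the default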

View File

@@ -21,7 +21,7 @@ cdef class Seq(dict) :
if tags is not None :
for k in tags:
k_b = tobytes(k)
self[k_b] = tags[k_b]
self[k_b] = tags[k]
def __contains__(self, object key):
@@ -70,11 +70,10 @@ cdef class Nuc_Seq(Seq) :
# nuc sequence property getter and setter
@property
def seq(self):
return self._seq
return self[NUC_SEQUENCE_COLUMN]
@seq.setter
def seq(self, object new_seq): # @DuplicatedSignature
self._seq = new_seq
self[NUC_SEQUENCE_COLUMN] = tobytes(new_seq)
# sequence quality property getter and setter
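Because Seq subclasses dict, the property above now stores the sequence in the mapping itself, under the NUC_SEQUENCE_COLUMN key, instead of in a private _seq attribute, so it travels with the other tags. A minimal sketch of the pattern (NucSeqSketch is hypothetical):

NUC_SEQUENCE_COLUMN = b"NUC_SEQ"

class NucSeqSketch(dict):
    @property
    def seq(self):
        return self[NUC_SEQUENCE_COLUMN]
    @seq.setter
    def seq(self, new_seq):
        # the real code goes through tobytes(); a plain encode stands in here
        self[NUC_SEQUENCE_COLUMN] = new_seq.encode() if isinstance(new_seq, str) else new_seq

s = NucSeqSketch()
s.seq = "acgt"
print(s)  # {b'NUC_SEQ': b'acgt'}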

View File

@@ -58,7 +58,8 @@ cdef class View_NUC_SEQS(View):
return view
# TODO
# TODO test time gain without
@OBIWrapper.checkIsActive
def __getitem__(self, object item) :
if type(item) == int :
return Nuc_Seq_Stored(self, item)
@@ -66,6 +67,18 @@ cdef class View_NUC_SEQS(View):
return self.get_column(item) # TODO very slow in practice
@OBIWrapper.checkIsActive
def __iter__(self):
# Iteration on each line of all columns
# Declarations
cdef index_t line_nb
# Yield each line
for line_nb in range(self.line_count) :
yield Nuc_Seq_Stored(self, line_nb)
# TODO? test if efficiency gain
# def __setitem__(self, index_t line_idx, Nuc_Seq sequence_obj) :
# for key in sequence_obj :
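The new __iter__ above is a generator that yields one Nuc_Seq_Stored wrapper per line index instead of materializing the view. The same pattern in plain Python (ViewSketch stands in for the real view class):

class ViewSketch:
    def __init__(self, line_count):
        self.line_count = line_count
    def __getitem__(self, idx):
        return ("line", idx)  # stands in for Nuc_Seq_Stored(self, idx)
    def __iter__(self):
        # one wrapper per line, produced lazily
        for line_nb in range(self.line_count):
            yield self[line_nb]

for rec in ViewSketch(3):
    print(rec)  # ('line', 0) ('line', 1) ('line', 2)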

View File

@@ -6,7 +6,9 @@ Created on 30 mars 2016
@author: coissac
'''
from obitools3.dms.obiseq import Nuc_Seq
import types
from obitools3.dms.obiseq cimport Nuc_Seq
def fastaIterator(lineiterator,
@@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
while True:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
while skipped < skip :
@@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
# definition,
# tags=tags,
# )
# TODO
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
@@ -105,64 +107,64 @@ def fastaNucIterator(lineiterator,
cdef list s
cdef bytes sequence
cdef int lines_to_skip, ionly, read
# cdef OBI_Seq seq
cdef Nuc_Seq seq
if only is None:
ionly=-1
ionly = -1
else:
ionly=int(only)
ionly = int(only)
if isinstance(lineiterator,(str,bytes)):
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, types.GeneratorType):
iterator = lineiterator
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
iterator = iter(lineiterator)
else:
lb=LineBuffer(lineiterator,buffersize)
iterator = iter(LineBuffer(lineiterator, buffersize))
skipped = 0
read = 0
i = iter(lb)
if firstline is None:
line = next(i)
line = next(iterator)
else:
line = firstline
while True:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
while skipped < skip :
line = next(i)
line = next(iterator)
try:
while line[0]!='>':
line = next(i)
line = next(iterator)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line)
s = []
line = next(i)
line = next(iterator)
try:
while line[0]!='>':
s.append(str2bytes(line)[0:-1])
line = next(i)
line = next(iterator)
except StopIteration:
pass
sequence = b"".join(s)
# seq =
seq = Nuc_Seq(ident,
sequence,
definition,
None,-1,
tags)
definition=definition,
quality=None,
offset=-1,
tags=tags)
yield seq
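Several hunks in this commit replace "read >= ionly" with "ionly >= 0 and read >= ionly". The reason is that only=None is mapped to the sentinel ionly = -1, and without the sign check the old guard was immediately true, so an unlimited iterator stopped before yielding anything. A reduced, runnable version of the corrected guard:

def take(iterable, only=None):
    ionly = -1 if only is None else int(only)  # -1 means "no limit"
    read = 0
    for item in iterable:
        if ionly >= 0 and read >= ionly:
            break
        yield item
        read += 1

print(list(take("abcde")))     # ['a', 'b', 'c', 'd', 'e']
print(list(take("abcde", 2)))  # ['a', 'b']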

View File

@@ -6,7 +6,7 @@ Created on 30 mars 2016
@author: coissac
'''
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.obiseq cimport Nuc_Seq
def fastqIterator(lineiterator,
@@ -74,10 +74,9 @@ def fastqWithQualityIterator(lineiterator,
else:
hline = firstline
for line in i:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
@@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
seq = Nuc_Seq(ident,
sequence,
definition,
quality,qualityoffset,
tags)
definition=definition,
quality=quality,
offset=qualityoffset,
tags=tags)
yield seq
@@ -152,7 +152,7 @@ def fastqWithoutQualityIterator(lineiterator,
for line in i:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
@@ -162,9 +162,10 @@ def fastqWithoutQualityIterator(lineiterator,
seq = Nuc_Seq(ident,
sequence,
definition,
None,-1,
tags)
definition=definition,
quality=None,
offset=-1,
tags=tags)
yield seq

View File

@@ -41,7 +41,7 @@ def entryIteratorFactory(lineiterator,
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
else:
lb=LineBuffer(lineiterator,buffersize)
lb=LineBuffer(lineiterator, buffersize)
i = iter(lb)
@@ -62,25 +62,28 @@ def entryIteratorFactory(lineiterator,
elif is_ngsfilter_line(first):
format=b"ngsfilter"
# TODO Temporary fix
first=None
lineiterator.seek(0)
if format==b'fasta':
if seqtype == b'nuc':
return (fastaNucIterator(lineiterator,
skip,only,
first),
skip=skip,only=only,
firstline=first,
buffersize=buffersize),
Nuc_Seq)
else:
raise NotImplementedError()
elif format==b'fastq':
return (fastqIterator(lineiterator,
skip,only,
qualityoffset,
first),
skip=skip,only=only,
qualityoffset=qualityoffset,
noquality=noquality,
firstline=first,
buffersize=buffersize),
Nuc_Seq)
raise NotImplementedError('File format not yet implemented')
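entryIteratorFactory() peeks at the first line to pick a parser, and the temporary fix above rewinds the stream with seek(0) rather than threading the consumed first line through every parser. A small sketch of that peek-and-dispatch idea (detect_format is illustrative and much cruder than is_ngsfilter_line and friends):

import io

def detect_format(first_line):
    if first_line.startswith(b">"):
        return b"fasta"
    if first_line.startswith(b"@"):
        return b"fastq"
    return None  # unknown: caller rewinds and tries something else

stream = io.BytesIO(b">seq1\nacgt\n")
first = stream.readline()
fmt = detect_format(first)
if fmt is None:
    stream.seek(0)  # the "temporary fix": let the next reader start over
    first = None
print(fmt)  # b'fasta'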

View File

@@ -364,19 +364,22 @@ def open_uri(uri,
if qualifiers[b"seqtype"]==b"nuc":
objclass = Nuc_Seq
if format==b"fasta":
iseq = fastaNucIterator(file,skip,only)
iseq = fastaNucIterator(file,
skip=skip,
only=only)
elif format==b"fastq":
iseq = fastqIterator(file,
skip,only,
offset,
noquality)
skip=skip,
only=only,
offset=offset,
noquality=noquality)
else:
raise NotImplementedError('Sequence file format not implemented')
elif qualifiers[b"seqtype"]==b"prot":
raise NotImplementedError()
else:
iseq,objclass = entryIteratorFactory(file,
skip,only,
skip, only,
seqtype,
offset,
noquality,
@@ -389,10 +392,9 @@ def open_uri(uri,
blanklineskip,
commentchar)
#tmpdms = get_temp_dms()
tmpdms = get_temp_dms()
return (file,iseq,objclass,urib)
return (file, iseq, objclass, urib)
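Callers such as the import and uniq commands above unpack this return value positionally as (file, iseq, objclass, urib) and close the first element when done. A runnable stand-in of that calling convention (open_uri_sketch is mine, not the real function):

import io

def open_uri_sketch(uri):
    stream = io.BytesIO(b">s1\nacgt\n")  # pretend `uri` was opened
    records = iter([{"id": b"s1", "sequence": b"acgt"}])
    return (stream, records, dict, uri)  # (file, iseq, objclass, urib)

stream, records, record_class, parsed_uri = open_uri_sketch(b"myfile.fasta")
for rec in records:
    print(record_class is dict, rec["id"])  # True b's1'
stream.close()  # mirrors input[0].close() in the commands above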

View File

@@ -136,7 +136,7 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
/** ***************************************************************************
* Check if the given element is in the bloom filter. Remember this may
* return false positive if a collision occured.
* return false positive if a collision occurred.
*
* Parameters:
* -----------

View File

@@ -2463,7 +2463,7 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
// Check if the AVL group is writable
if (!(avl_group->writable))
{
obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);
obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR); // Note: this error is read by the calling functions to clone the AVL group if needed
return -1;
}
@@ -2476,6 +2476,9 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
// Add in the current AVL
index_in_avl = (int32_t) obi_avl_add((avl_group->sub_avls)[avl_group->last_avl_idx], value);
if (index_in_avl < 0)
return -1;
bloom_add(&((((avl_group->sub_avls)[avl_group->last_avl_idx])->header)->bloom_filter), value, obi_blob_sizeof(value));
// Build the index containing the AVL index
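The bloom_add() call above keeps each sub-AVL's Bloom filter in sync with its contents, so a lookup can skip any sub-AVL whose filter rules the value out, at the cost of rare false positives. A toy Python analogue of the add / might-contain pair (not the C implementation used here):

from hashlib import blake2b

class BloomSketch:
    def __init__(self, bits=1024, hashes=3):
        self.bits, self.hashes = bits, hashes
        self.array = bytearray(bits // 8)
    def _positions(self, value):
        # derive `hashes` bit positions from salted hashes of the value
        for seed in range(self.hashes):
            h = blake2b(value, salt=bytes([seed]) * 8).digest()
            yield int.from_bytes(h[:8], "big") % self.bits
    def add(self, value):
        for p in self._positions(value):
            self.array[p // 8] |= 1 << (p % 8)
    def might_contain(self, value):
        return all(self.array[p // 8] & (1 << (p % 8)) for p in self._positions(value))

bf = BloomSketch()
bf.add(b"acgt")
print(bf.might_contain(b"acgt"))  # True
print(bf.might_contain(b"tttt"))  # False with high probability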

View File

@@ -32,7 +32,7 @@ Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_
Obi_blob_p blob;
// Allocate the memory for the blob structure
blob = (Obi_blob_p) malloc(sizeof(Obi_blob_t) + length_encoded_value);
blob = (Obi_blob_p) calloc(sizeof(Obi_blob_t) + length_encoded_value, sizeof(byte_t));
if (blob == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);

View File

@@ -240,7 +240,6 @@ OBIDMS_p obi_create_dms(const char* dms_path)
char* directory_name;
DIR* dms_dir;
int dms_file_descriptor;
size_t i, j;
// Build and check the directory name
directory_name = build_directory_name(dms_path);
@@ -318,7 +317,7 @@ OBIDMS_p obi_create_dms(const char* dms_path)
*/
// Create the information file
if (create_dms_infos_file(dms_file_descriptor, basename(dms_path)) < 0)
if (create_dms_infos_file(dms_file_descriptor, basename((char*)dms_path)) < 0)
return NULL;
return obi_open_dms(dms_path);
@@ -333,7 +332,6 @@ OBIDMS_p obi_open_dms(const char* dms_path)
int infos_file_descriptor;
bool little_endian_dms;
bool little_endian_platform;
size_t i, j;
dms = NULL;
@@ -356,7 +354,7 @@ OBIDMS_p obi_open_dms(const char* dms_path)
i++;
}
*/
strcpy(dms->dms_name, basename(dms_path));
strcpy(dms->dms_name, basename((char*)dms_path));
// Build and check the directory name including the relative path
complete_dms_path = build_directory_name(dms_path);

View File

@@ -703,7 +703,7 @@ static int get_formatted_elt_names_length(const char* elements_names)
static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line)
{
return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line);
return getpagesize() / obi_sizeof(data_type) / nb_elements_per_line;
}
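For positive integers the two expressions agree (floor(a / (b * c)) equals floor(floor(a / b) / c)), so dividing twice changes no results; it avoids computing the intermediate product obi_sizeof(data_type) * nb_elements_per_line, which could overflow for very wide lines, the same situation the new minimum_line_count fallback below catches when the quotient reaches 0. A quick numeric check (assuming a 4 KiB page and 4-byte elements):

page, elt = 4096, 4
for nb in (1, 100, 5000):
    old = page // (elt * nb)   # previous form
    new = page // elt // nb    # rewritten form
    print(nb, old, new)
# 1    1024 1024
# 100  10   10
# 5000 0    0   -> hence the new minimum_line_count fallback to 1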
@@ -919,6 +919,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
// The initial line count should be between the minimum (corresponding to the page size) and the maximum allowed
minimum_line_count = get_line_count_per_page(stored_data_type, nb_elements_per_line);
if (minimum_line_count == 0) // Happens if high number of elements per line
minimum_line_count = 1;
if (nb_lines > MAXIMUM_LINE_COUNT)
{
obidebug(1, "\nCan't create column because of line count greater than the maximum allowed (%d)", MAXIMUM_LINE_COUNT);
@@ -1023,7 +1025,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
if (new_column->data == MAP_FAILED)
{
obi_set_errno(OBICOL_UNKNOWN_ERROR);
obidebug(1, "\nError mmapping the data of a column");
obidebug(1, "\nError mmapping the data of a column.\nArguments: data_size=%lu, column_file_descriptor=%d, header_size=%lu",
data_size, column_file_descriptor, header_size);
munmap(new_column->header, header_size);
close(column_file_descriptor);
free(new_column);