obi import with new input/output API

2017-08-20 17:58:36 +02:00
parent 38029b1f77
commit 6a2759eee6
1 changed files with 149 additions and 157 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
 from obitools3.parsers.fastq import fastqIterator
-from obitools3.dms.dms import DMS       # TODO cimport doesn't work
 from obitools3.dms.view.view cimport View
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS    # TODO cimport doesn't work
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
-
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq

 from obitools3.utils cimport tobytes, \
                             get_obitype, \
@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
 from obitools3.dms.capi.obierrno cimport obi_errno

 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+
 from obitools3.uri.decode import open_uri

 from obitools3.apps.config import logger
@ -50,6 +49,8 @@ def addOptions(parser):

 def run(config):
    
+    cdef   tuple       input
+    cdef   tuple       output 
    cdef   int         i
    cdef   type        value_type
    cdef   obitype_t   value_obitype
@ -62,7 +63,6 @@ def run(config):
    cdef   View        view
    cdef   object      iseq
    cdef   object      seq
-    cdef   object      inputs
    cdef   Column      id_col
    cdef   Column      def_col
    cdef   Column      seq_col
@ -71,7 +71,7 @@ def run(config):
    cdef   bint        rewrite
    cdef   dict        dcols
    cdef   int         skipping
-    cdef   str         tag
+    cdef   bytes       tag
    cdef   object      value
    cdef   list        elt_names
    cdef   int         old_nb_elements_per_line
@ -84,165 +84,157 @@ def run(config):
    
    logger("info","obi import : imports file into an DMS")
    
-    inputs = open_uri(config['obi']['inputURI'])
+    input = open_uri(config['obi']['inputURI'])
    
-    if inputs[2]==Nuc_Seq:
+    if input[2]==Nuc_Seq:
        v = View_NUC_SEQS
    else:
-        v= View 
+        v = View 
        
-    print(v)
-    
    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      newviewtype=v)
    
-    print(input)
-    print(output)
+    #print(input)
+    #print(output)
        
-    sys.exit()
+    pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
    
-#     pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
-#        
-#     inputs = uopen(config['import']['filename'])
-# 
-#     # Create or open DMS
-#     d = DMS.open_or_new(config['obi']['defaultdms'])
-#     
-#     get_quality = False
-#     NUC_SEQS_view = False
-#     if config['import']['seqinformat']=='fasta':
-#         get_quality = False
-#         NUC_SEQS_view = True
-#         iseq = fastaIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     elif config['import']['seqinformat']=='fastq':
-#         get_quality = True
-#         NUC_SEQS_view = True
-#         iseq = fastqIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     else:
-#         raise RuntimeError('File format not handled')
-#         
-#     # Save basic columns in variables for optimization
-#     if NUC_SEQS_view :
-#         id_col = view["ID"]
-#         def_col = view["DEFINITION"]
-#         seq_col = view["NUC_SEQ"]
-#         if get_quality :
-#             qual_col = view["QUALITY"]
-#     
-#     dcols = {}
-#     
-#     i = 0
-#     for seq in iseq :
-#         if i == config['import']['only'] :
-#             break
-#         else :
-#             pb(i)
-#             if NUC_SEQS_view :
-#                 id_col[i] = seq['id']
-#                 def_col[i] = seq['definition']
-#                 seq_col[i] = seq['sequence']
-#                 if get_quality :
-#                     qual_col[i] = seq['quality']
-#             
-#             for tag in seq['tags'] :
-#                             
-#                 value = seq['tags'][tag]
-#                 
-#                 # Check NA value
-#                 if value == config['import']['NA'] :
-#                     value = None
-#                 
-#                 if tag not in dcols :
-#                     
-#                     value_type = type(value)
-#                     nb_elts = 1
-#                     value_obitype = OBI_VOID
-#                     
-#                     if value_type == dict or value_type == list :
-#                         nb_elts = len(value)
-#                         elt_names = list(value)
-#                     else :
-#                         nb_elts = 1
-#                         elt_names = None
-#                     
-#                     value_obitype = get_obitype(value)
-#                     
-#                     if value_obitype != OBI_VOID :
-#                         dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-#                                                 
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#                     
-#                     # TODO else log error?
-# 
-#                 else :
-#         
-#                     rewrite = False
-# 
-#                     # Check type adequation
-#                     old_type = dcols[tag][1]
-#                     new_type = OBI_VOID
-#                     new_type = update_obitype(old_type, value)
-#                     if old_type != new_type :
-#                         rewrite = True
-# 
-#                     try:
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#                     
-#                     except IndexError :
-#                                                 
-#                         value_type = type(value)
-#                         old_column = dcols[tag][0]
-#                         old_nb_elements_per_line = old_column.nb_elements_per_line
-#                         new_nb_elements_per_line = 0
-#                         old_elements_names = old_column.elements_names
-#                         new_elements_names = None
-#     
-#                         #####################################################################
-#                         
-#                         # Check the length and keys of column lines if needed
-#                         if value_type == dict :    # Check dictionary keys
-#                             for k in value :
-#                                 if k not in old_elements_names :
-#                                     new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
-#                                     rewrite = True
-#                                     break
-#                         
-#                         elif value_type == list or value_type == tuple :  # Check vector length
-#                             if old_nb_elements_per_line < len(value) :
-#                                 new_nb_elements_per_line = len(value)
-#                                 rewrite = True
-#                         
-#                         #####################################################################
-#                         
-#                         if rewrite :
-#                             if new_nb_elements_per_line == 0 and new_elements_names is not None :
-#                                 new_nb_elements_per_line = len(new_elements_names)
-#                             
-#                             # Reset obierrno 
-#                             obi_errno = 0
-# 
-#                             dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
-#                                                                                    new_data_type=new_type, 
-#                                                                                    new_nb_elements_per_line=new_nb_elements_per_line,
-#                                                                                    new_elements_names=new_elements_names), 
-#                                           value_obitype)
-#                             
-#                             # Update the dictionary:
-#                             for t in dcols :
-#                                 dcols[t] = (view[t], dcols[t][1])
-#                             
-#                             # Fill value
-#                             dcols[tag][0][i] = value
-#                                     
-#             i+=1
-# 
-#     print("\n")
-#     print(view.__repr__())
-#  
-#     d.close()
+    iseq = input[1]
+    
+    get_quality = False
+    NUC_SEQS_view = False
+    if isinstance(output[1], View) :
+        view = output[1]
+        if output[2] == View_NUC_SEQS :
+            NUC_SEQS_view = True
+            if "QUALITY" in view :      # TODO
+                get_quality = True
+    else: 
+        raise NotImplementedError()
+    
+    # Save basic columns in variables for optimization
+    if NUC_SEQS_view :
+        id_col = view[b"ID"]
+        def_col = view[b"DEFINITION"]
+        seq_col = view[b"NUC_SEQ"]
+        if get_quality :
+            qual_col = view[b"QUALITY"]
+     
+    dcols = {}
+     
+    i = 0
+    for seq in iseq :
+        
+        pb(i)
+        
+        if NUC_SEQS_view :
+            id_col[i] = seq.id
+            def_col[i] = seq.definition
+            seq_col[i] = seq.seq
+            
+            if get_quality :
+                qual_col[i] = seq.quality
+         
+        for tag in seq :
+            
+            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm... 
+                                
+                value = seq[tag]
+                 
+                # Check NA value
+                if value == config['obi']['nastring'] :
+                    value = None
+                 
+                if tag not in dcols :
+                     
+                    value_type = type(value)
+                    nb_elts = 1
+                    value_obitype = OBI_VOID
+                     
+                    if value_type == dict or value_type == list :
+                        nb_elts = len(value)
+                        elt_names = list(value)
+                    else :
+                        nb_elts = 1
+                        elt_names = None
+                     
+                    value_obitype = get_obitype(value)
+                     
+                    if value_obitype != OBI_VOID :
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+                                                 
+                        # Fill value
+                        dcols[tag][0][i] = value
+                     
+                    # TODO else log error?
+ 
+                else :
+         
+                    rewrite = False
+ 
+                    # Check type adequation
+                    old_type = dcols[tag][1]
+                    new_type = OBI_VOID
+                    new_type = update_obitype(old_type, value)
+                    if old_type != new_type :
+                        rewrite = True
+ 
+                    try:
+                        # Fill value
+                        dcols[tag][0][i] = value
+                     
+                    except IndexError :
+                                                 
+                        value_type = type(value)
+                        old_column = dcols[tag][0]
+                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        new_nb_elements_per_line = 0
+                        old_elements_names = old_column.elements_names
+                        new_elements_names = None
+     
+                        #####################################################################
+                         
+                        # Check the length and keys of column lines if needed
+                        if value_type == dict :    # Check dictionary keys
+                            for k in value :
+                                if k not in old_elements_names :
+                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+                                    rewrite = True
+                                    break
+                         
+                        elif value_type == list or value_type == tuple :  # Check vector length
+                            if old_nb_elements_per_line < len(value) :
+                                new_nb_elements_per_line = len(value)
+                                rewrite = True
+                         
+                        #####################################################################
+                         
+                        if rewrite :
+                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+                                new_nb_elements_per_line = len(new_elements_names)
+                             
+                            # Reset obierrno 
+                            obi_errno = 0
+ 
+                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
+                                                                                   new_data_type=new_type, 
+                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+                                                                                   new_elements_names=new_elements_names), 
+                                          value_obitype)
+                             
+                            # Update the dictionary:
+                            for t in dcols :
+                                dcols[t] = (view[t], dcols[t][1])
+                             
+                            # Fill value
+                            dcols[tag][0][i] = value
+                                    
+        i+=1
+ 
+    print("\n")
+    print(view.__repr__())
+  
+    input[0].close()    # TODO
+    output[0].close()