Cython API: obi import can now import ngsfilter files and tabular files

2018-03-12 18:10:43 +01:00
parent 8a0b95c1d6
commit 15e43bb9a1
9 changed files with 168 additions and 142 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -1,13 +1,8 @@
 #cython: language_level=3

-# TODO cimport generate errors with argument numbers, but without them some variables can't be declared
-
 import sys

 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.files.universalopener cimport uopen
-from obitools3.parsers.fasta import fastaIterator
-from obitools3.parsers.fastq import fastqIterator
 from obitools3.dms.view.view cimport View
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
@ -24,7 +19,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \

 from obitools3.dms.capi.obierrno cimport obi_errno

-from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.apps.optiongroups import addSequenceInputOption, addTabularInputOption, addMinimalOutputOption

 from obitools3.uri.decode import open_uri

@ -45,6 +40,7 @@ default_config = {   'destview'     : None,
 def addOptions(parser):
    
    addSequenceInputOption(parser)
+    addTabularInputOption(parser)
    addMinimalOutputOption(parser)
 #    addTaxdumpInputOption(parser)

@ -63,8 +59,8 @@ def run(config):
    cdef   int         nb_elts
    cdef   object      d
    cdef   View        view
-    cdef   object      iseq
-    cdef   object      seq
+    cdef   object      entries
+    cdef   object      entry
    cdef   Column      id_col
    cdef   Column      def_col
    cdef   Column      seq_col
@ -108,9 +104,9 @@ def run(config):
        
    pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
    
-    iseq = input[1]
+    entries = input[1]
    
-    NA_value = config['obi']['nastring']
+    NA_value = tobytes(config['obi']['nastring'])   # TODO
    
    NUC_SEQS_view = False
    if isinstance(output[1], View) :
@ -121,39 +117,39 @@ def run(config):
        raise NotImplementedError()
    
    # Save basic columns in variables for optimization
-    if NUC_SEQS_view :
-        id_col = view[b"ID"]
+    if NUC_SEQS_view :      
+        id_col = view[b"ID"]            # TODO use macros or globals for column names
        def_col = view[b"DEFINITION"]
        seq_col = view[b"NUC_SEQ"]
     
    dcols = {}
     
    i = 0
-    for seq in iseq :
-        
+    for entry in entries :
+            
        pb(i)
                
        if NUC_SEQS_view :
            
            # Check if there is a sequencing quality associated     # TODO
            if i == 0:
-                get_quality = b"QUALITY" in seq
+                get_quality = b"QUALITY" in entry
                if get_quality:
                    Column.new_column(view, b"QUALITY", OBI_QUAL)
                    qual_col = view[b"QUALITY"]
            
-            id_col[i] = seq.id
-            def_col[i] = seq.definition
-            seq_col[i] = seq.seq
+            id_col[i] = entry.id
+            def_col[i] = entry.definition
+            seq_col[i] = entry.seq
            
            if get_quality :
-                qual_col[i] = seq.quality
+                qual_col[i] = entry.quality
         
-        for tag in seq :
+        for tag in entry :
            
            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm... 
                                
-                value = seq[tag]
+                value = entry[tag]
                                
                # Check NA value
                if value == NA_value :