New obi import with rewriting of columns when the column type or the line elements (keys) change
2017-07-05 17:15:23 +02:00
parent cb5ad2ed2d
commit 101f764cce
1 changed files with 281 additions and 130 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -1,133 +1,284 @@
-# from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-# from obitools3.files.universalopener cimport uopen
-# from obitools3.parsers.fasta import fastaIterator
-# from obitools3.parsers.fastq import fastqIterator
-# from obitools3.dms.dms import OBIDMS       # TODO cimport doesn't work
-# 
-# import time
-# 
+#cython: language_level=3

+# TODO: cimport generates errors about argument numbers, but without cimport some variables can't be declared
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.files.universalopener cimport uopen
+from obitools3.parsers.fasta import fastaIterator
+from obitools3.parsers.fastq import fastqIterator
+from obitools3.dms.dms import DMS       # TODO cimport doesn't work
+from obitools3.dms.view.view cimport View
+from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS    # TODO cimport doesn't work
+from obitools3.dms.column.column cimport Column
+
+from obitools3.utils cimport tobytes, \
+                             get_obitype, \
+                             update_obitype
+
+from obitools3.dms.capi.obitypes cimport obitype_t, \
+                                         OBI_VOID
+
+from obitools3.dms.capi.obierrno cimport obi_errno
+
+import time
+
+import pickle
+
+
+__title__="Imports sequences from different formats into a DMS"
+ 
+ 
+default_config = {   'destview'     : None,     # set by --destination-view / -v
+                     'skip'         : 0,        # set by --skip; number of leading sequences run() skips
+                     'only'         : None,     # set by --only; run() stops once this many sequences are imported
+                     'skiperror'    : False,    # set by --skip-on-error; not read by run() in this revision
+                     'seqinformat'  : None,     # 'fasta' or 'fastq' (--fasta / --fastq); run() raises on anything else
+                     'moltype'      : 'nuc',    # 'nuc' or 'pep' (--nuc / --prot); not read by run() in this revision
+                     'filename'     : None      # positional <FILENAME> argument
+                 }
+
+def addOptions(parser):
+    """Register the command-line options of the ``obi import`` command.
+
+    Adds the positional input file name plus an 'obi import specific options'
+    group to ``parser``. Each option is stored under a ``section:key``
+    destination (e.g. ``import:skip``).
+
+    :param parser: an argparse-style parser exposing ``add_argument`` and
+                   ``add_argument_group``
+    """
+    parser.add_argument(dest='import:filename',
+                        metavar='<FILENAME>',
+                        nargs='?',
+                        default=None,
+                        help='Name of the sequence file to import' )
+
+    group=parser.add_argument_group('obi import specific options')
+
+    group.add_argument('--default-dms','-d',
+                     action="store", dest="obi:defaultdms",
+                     metavar='<DMS NAME>',
+                     default=None,
+                     type=str,
+                     help="Name of the default DMS for reading and writing data")
+
+    # The help text used to be a copy of the --default-dms one; it now
+    # describes what the option actually selects.
+    group.add_argument('--destination-view','-v',
+                     action="store", dest="import:destview",
+                     metavar='<VIEW NAME>',
+                     default=None,
+                     type=str,
+                     required=True,
+                     help="Name of the destination view in which the sequences are imported")
+
+    group.add_argument('--skip',
+                     action="store", dest="import:skip",
+                     metavar='<N>',
+                     default=0,
+                     type=int,
+                     help="Skip the N first sequences")
+
+    group.add_argument('--only',
+                     action="store", dest="import:only",
+                     metavar='<N>',
+                     default=None,
+                     type=int,
+                     help="Treat only N sequences")
+
+    group.add_argument('--skip-on-error',
+                     action="store_true", dest="import:skiperror",
+                     default=None,
+                     help="Skip sequence entries with parse error")
+
+    # --fasta and --fastq share one destination: the last one given wins.
+    group.add_argument('--fasta',
+                     action="store_const", dest="import:seqinformat",
+                     default=None,
+                     const='fasta',
+                     help="Input file is in fasta nucleic format (including obitools fasta extensions)")
+
+    group.add_argument('--fastq',
+                     action="store_const", dest="import:seqinformat",
+                     default=None,
+                     const='fastq',
+                     help="Input file is in sanger fastq nucleic format (standard fastq)")
+
+    # --nuc and --prot share one destination: the last one given wins.
+    group.add_argument('--nuc',
+                     action="store_const", dest="import:moltype",
+                     default=None,
+                     const='nuc',
+                     help="Input file contains nucleic sequences")
+
+    group.add_argument('--prot',
+                     action="store_const", dest="import:moltype",
+                     default=None,
+                     const='pep',
+                     help="Input file contains protein sequences")
+
+
+# TODO: Handling of NA values. Check None. Specify in doc? None or NA? Possibility to specify it with an option?
+# Look at how the R read.table option specifies the NA value
 def run(config):
-    pass
+    """Import the sequences of a fasta or fastq file into a view of a DMS.
+
+    Reads ``config['import']['filename']``, opens the DMS named by
+    ``config['obi']['defaultdms']`` (creating it when opening fails), creates
+    the view ``config['import']['destview']`` and fills it with one line per
+    imported sequence. The first ``config['import']['skip']`` sequences are
+    ignored and the import stops after ``config['import']['only']`` sequences
+    when that value is set. One column is created per tag the first time the
+    tag is seen; when a later value raises ``IndexError`` on assignment and
+    the column type, the dictionary keys or the vector length changed, the
+    column is rewritten with the new attributes before the value is stored.
+
+    The view representation is printed and the DMS closed at the end.
+
+    :param config: nested configuration mapping with 'import' and 'obi' sections
+    :raises RuntimeError: if ``config['import']['seqinformat']`` is neither
+                          'fasta' nor 'fastq'
+    """
+
+    cdef   int         i
+    cdef   type        value_type
+    cdef   obitype_t   value_obitype
+    cdef   obitype_t   old_type
+    cdef   obitype_t   new_type
+    cdef   bint        get_quality
+    cdef   bint        NUC_SEQS_view
+    cdef   int         nb_elts
+    cdef   object      d
+    cdef   View        view
+    cdef   object      iseq
+    cdef   object      seq
+    cdef   object      inputs
+    cdef   Column      id_col
+    cdef   Column      def_col
+    cdef   Column      seq_col
+    cdef   Column      qual_col
+    cdef   Column      old_column
+    cdef   bint        rewrite
+    cdef   dict        dcols
+    cdef   int         skipping
+    cdef   str         tag
+    cdef   object      value
+    cdef   list        elt_names
+    cdef   int         old_nb_elements_per_line
+    cdef   int         new_nb_elements_per_line
+    cdef   list        old_elements_names
+    cdef   list        new_elements_names
+    cdef   ProgressBar pb
+    global             obi_errno
+
+    pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
+
+    inputs = uopen(config['import']['filename'])
+
+    # Create or open DMS
+    try:
+        d = DMS.test_open(config['obi']['defaultdms'])
+    except Exception:
+        # Narrowed from a bare except so that KeyboardInterrupt/SystemExit
+        # are no longer swallowed and turned into a DMS creation.
+        d = DMS.new(config['obi']['defaultdms'])
+
+    get_quality = False
+    NUC_SEQS_view = False
+    if config['import']['seqinformat']=='fasta':
+        get_quality = False
+        NUC_SEQS_view = True
+        iseq = fastaIterator(inputs)
+        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
+    elif config['import']['seqinformat']=='fastq':
+        get_quality = True
+        NUC_SEQS_view = True
+        iseq = fastqIterator(inputs)
+        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
+    else:
+        raise RuntimeError('File format not handled')
+
+    # Save basic columns in variables for optimization
+    if NUC_SEQS_view :
+        id_col = view["ID"]
+        def_col = view["DEFINITION"]
+        seq_col = view["NUC_SEQ"]
+        if get_quality :
+            qual_col = view["QUALITY"]
+
+    # tag -> (column, obitype the column currently has)
+    dcols = {}
+
+    skipping = 0
+    i = 0     # index of the next line written in the view
+    for seq in iseq :
+        if skipping < config['import']['skip'] :    # TODO not efficient because sequences are parsed
+            skipping+=1
+        elif i == config['import']['only'] :
+            # 'only' defaults to None, which never equals i: no limit then
+            break
+        else :
+            pb(i)
+            if NUC_SEQS_view :
+                id_col[i] = seq['id']
+                def_col[i] = seq['definition']
+                seq_col[i] = seq['sequence']
+                if get_quality :
+                    qual_col[i] = seq['quality']
+
+            for tag in seq['tags'] :
+
+                value = seq['tags'][tag]
+
+                if tag not in dcols :
+
+                    # First occurrence of the tag: create its column
+                    value_type = type(value)
+                    nb_elts = 1
+                    value_obitype = OBI_VOID
+
+                    # NOTE(review): tuples are treated as vectors in the
+                    # rewrite branch below but as single elements here --
+                    # confirm whether that asymmetry is intended.
+                    if value_type == dict or value_type == list :
+                        nb_elts = len(value)
+                        elt_names = list(value)
+                    else :
+                        nb_elts = 1
+                        elt_names = None
+
+                    value_obitype = get_obitype(value)
+
+                    if value_obitype != OBI_VOID :
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    # TODO else log error?
+
+                else :
+
+                    rewrite = False
+
+                    # Check type adequation
+                    old_type = dcols[tag][1]
+                    new_type = OBI_VOID
+                    new_type = update_obitype(old_type, value)
+                    if old_type != new_type :
+                        rewrite = True
+
+                    # NOTE(review): the rewrite only happens when the
+                    # assignment below raises IndexError; presumably a type
+                    # mismatch also surfaces that way -- TODO confirm.
+                    try:
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    except IndexError :
+
+                        value_type = type(value)
+                        old_column = dcols[tag][0]
+                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        new_nb_elements_per_line = 0
+                        old_elements_names = old_column.elements_names
+                        new_elements_names = None
+
+                        #####################################################################
+
+                        # Check the length and keys of column lines if needed
+                        if value_type == dict :    # Check dictionary keys
+                            for k in value :
+                                if k not in old_elements_names :
+                                    new_elements_names = list(value)
+                                    rewrite = True
+                                    break
+
+                        elif value_type == list or value_type == tuple :  # Check vector length
+                            if old_nb_elements_per_line < len(value) :
+                                new_nb_elements_per_line = len(value)
+                                rewrite = True
+
+                        #####################################################################
+
+                        if rewrite :
+                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+                                new_nb_elements_per_line = len(new_elements_names)
+
+                            # Cache new_type with the rewritten column: it is
+                            # the type the column now has. value_obitype was
+                            # used before, but it is only assigned when a
+                            # column is first created, so it was stale here.
+                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
+                                                                                   new_data_type=new_type,
+                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+                                                                                   new_elements_names=new_elements_names),
+                                          new_type)
+
+                            # Reset obierrno
+                            obi_errno = 0
+
+                            # Fill value
+                            dcols[tag][0][i] = value
+
+            i+=1
+
+    print("\n")
+    print(view.__repr__())
+
+    d.close()

-# __title__="Counts sequences in a sequence set"
-# 
-# 
-# default_config = {   'destview'     : None,
-#                      'skip'         : 0,
-#                      'only'         : None,
-#                      'skiperror'    : False,
-#                      'seqinformat'  : None,
-#                      'moltype'      : 'nuc',
-#                      'filename'     : None
-#                  }
-# 
-# def addOptions(parser):
-#     parser.add_argument(dest='import:filename',     
-#                         metavar='<FILENAME>', 
-#                         nargs='?', 
-#                         default=None,
-#                         help='sequence file name to be imported' )
-# 
-#     group=parser.add_argument_group('obi import specific options')
-# 
-#     group.add_argument('--default-dms','-d',
-#                      action="store", dest="obi:defaultdms",
-#                      metavar='<DMS NAME>',
-#                      default=None,
-#                      type=str,
-#                      help="Name of the default DMS for reading and writing data")
-#     
-#     group.add_argument('--destination-view','-v',
-#                      action="store", dest="import:destview",
-#                      metavar='<VIEW NAME>',
-#                      default=None,
-#                      type=str,
-#                      required=True,
-#                      help="Name of the default DMS for reading and writing data")
-#     
-#     group.add_argument('--skip',
-#                      action="store", dest="import:skip",
-#                      metavar='<N>',
-#                      default=None,
-#                      type=int,
-#                      help="skip the N first sequences")
-# 
-#     group.add_argument('--only',
-#                      action="store", dest="import:only",
-#                      metavar='<N>',
-#                      default=None,
-#                      type=int,
-#                      help="treat only N sequences")
-# 
-#     group.add_argument('--skip-on-error',
-#                      action="store_true", dest="import:skiperror",
-#                      default=None,
-#                      help="Skip sequence entries with parse error")
-#     
-#     group.add_argument('--fasta',
-#                      action="store_const", dest="import:seqinformat",
-#                      default=None,
-#                      const='fasta',
-#                      help="Input file is in fasta nucleic format (including obitools fasta extentions)")
-# 
-#     group.add_argument('--fastq',
-#                      action="store_const", dest="import:seqinformat",
-#                      default=None,
-#                      const='fastq',
-#                      help="Input file is in sanger fastq nucleic format (standard fastq)")
-# 
-#     group.add_argument('--nuc',
-#                      action="store_const", dest="import:moltype",
-#                      default=None,
-#                      const='nuc',
-#                      help="Input file contains nucleic sequences")
-#     
-#     group.add_argument('--prot',
-#                      action="store_const", dest="import:moltype",
-#                      default=None,
-#                      const='pep',
-#                      help="Input file contains protein sequences")
-# 
-# 
-# 
-# # TODO: Handling of NA values
-# def run(config):
-#     pb = ProgressBar(35000000, config, seconde=5)   # TODO should be number of records in file
-#       
-#     inputs = uopen(config['import']['filename'])
-#     
-#     get_quality = False
-#     if config['import']['seqinformat']=='fasta':
-#         iseq = fastaIterator(inputs)
-#         view_type="NUC_SEQS_VIEW"
-#     elif config['import']['seqinformat']=='fastq':
-#         iseq = fastqIterator(inputs)
-#         view_type="NUC_SEQS_VIEW"
-#         get_quality = True
-#     else:
-#         raise RuntimeError('No file format specified')
-#         
-#     # Create DMS
-#     d = OBIDMS(config['obi']['defaultdms'])
-#      
-#     # Create view
-# #     view = d.new_view(config['import']['destview'], view_type=view_type, quality_column=get_quality)
-# #     
-# #     i = 0
-# #     for seq in iseq:
-# #         pb(i)
-# #         view[i].id = seq['id']
-# #         view[i].definition = seq['definition']
-# #         view[i].nuc_seq = seq['sequence']
-# #         if get_quality :
-# #             view[i].quality = seq['quality']
-# #         for tag in seq['tags'] :
-# #             view[i][tag] = seq['tags'][tag]
-# #         i+=1
-# # 
-# #     #print(view.__repr__())
-# # 
-# #     view.close()
-#     d.close()
-#