New obi import with rewriting of columns when the column type or the line elements (keys) change
2017-07-05 17:15:23 +02:00
parent cb5ad2ed2d
commit 101f764cce
1 changed files with 281 additions and 130 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -1,133 +1,284 @@
-# from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+#cython: language_level=3
 # from obitools3.files.universalopener cimport uopen
 # from obitools3.parsers.fasta import fastaIterator
 # from obitools3.parsers.fastq import fastqIterator
 # from obitools3.dms.dms import OBIDMS       # TODO cimport doesn't work
 # 
 # import time
 # 
 # TODO cimport generate errors with argument numbers, but without them some variables can't be declared
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
 from obitools3.parsers.fastq import fastqIterator
 from obitools3.dms.dms import DMS       # TODO cimport doesn't work
 from obitools3.dms.view.view cimport View
 from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS    # TODO cimport doesn't work
 from obitools3.dms.column.column cimport Column
 from obitools3.utils cimport tobytes, \
                             get_obitype, \
                             update_obitype
 from obitools3.dms.capi.obitypes cimport obitype_t, \
                                         OBI_VOID
 from obitools3.dms.capi.obierrno cimport obi_errno
 import time
 import pickle
 # One-line description of the command.
 __title__ = "Imports sequences from different formats into a DMS"
 # Default values of the 'import' options; each key matches the part after
 # 'import:' in the destinations registered by addOptions().
 default_config = dict(
     destview=None,        # name of the destination view
     skip=0,               # number of leading sequences to skip
     only=None,            # maximum number of sequences to treat (None: no limit given)
     skiperror=False,      # skip entries with a parse error
     seqinformat=None,     # 'fasta' or 'fastq'
     moltype='nuc',        # 'nuc' or 'pep'
     filename=None,        # name of the sequence file to import
 )
 def addOptions(parser):
    """Register the command-line options of the import command on *parser*.

    One optional positional argument (the input file name) is added directly
    to *parser*; all other options go into an argument group titled
    'obi import specific options'.

    Values are stored under namespaced destinations: 'obi:defaultdms' for the
    DMS name and 'import:<key>' for everything else, where <key> is one of the
    keys of ``default_config``.

    :param parser: an ``argparse.ArgumentParser`` (or compatible object
                   providing ``add_argument`` and ``add_argument_group``)
    """
    parser.add_argument(dest='import:filename',
                        metavar='<FILENAME>',
                        nargs='?',
                        default=None,
                        help='Name of the sequence file to import')

    group = parser.add_argument_group('obi import specific options')

    group.add_argument('--default-dms', '-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data")

    # The help text used to be a copy of the --default-dms one; this option
    # names the view that receives the imported sequences.
    group.add_argument('--destination-view', '-v',
                       action="store", dest="import:destview",
                       metavar='<VIEW NAME>',
                       default=None,
                       type=str,
                       required=True,
                       help="Name of the destination view in which the sequences are imported")

    group.add_argument('--skip',
                       action="store", dest="import:skip",
                       metavar='<N>',
                       default=0,
                       type=int,
                       help="Skip the N first sequences")

    group.add_argument('--only',
                       action="store", dest="import:only",
                       metavar='<N>',
                       default=None,
                       type=int,
                       help="Treat only N sequences")

    group.add_argument('--skip-on-error',
                       action="store_true", dest="import:skiperror",
                       default=None,
                       help="Skip sequence entries with parse error")

    # --fasta and --fastq share one destination: the last one given wins.
    group.add_argument('--fasta',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fasta',
                       help="Input file is in fasta nucleic format (including obitools fasta extensions)")

    group.add_argument('--fastq',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fastq',
                       help="Input file is in sanger fastq nucleic format (standard fastq)")

    # --nuc and --prot share one destination: the last one given wins.
    group.add_argument('--nuc',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='nuc',
                       help="Input file contains nucleic sequences")

    group.add_argument('--prot',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='pep',
                       help="Input file contains protein sequences")
 # TODO: Handling of NA values. Check for None. Specify in the doc? None or NA? Possibility to specify it as an option?
 # See the R read.table option used to specify the NA value.
 def run(config):
    # Import the sequences of a fasta or fastq file into a view of a DMS.
    #
    # Reading guide: this listing is a diff. Lines starting with '+' are the
    # new implementation, lines starting with '-#' are the former
    # commented-out code removed by the commit; both are interleaved below.
    # The unmarked lines at the end belong to the new implementation too.
    #
    # Steps performed by the new code:
    #   1. open the input with uopen(config['import']['filename']);
    #   2. open the DMS named config['obi']['defaultdms'] with DMS.test_open,
    #      falling back to DMS.new when that raises;
    #   3. choose the parser from config['import']['seqinformat'] ('fasta' or
    #      'fastq') and create a View_NUC_SEQS named
    #      config['import']['destview'] (quality=True only for fastq); any
    #      other format raises RuntimeError('File format not handled');
    #   4. for every parsed sequence, skip the first config['import']['skip']
    #      entries and stop once config['import']['only'] entries have been
    #      written; store id, definition, sequence (and quality) at line i,
    #      then store each tag in its own column, creating the column the
    #      first time the tag is seen and rewriting it when filling raises
    #      IndexError and the value needs a new type, more elements per line
    #      or new element names;
    #   5. print the view representation and close the DMS.
    #
    # config is indexed as config[section][key] with the sections 'import'
    # and 'obi'.
    #
    # NOTE(review): the `pass` below looks like a leftover no-op from the
    # previous stub body.
    # NOTE(review): `inputs` is not explicitly closed in this function.
    pass
-# __title__="Counts sequences in a sequence set"
+    cdef   int         i
-# 
+    cdef   type        value_type
-# 
+    cdef   obitype_t   value_obitype
-# default_config = {   'destview'     : None,
+    cdef   obitype_t   old_type
-#                      'skip'         : 0,
+    cdef   obitype_t   new_type
-#                      'only'         : None,
+    cdef   bint        get_quality
-#                      'skiperror'    : False,
+    cdef   bint        NUC_SEQS_view
-#                      'seqinformat'  : None,
+    cdef   int         nb_elts
-#                      'moltype'      : 'nuc',
+    cdef   object      d
-#                      'filename'     : None
+    cdef   View        view
-#                  }
+    cdef   object      iseq
-# 
+    cdef   object      seq
-# def addOptions(parser):
+    cdef   object      inputs
-#     parser.add_argument(dest='import:filename',     
+    cdef   Column      id_col
-#                         metavar='<FILENAME>', 
+    cdef   Column      def_col
-#                         nargs='?', 
+    cdef   Column      seq_col
-#                         default=None,
+    cdef   Column      qual_col
-#                         help='sequence file name to be imported' )
+    cdef   Column      old_column
-# 
+    cdef   bint        rewrite
-#     group=parser.add_argument_group('obi import specific options')
+    cdef   dict        dcols
-# 
+    cdef   int         skipping
-#     group.add_argument('--default-dms','-d',
+    cdef   str         tag
-#                      action="store", dest="obi:defaultdms",
+    cdef   object      value
-#                      metavar='<DMS NAME>',
+    cdef   list        elt_names
-#                      default=None,
+    cdef   int         old_nb_elements_per_line
-#                      type=str,
+    cdef   int         new_nb_elements_per_line
-#                      help="Name of the default DMS for reading and writing data")
+    cdef   list        old_elements_names
-#     
+    cdef   list        new_elements_names
-#     group.add_argument('--destination-view','-v',
+    cdef   ProgressBar pb
-#                      action="store", dest="import:destview",
+    global             obi_errno
-#                      metavar='<VIEW NAME>',
+    
-#                      default=None,
+    pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
-#                      type=str,
+       
-#                      required=True,
+    inputs = uopen(config['import']['filename'])
-#                      help="Name of the default DMS for reading and writing data")
+
-#     
+    # Create or open DMS
-#     group.add_argument('--skip',
    # NOTE(review): the bare `except` below catches every exception, so any
    # failure of DMS.test_open (not only "DMS does not exist") leads to
    # DMS.new being called - confirm that this is intended.
+    try:
-#                      action="store", dest="import:skip",
+        d = DMS.test_open(config['obi']['defaultdms'])
-#                      metavar='<N>',
+    except :
-#                      default=None,
+        d = DMS.new(config['obi']['defaultdms'])
-#                      type=int,
+
-#                      help="skip the N first sequences")
+    get_quality = False
-# 
+    NUC_SEQS_view = False
-#     group.add_argument('--only',
+    if config['import']['seqinformat']=='fasta':
-#                      action="store", dest="import:only",
+        get_quality = False
-#                      metavar='<N>',
+        NUC_SEQS_view = True
-#                      default=None,
+        iseq = fastaIterator(inputs)
-#                      type=int,
+        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#                      help="treat only N sequences")
+    elif config['import']['seqinformat']=='fastq':
-# 
+        get_quality = True
-#     group.add_argument('--skip-on-error',
+        NUC_SEQS_view = True
-#                      action="store_true", dest="import:skiperror",
+        iseq = fastqIterator(inputs)
-#                      default=None,
+        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#                      help="Skip sequence entries with parse error")
+    else:
-#     
+        raise RuntimeError('File format not handled')
-#     group.add_argument('--fasta',
+        
-#                      action="store_const", dest="import:seqinformat",
+    # Save basic columns in variables for optimization
-#                      default=None,
+    if NUC_SEQS_view :
-#                      const='fasta',
+        id_col = view["ID"]
-#                      help="Input file is in fasta nucleic format (including obitools fasta extentions)")
+        def_col = view["DEFINITION"]
-# 
+        seq_col = view["NUC_SEQ"]
-#     group.add_argument('--fastq',
+        if get_quality :
-#                      action="store_const", dest="import:seqinformat",
+            qual_col = view["QUALITY"]
-#                      default=None,
+    
-#                      const='fastq',
    # dcols maps a tag name to a (column, obitype) pair for every tag column
    # created so far.
+    dcols = {}
-#                      help="Input file is in sanger fastq nucleic format (standard fastq)")
+    
-# 
    # `skipping` counts the entries skipped so far; `i` is the index of the
    # next line to write and is only incremented when an entry is written.
+    skipping = 0
-#     group.add_argument('--nuc',
+    i = 0
-#                      action="store_const", dest="import:moltype",
+    for seq in iseq :
-#                      default=None,
+        if skipping < config['import']['skip'] :    # TODO not efficient because sequences are parsed
-#                      const='nuc',
+            skipping+=1
-#                      help="Input file contains nucleic sequences")
    # When 'only' is None the comparison below is never true, so every
    # remaining entry is imported.
+        elif i == config['import']['only'] :
-#     
+            break
-#     group.add_argument('--prot',
+        else :
-#                      action="store_const", dest="import:moltype",
+            pb(i)
-#                      default=None,
+            if NUC_SEQS_view :
-#                      const='pep',
+                id_col[i] = seq['id']
-#                      help="Input file contains protein sequences")
+                def_col[i] = seq['definition']
-# 
+                seq_col[i] = seq['sequence']
-# 
+                if get_quality :
-# 
+                    qual_col[i] = seq['quality']
-# # TODO: Handling of NA values
+            
-# def run(config):
+            for tag in seq['tags'] :
-#     pb = ProgressBar(35000000, config, seconde=5)   # TODO should be number of records in file
+    
-#       
+                value = seq['tags'][tag]
-#     inputs = uopen(config['import']['filename'])
+                
-#     
    # First occurrence of the tag: derive the number of elements per line,
    # the element names and the obitype from this first value, then create
    # the column.
+                if tag not in dcols :
-#     get_quality = False
+                    
-#     if config['import']['seqinformat']=='fasta':
+                    value_type = type(value)
-#         iseq = fastaIterator(inputs)
+                    nb_elts = 1
-#         view_type="NUC_SEQS_VIEW"
+                    value_obitype = OBI_VOID
-#     elif config['import']['seqinformat']=='fastq':
+                    
-#         iseq = fastqIterator(inputs)
+                    if value_type == dict or value_type == list :
-#         view_type="NUC_SEQS_VIEW"
+                        nb_elts = len(value)
-#         get_quality = True
+                        elt_names = list(value)
-#     else:
+                    else :
-#         raise RuntimeError('No file format specified')
+                        nb_elts = 1
-#         
+                        elt_names = None
-#     # Create DMS
+                    
-#     d = OBIDMS(config['obi']['defaultdms'])
+                    value_obitype = get_obitype(value)
-#      
+                    
-#     # Create view
    # A tag whose obitype is OBI_VOID gets no column and is not recorded in
    # dcols, so the same test runs again at its next occurrence.
+                    if value_obitype != OBI_VOID :
-# #     view = d.new_view(config['import']['destview'], view_type=view_type, quality_column=get_quality)
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-# #     
+
-# #     i = 0
+                        # Fill value
-# #     for seq in iseq:
+                        dcols[tag][0][i] = value
-# #         pb(i)
+                    
-# #         view[i].id = seq['id']
+                    # TODO else log error?
-# #         view[i].definition = seq['definition']
+
-# #         view[i].nuc_seq = seq['sequence']
+                else :
-# #         if get_quality :
+        
-# #             view[i].quality = seq['quality']
+                    rewrite = False
-# #         for tag in seq['tags'] :
+
-# #             view[i][tag] = seq['tags'][tag]
+                    # Check type adequation
-# #         i+=1
+                    old_type = dcols[tag][1]
-# # 
+                    new_type = OBI_VOID
-# #     #print(view.__repr__())
+                    new_type = update_obitype(old_type, value)
-# # 
    # NOTE(review): `rewrite` is only acted upon inside the IndexError handler
    # below; presumably the column assignment raises IndexError when the type
    # no longer fits - confirm, otherwise a type change alone never triggers
    # the rewrite.
+                    if old_type != new_type :
-# #     view.close()
+                        rewrite = True
-#     d.close()
+
-#     
+                    try:
                        # Fill value
                        dcols[tag][0][i] = value
                    # Filling failed: work out whether the column has to be
                    # rewritten with different attributes.
                    except IndexError :
                        value_type = type(value)
                        old_column = dcols[tag][0]
                        old_nb_elements_per_line = old_column.nb_elements_per_line
                        # 0 and None mean "not changed yet" for the two new_* values.
                        new_nb_elements_per_line = 0
                        old_elements_names = old_column.elements_names
                        new_elements_names = None
                        #####################################################################
                        # Check the length and keys of column lines if needed
                        if value_type == dict :    # Check dictionary keys
                            # One unknown key is enough: the new element names are
                            # the keys of this value only.
                            # NOTE(review): keys of the old column that are absent
                            # from this value are not carried over here - confirm
                            # how rewrite_column_with_diff_attributes handles them.
                            for k in value :
                                if k not in old_elements_names :
                                    new_elements_names = list(value)
                                    rewrite = True
                                    break
                        elif value_type == list or value_type == tuple :  # Check vector length
                            if old_nb_elements_per_line < len(value) :
                                new_nb_elements_per_line = len(value)
                                rewrite = True
                        #####################################################################
                        # If no rewrite is needed the value is left unset and
                        # the IndexError is silently dropped.
                        if rewrite :
                            # Element names changed without an explicit length:
                            # use the number of new names.
                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                new_nb_elements_per_line = len(new_elements_names)
                            # NOTE(review): the type stored next to the new column is
                            # `value_obitype`, i.e. whatever the last "new tag" branch
                            # computed (possibly for another tag, or unset); `new_type`
                            # looks like the intended value - confirm.
                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
                                                                                   new_data_type=new_type, 
                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                   new_elements_names=new_elements_names), 
                                          value_obitype)
                            # Reset obierrno
                            # (presumably set by the failed assignment above - confirm)
                            obi_errno = 0
                            # Fill value
                            dcols[tag][0][i] = value
            # One more line written; skipped entries do not advance i.
            i+=1
    print("\n")
    print(view.__repr__())
    d.close()