New obi import with rewriting of columns when column type or line elements (keys) change
This commit is contained in:
Celine Mercier
2017-07-05 17:15:23 +02:00
parent cb5ad2ed2d
commit 101f764cce

View File

@ -1,133 +1,284 @@
#cython: language_level=3
# from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
# from obitools3.files.universalopener cimport uopen
# from obitools3.parsers.fasta import fastaIterator
# from obitools3.parsers.fastq import fastqIterator
# from obitools3.dms.dms import OBIDMS # TODO cimport doesn't work
#
# import time
#
# TODO cimport generate errors with argument numbers, but without them some variables can't be declared
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work
from obitools3.dms.column.column cimport Column
from obitools3.utils cimport tobytes, \
get_obitype, \
update_obitype
from obitools3.dms.capi.obitypes cimport obitype_t, \
OBI_VOID
from obitools3.dms.capi.obierrno cimport obi_errno
import time
import pickle
# Short description of the command.
__title__ = "Imports sequences from different formats into a DMS"

# Default values of the 'import' options, in the order in which they are
# declared; the keys match the 'import:<key>' destinations registered by
# addOptions().
default_config = dict(
    destview=None,
    skip=0,
    only=None,
    skiperror=False,
    seqinformat=None,
    moltype='nuc',
    filename=None,
)
def addOptions(parser):
    """Register the command line options of the import command on *parser*.

    One optional positional argument (the name of the sequence file) is added
    directly to the parser; every other option is added to an
    'obi import specific options' argument group.  Destinations use the
    '<section>:<key>' naming ('import:...' and 'obi:defaultdms').

    :param parser: an argparse-style parser providing ``add_argument`` and
                   ``add_argument_group``
    """

    parser.add_argument(dest='import:filename',
                        metavar='<FILENAME>',
                        nargs='?',
                        default=None,
                        help='Name of the sequence file to import' )

    group=parser.add_argument_group('obi import specific options')

    group.add_argument('--default-dms','-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data")

    # The help text used to be a copy of the --default-dms one; this option
    # names the destination view, not the DMS.
    group.add_argument('--destination-view','-v',
                       action="store", dest="import:destview",
                       metavar='<VIEW NAME>',
                       default=None,
                       type=str,
                       required=True,
                       help="Name of the destination view in which the sequences are imported")

    group.add_argument('--skip',
                       action="store", dest="import:skip",
                       metavar='<N>',
                       default=0,
                       type=int,
                       help="Skip the N first sequences")

    group.add_argument('--only',
                       action="store", dest="import:only",
                       metavar='<N>',
                       default=None,
                       type=int,
                       help="Treat only N sequences")

    # The flags below keep None as their default (rather than False or an
    # explicit value) so that "not given on the command line" stays
    # distinguishable from an explicit choice.
    group.add_argument('--skip-on-error',
                       action="store_true", dest="import:skiperror",
                       default=None,
                       help="Skip sequence entries with parse error")

    # --fasta and --fastq share one destination: the last one given wins.
    group.add_argument('--fasta',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fasta',
                       help="Input file is in fasta nucleic format (including obitools fasta extentions)")

    group.add_argument('--fastq',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fastq',
                       help="Input file is in sanger fastq nucleic format (standard fastq)")

    # --nuc and --prot share one destination as well.
    group.add_argument('--nuc',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='nuc',
                       help="Input file contains nucleic sequences")

    group.add_argument('--prot',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='pep',
                       help="Input file contains protein sequences")
# TODO: Handling of NA values. Check None. Specify in doc? None or NA? Possibility to specify it with an option?
# See how the R read.table function lets the user specify the NA value
def run(config):
    """Import the sequences of a fasta or fastq file into a new view of a DMS.

    The file named by config['import']['filename'] is opened with uopen() and
    parsed according to config['import']['seqinformat'] ('fasta' or 'fastq').
    The DMS named by config['obi']['defaultdms'] is opened, or created when
    opening fails, and a NUC_SEQS view named config['import']['destview'] is
    created in it (with a quality column for fastq input).

    For each imported sequence, the identifier, definition, sequence (and
    quality for fastq) are written at line i of the view, and each tag is
    written in a column named after the tag.  Tag columns are created the
    first time a tag is met, and rewritten with new attributes when a later
    value does not fit (changed type, new dictionary keys or longer vector).

    config['import']['skip'] sequences are skipped first, and the import
    stops once config['import']['only'] sequences have been imported (no
    limit when it is None).

    :param config: nested configuration mapping with the 'import' and 'obi'
                   sections described above
    :raises RuntimeError: if the sequence format is neither 'fasta' nor
                          'fastq'
    """

    cdef int i
    cdef type value_type
    cdef obitype_t value_obitype
    cdef obitype_t old_type
    cdef obitype_t new_type
    cdef bint get_quality
    cdef bint NUC_SEQS_view
    cdef int nb_elts
    cdef object d
    cdef View view
    cdef object iseq
    cdef object seq
    cdef object inputs
    cdef Column id_col
    cdef Column def_col
    cdef Column seq_col
    cdef Column qual_col
    cdef Column old_column
    cdef bint rewrite
    cdef dict dcols
    cdef int skipping
    cdef str tag
    cdef object value
    cdef list elt_names
    cdef int old_nb_elements_per_line
    cdef int new_nb_elements_per_line
    cdef list old_elements_names
    cdef list new_elements_names
    cdef ProgressBar pb
    cdef object nb_to_skip
    cdef object nb_to_import
    global obi_errno

    pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file

    inputs = uopen(config['import']['filename'])

    # Create or open DMS
    # Only regular errors trigger the creation of a new DMS: a bare except
    # would also swallow KeyboardInterrupt and SystemExit.
    try:
        d = DMS.test_open(config['obi']['defaultdms'])
    except Exception:
        d = DMS.new(config['obi']['defaultdms'])

    # Choose the parser; only fastq input carries quality values
    get_quality = False
    NUC_SEQS_view = False
    if config['import']['seqinformat']=='fasta':
        get_quality = False
        NUC_SEQS_view = True
        iseq = fastaIterator(inputs)
    elif config['import']['seqinformat']=='fastq':
        get_quality = True
        NUC_SEQS_view = True
        iseq = fastqIterator(inputs)
    else:
        raise RuntimeError('File format not handled')

    view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)

    # Save basic columns in variables for optimization
    if NUC_SEQS_view :
        id_col = view["ID"]
        def_col = view["DEFINITION"]
        seq_col = view["NUC_SEQ"]
        if get_quality :
            qual_col = view["QUALITY"]

    # tag name -> (column, obitype currently used by the column)
    dcols = {}

    # Loop-invariant options, read once
    nb_to_skip = config['import']['skip']
    nb_to_import = config['import']['only']

    skipping = 0
    i = 0
    for seq in iseq :

        if skipping < nb_to_skip :    # TODO not efficient because sequences are parsed
            skipping+=1

        elif i == nb_to_import :      # never true when no limit is set (None)
            break

        else :
            pb(i)

            if NUC_SEQS_view :
                id_col[i] = seq['id']
                def_col[i] = seq['definition']
                seq_col[i] = seq['sequence']
                if get_quality :
                    qual_col[i] = seq['quality']

            for tag in seq['tags'] :

                value = seq['tags'][tag]

                if tag not in dcols :
                    # First occurrence of the tag: create its column

                    value_type = type(value)

                    if value_type == dict or value_type == list :
                        nb_elts = len(value)
                        elt_names = list(value)
                    else :
                        nb_elts = 1
                        elt_names = None

                    value_obitype = get_obitype(value)

                    if value_obitype != OBI_VOID :
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)

                        # Fill value
                        dcols[tag][0][i] = value

                    # TODO else log error?

                else :

                    rewrite = False

                    # Check type adequation
                    old_type = dcols[tag][1]
                    new_type = update_obitype(old_type, value)
                    if old_type != new_type :
                        rewrite = True

                    try:
                        # Fill value
                        dcols[tag][0][i] = value

                    except IndexError :
                        # NOTE(review): the column rewriting is assumed to be part of
                        # this handler, i.e. only attempted once the value could not
                        # be stored in the existing column -- confirm against the
                        # original file layout.

                        value_type = type(value)
                        old_column = dcols[tag][0]
                        old_nb_elements_per_line = old_column.nb_elements_per_line
                        new_nb_elements_per_line = 0
                        old_elements_names = old_column.elements_names
                        new_elements_names = None

                        #####################################################################

                        # Check the length and keys of column lines if needed
                        if value_type == dict :    # Check dictionary keys
                            for k in value :
                                if k not in old_elements_names :
                                    new_elements_names = list(value)
                                    rewrite = True
                                    break

                        elif value_type == list or value_type == tuple :    # Check vector length
                            if old_nb_elements_per_line < len(value) :
                                new_nb_elements_per_line = len(value)
                                rewrite = True

                        #####################################################################

                        if rewrite :
                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                new_nb_elements_per_line = len(new_elements_names)

                            # Record new_type with the rewritten column: value_obitype
                            # only holds the type of the most recently *created* column
                            # (possibly for another tag) and would corrupt the next
                            # type check for this tag.
                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
                                                                                   new_data_type=new_type,
                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                   new_elements_names=new_elements_names),
                                          new_type)

                            # Reset obierrno
                            obi_errno = 0

                            # Fill value
                            dcols[tag][0][i] = value

            i+=1

    print("\n")
    print(view.__repr__())

    d.close()