New obi import with rewriting of columns when column type or line
elements (keys) change
This commit is contained in:
@ -1,133 +1,284 @@
|
|||||||
# from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
#cython: language_level=3
|
||||||
# from obitools3.files.universalopener cimport uopen
|
|
||||||
# from obitools3.parsers.fasta import fastaIterator
|
|
||||||
# from obitools3.parsers.fastq import fastqIterator
|
|
||||||
# from obitools3.dms.dms import OBIDMS # TODO cimport doesn't work
|
|
||||||
#
|
|
||||||
# import time
|
|
||||||
#
|
|
||||||
|
|
||||||
|
# TODO cimport generate errors with argument numbers, but without them some variables can't be declared
|
||||||
|
|
||||||
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
|
from obitools3.files.universalopener cimport uopen
|
||||||
|
from obitools3.parsers.fasta import fastaIterator
|
||||||
|
from obitools3.parsers.fastq import fastqIterator
|
||||||
|
from obitools3.dms.dms import DMS # TODO cimport doesn't work
|
||||||
|
from obitools3.dms.view.view cimport View
|
||||||
|
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work
|
||||||
|
from obitools3.dms.column.column cimport Column
|
||||||
|
|
||||||
|
from obitools3.utils cimport tobytes, \
|
||||||
|
get_obitype, \
|
||||||
|
update_obitype
|
||||||
|
|
||||||
|
from obitools3.dms.capi.obitypes cimport obitype_t, \
|
||||||
|
OBI_VOID
|
||||||
|
|
||||||
|
from obitools3.dms.capi.obierrno cimport obi_errno
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
|
__title__="Imports sequences from different formats into a DMS"
|
||||||
|
|
||||||
|
|
||||||
|
default_config = { 'destview' : None,
|
||||||
|
'skip' : 0,
|
||||||
|
'only' : None,
|
||||||
|
'skiperror' : False,
|
||||||
|
'seqinformat' : None,
|
||||||
|
'moltype' : 'nuc',
|
||||||
|
'filename' : None
|
||||||
|
}
|
||||||
|
|
||||||
|
def addOptions(parser):
    """Register the command-line options of the ``obi import`` command.

    Every option is stored under a ``section:key`` destination name
    (``import:...`` or ``obi:...``).  Options that are not given on the
    command line keep a ``None`` default, except ``--skip`` which defaults
    to ``0``.

    :param parser: an ``argparse.ArgumentParser``-like object providing
                   ``add_argument`` and ``add_argument_group``.
    """
    # Optional positional argument: the sequence file to import.
    parser.add_argument(dest='import:filename',
                        metavar='<FILENAME>',
                        nargs='?',
                        default=None,
                        help='Name of the sequence file to import')

    group = parser.add_argument_group('obi import specific options')

    group.add_argument('--default-dms', '-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data")

    # The only mandatory option: the view that run() creates and fills.
    group.add_argument('--destination-view', '-v',
                       action="store", dest="import:destview",
                       metavar='<VIEW NAME>',
                       default=None,
                       type=str,
                       required=True,
                       help="Name of the view in which the imported sequences are stored")

    group.add_argument('--skip',
                       action="store", dest="import:skip",
                       metavar='<N>',
                       default=0,
                       type=int,
                       help="Skip the N first sequences")

    group.add_argument('--only',
                       action="store", dest="import:only",
                       metavar='<N>',
                       default=None,
                       type=int,
                       help="Treat only N sequences")

    # default=None (not False) so that "flag absent" stays distinguishable
    # from an explicit choice.
    group.add_argument('--skip-on-error',
                       action="store_true", dest="import:skiperror",
                       default=None,
                       help="Skip sequence entries with parse error")

    # --fasta and --fastq share one destination: the last one given wins.
    group.add_argument('--fasta',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fasta',
                       help="Input file is in fasta nucleic format (including obitools fasta extensions)")

    group.add_argument('--fastq',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fastq',
                       help="Input file is in sanger fastq nucleic format (standard fastq)")

    # --nuc and --prot share one destination as well.
    group.add_argument('--nuc',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='nuc',
                       help="Input file contains nucleic sequences")

    group.add_argument('--prot',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='pep',
                       help="Input file contains protein sequences")
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Handling of NA values. Check None. Specify in doc? None or NA? Possibility to specify in option?
|
||||||
|
# look in R read.table option to specify NA value
|
||||||
def run(config):
    """Import a fasta or fastq sequence file into a new view of a DMS.

    The function reads ``config['import']['filename']``, opens the DMS named
    by ``config['obi']['defaultdms']`` (creating it when it cannot be
    opened), creates a ``View_NUC_SEQS`` named ``config['import']['destview']``
    and writes one line per imported sequence.  ``config['import']['skip']``
    sequences are skipped first, and the import stops once
    ``config['import']['only']`` sequences have been written (no limit when
    it is ``None``).

    Each tag of a sequence is stored in its own column, created the first
    time the tag is met.  When a later value cannot be stored in the
    existing column, the column is rewritten with an updated data type,
    number of elements per line and/or element names.

    :param config: nested mapping holding the ``'import'`` and ``'obi'``
                   option sections.
    :raises RuntimeError: when ``config['import']['seqinformat']`` is
                          neither ``'fasta'`` nor ``'fastq'``.
    """
    cdef int i
    cdef type value_type
    cdef obitype_t value_obitype
    cdef obitype_t old_type
    cdef obitype_t new_type
    cdef bint get_quality
    cdef bint NUC_SEQS_view
    cdef int nb_elts
    cdef object d
    cdef View view
    cdef object iseq
    cdef object seq
    cdef object inputs
    cdef Column id_col
    cdef Column def_col
    cdef Column seq_col
    cdef Column qual_col
    cdef Column old_column
    cdef bint rewrite
    cdef dict dcols
    cdef int skipping
    cdef str tag
    cdef object value
    cdef list elt_names
    cdef int old_nb_elements_per_line
    cdef int new_nb_elements_per_line
    cdef list old_elements_names
    cdef list new_elements_names
    cdef ProgressBar pb
    global obi_errno

    pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file

    inputs = uopen(config['import']['filename'])

    # Create or open DMS.  Only regular errors trigger the creation of a new
    # DMS: KeyboardInterrupt and SystemExit must propagate.
    try:
        d = DMS.test_open(config['obi']['defaultdms'])
    except Exception:
        d = DMS.new(config['obi']['defaultdms'])

    # From here on the DMS is open: close it whatever happens.
    try:
        get_quality = False
        NUC_SEQS_view = False
        if config['import']['seqinformat'] == 'fasta':
            get_quality = False
            NUC_SEQS_view = True
            iseq = fastaIterator(inputs)
            view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
        elif config['import']['seqinformat'] == 'fastq':
            get_quality = True
            NUC_SEQS_view = True
            iseq = fastqIterator(inputs)
            view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
        else:
            raise RuntimeError('File format not handled')

        # Save basic columns in variables for optimization
        if NUC_SEQS_view:
            id_col = view["ID"]
            def_col = view["DEFINITION"]
            seq_col = view["NUC_SEQ"]
            if get_quality:
                qual_col = view["QUALITY"]

        # Maps a tag name to a (column, obitype of the column) pair.
        dcols = {}

        skipping = 0
        i = 0    # index of the next line to write in the view
        for seq in iseq:
            if skipping < config['import']['skip']:    # TODO not efficient because sequences are parsed
                skipping += 1
            elif i == config['import']['only']:
                break
            else:
                pb(i)
                if NUC_SEQS_view:
                    id_col[i] = seq['id']
                    def_col[i] = seq['definition']
                    seq_col[i] = seq['sequence']
                    if get_quality:
                        qual_col[i] = seq['quality']

                for tag in seq['tags']:

                    value = seq['tags'][tag]

                    if tag not in dcols:

                        # First occurrence of the tag: create its column.
                        value_type = type(value)

                        if value_type == dict or value_type == list:
                            nb_elts = len(value)
                            # NOTE(review): for a list this uses the values
                            # themselves as element names -- confirm intended.
                            elt_names = list(value)
                        else:
                            nb_elts = 1
                            elt_names = None

                        value_obitype = get_obitype(value)

                        if value_obitype != OBI_VOID:
                            dcols[tag] = (Column.new_column(view, tag, value_obitype,
                                                            nb_elements_per_line=nb_elts,
                                                            elements_names=elt_names),
                                          value_obitype)

                            # Fill value
                            dcols[tag][0][i] = value

                        # TODO else log error?

                    else:

                        rewrite = False

                        # Check type adequation
                        old_type = dcols[tag][1]
                        new_type = update_obitype(old_type, value)
                        if old_type != new_type:
                            rewrite = True

                        try:
                            # Fill value
                            dcols[tag][0][i] = value

                        except IndexError:
                            # Presumably raised when the value does not fit
                            # the current layout of the column -- TODO confirm.

                            value_type = type(value)
                            old_column = dcols[tag][0]
                            old_nb_elements_per_line = old_column.nb_elements_per_line
                            new_nb_elements_per_line = 0
                            old_elements_names = old_column.elements_names
                            new_elements_names = None

                            #####################################################################

                            # Check the length and keys of column lines if needed
                            if value_type == dict:    # Check dictionary keys
                                for k in value:
                                    if k not in old_elements_names:
                                        # NOTE(review): only the keys of the
                                        # current value are kept; confirm that
                                        # previously known keys may be dropped.
                                        new_elements_names = list(value)
                                        rewrite = True
                                        break

                            elif value_type == list or value_type == tuple:    # Check vector length
                                if old_nb_elements_per_line < len(value):
                                    new_nb_elements_per_line = len(value)
                                    rewrite = True

                            #####################################################################

                            # NOTE(review): when no rewrite is needed the value
                            # is not stored and the IndexError is dropped.
                            if rewrite:
                                if new_nb_elements_per_line == 0 and new_elements_names is not None:
                                    new_nb_elements_per_line = len(new_elements_names)

                                # Remember the type of the rewritten column
                                # (new_type), so that the next type check
                                # compares against the actual column type.
                                dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
                                                                                       new_data_type=new_type,
                                                                                       new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                       new_elements_names=new_elements_names),
                                              new_type)

                                # Reset obierrno
                                obi_errno = 0

                                # Fill value
                                dcols[tag][0][i] = value

                i += 1

        print("\n")
        print(view.__repr__())

    finally:
        d.close()
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user