Merge branch 'master' of git@git.metabarcoding.org:obitools/obitools3.git

Conflicts:
    python/obitools3/commands/import.pyx
python/obitools3/commands/import.pyx:

@@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
 from obitools3.parsers.fastq import fastqIterator
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
 from obitools3.dms.view.view cimport View
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS  # TODO cimport doesn't work
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 from obitools3.utils cimport tobytes, \
                              get_obitype, \

@@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
 from obitools3.dms.capi.obierrno cimport obi_errno
 
 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
 
 from obitools3.uri.decode import open_uri
 
 from obitools3.apps.config import logger

@@ -50,6 +49,8 @@ def addOptions(parser):
 
 def run(config):
 
+    cdef tuple input
+    cdef tuple output
     cdef int i
     cdef type value_type
     cdef obitype_t value_obitype

@@ -62,7 +63,6 @@ def run(config):
     cdef View view
     cdef object iseq
     cdef object seq
-    cdef object inputs
     cdef Column id_col
     cdef Column def_col
     cdef Column seq_col

@@ -71,7 +71,7 @@ def run(config):
     cdef bint rewrite
     cdef dict dcols
     cdef int skipping
-    cdef str tag
+    cdef bytes tag
     cdef object value
     cdef list elt_names
     cdef int old_nb_elements_per_line

@@ -84,163 +84,157 @@ def run(config):
 
     logger("info","obi import : imports file into an DMS")
 
-    inputs = open_uri(config['obi']['inputURI'])
+    input = open_uri(config['obi']['inputURI'])
 
-    if inputs[2]==Nuc_Seq:
+    if input[2]==Nuc_Seq:
         v = View_NUC_SEQS
     else:
-        v= View
+        v = View
 
     output = open_uri(config['obi']['outputURI'],
                       input=False,
                       newviewtype=v)
 
-    print(input)
-    print(output)
+    #print(input)
+    #print(output)
 
-    sys.exit()
+    pb = ProgressBar(1000000, config, seconde=5)  # TODO should be number of records in file
 
-    # pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
-    #
-    # inputs = uopen(config['import']['filename'])
-    #
-    # # Create or open DMS
-    # d = DMS.open_or_new(config['obi']['defaultdms'])
-    #
-    # get_quality = False
-    # NUC_SEQS_view = False
-    # if config['import']['seqinformat']=='fasta':
-    #     get_quality = False
-    #     NUC_SEQS_view = True
-    #     iseq = fastaIterator(inputs, skip=config['import']['skip'])
-    #     view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-    # elif config['import']['seqinformat']=='fastq':
-    #     get_quality = True
-    #     NUC_SEQS_view = True
-    #     iseq = fastqIterator(inputs, skip=config['import']['skip'])
-    #     view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-    # else:
-    #     raise RuntimeError('File format not handled')
-    #
-    # # Save basic columns in variables for optimization
-    # if NUC_SEQS_view :
-    #     id_col = view["ID"]
-    #     def_col = view["DEFINITION"]
-    #     seq_col = view["NUC_SEQ"]
-    #     if get_quality :
-    #         qual_col = view["QUALITY"]
-    #
-    # dcols = {}
-    #
-    # i = 0
-    # for seq in iseq :
-    #     if i == config['import']['only'] :
-    #         break
-    #     else :
-    #         pb(i)
-    #         if NUC_SEQS_view :
-    #             id_col[i] = seq['id']
-    #             def_col[i] = seq['definition']
-    #             seq_col[i] = seq['sequence']
-    #             if get_quality :
-    #                 qual_col[i] = seq['quality']
-    #
-    #         for tag in seq['tags'] :
-    #
-    #             value = seq['tags'][tag]
-    #
-    #             # Check NA value
-    #             if value == config['import']['NA'] :
-    #                 value = None
-    #
-    #             if tag not in dcols :
-    #
-    #                 value_type = type(value)
-    #                 nb_elts = 1
-    #                 value_obitype = OBI_VOID
-    #
-    #                 if value_type == dict or value_type == list :
-    #                     nb_elts = len(value)
-    #                     elt_names = list(value)
-    #                 else :
-    #                     nb_elts = 1
-    #                     elt_names = None
-    #
-    #                 value_obitype = get_obitype(value)
-    #
-    #                 if value_obitype != OBI_VOID :
-    #                     dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-    #
-    #                     # Fill value
-    #                     dcols[tag][0][i] = value
-    #
-    #                 # TODO else log error?
-    #
-    #             else :
-    #
-    #                 rewrite = False
-    #
-    #                 # Check type adequation
-    #                 old_type = dcols[tag][1]
-    #                 new_type = OBI_VOID
-    #                 new_type = update_obitype(old_type, value)
-    #                 if old_type != new_type :
-    #                     rewrite = True
-    #
-    #                 try:
-    #                     # Fill value
-    #                     dcols[tag][0][i] = value
-    #
-    #                 except IndexError :
-    #
-    #                     value_type = type(value)
-    #                     old_column = dcols[tag][0]
-    #                     old_nb_elements_per_line = old_column.nb_elements_per_line
-    #                     new_nb_elements_per_line = 0
-    #                     old_elements_names = old_column.elements_names
-    #                     new_elements_names = None
-    #
-    #                     #####################################################################
-    #
-    #                     # Check the length and keys of column lines if needed
-    #                     if value_type == dict : # Check dictionary keys
-    #                         for k in value :
-    #                             if k not in old_elements_names :
-    #                                 new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
-    #                                 rewrite = True
-    #                                 break
-    #
-    #                     elif value_type == list or value_type == tuple : # Check vector length
-    #                         if old_nb_elements_per_line < len(value) :
-    #                             new_nb_elements_per_line = len(value)
-    #                             rewrite = True
-    #
-    #                     #####################################################################
-    #
-    #                     if rewrite :
-    #                         if new_nb_elements_per_line == 0 and new_elements_names is not None :
-    #                             new_nb_elements_per_line = len(new_elements_names)
-    #
-    #                         # Reset obierrno
-    #                         obi_errno = 0
-    #
-    #                         dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
-    #                                                                                new_data_type=new_type,
-    #                                                                                new_nb_elements_per_line=new_nb_elements_per_line,
-    #                                                                                new_elements_names=new_elements_names),
-    #                                       value_obitype)
-    #
-    #                         # Update the dictionary:
-    #                         for t in dcols :
-    #                             dcols[t] = (view[t], dcols[t][1])
-    #
-    #                         # Fill value
-    #                         dcols[tag][0][i] = value
-    #
-    #     i+=1
-    #
-    # print("\n")
-    # print(view.__repr__())
-    #
-    # d.close()
+    iseq = input[1]
+
+    get_quality = False
+    NUC_SEQS_view = False
+    if isinstance(output[1], View) :
+        view = output[1]
+        if output[2] == View_NUC_SEQS :
+            NUC_SEQS_view = True
+            if "QUALITY" in view :  # TODO
+                get_quality = True
+    else:
+        raise NotImplementedError()
+
+    # Save basic columns in variables for optimization
+    if NUC_SEQS_view :
+        id_col = view[b"ID"]
+        def_col = view[b"DEFINITION"]
+        seq_col = view[b"NUC_SEQ"]
+        if get_quality :
+            qual_col = view[b"QUALITY"]
+
+    dcols = {}
+
+    i = 0
+    for seq in iseq :
+
+        pb(i)
+
+        if NUC_SEQS_view :
+            id_col[i] = seq.id
+            def_col[i] = seq.definition
+            seq_col[i] = seq.seq
+
+            if get_quality :
+                qual_col[i] = seq.quality
+
+        for tag in seq :
+
+            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm...
+
+                value = seq[tag]
+
+                # Check NA value
+                if value == config['obi']['nastring'] :
+                    value = None
+
+                if tag not in dcols :
+
+                    value_type = type(value)
+                    nb_elts = 1
+                    value_obitype = OBI_VOID
+
+                    if value_type == dict or value_type == list :
+                        nb_elts = len(value)
+                        elt_names = list(value)
+                    else :
+                        nb_elts = 1
+                        elt_names = None
+
+                    value_obitype = get_obitype(value)
+
+                    if value_obitype != OBI_VOID :
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    # TODO else log error?
+
+                else :
+
+                    rewrite = False
+
+                    # Check type adequation
+                    old_type = dcols[tag][1]
+                    new_type = OBI_VOID
+                    new_type = update_obitype(old_type, value)
+                    if old_type != new_type :
+                        rewrite = True
+
+                    try:
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    except IndexError :
+
+                        value_type = type(value)
+                        old_column = dcols[tag][0]
+                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        new_nb_elements_per_line = 0
+                        old_elements_names = old_column.elements_names
+                        new_elements_names = None
+
+                        #####################################################################
+
+                        # Check the length and keys of column lines if needed
+                        if value_type == dict :  # Check dictionary keys
+                            for k in value :
+                                if k not in old_elements_names :
+                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+                                    rewrite = True
+                                    break
+
+                        elif value_type == list or value_type == tuple :  # Check vector length
+                            if old_nb_elements_per_line < len(value) :
+                                new_nb_elements_per_line = len(value)
+                                rewrite = True
+
+                        #####################################################################
+
+                        if rewrite :
+                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+                                new_nb_elements_per_line = len(new_elements_names)
+
+                            # Reset obierrno
+                            obi_errno = 0
+
+                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
+                                                                                   new_data_type=new_type,
+                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+                                                                                   new_elements_names=new_elements_names),
+                                          value_obitype)
+
+                            # Update the dictionary:
+                            for t in dcols :
+                                dcols[t] = (view[t], dcols[t][1])
+
+                            # Fill value
+                            dcols[tag][0][i] = value
+
+        i+=1
+
+    print("\n")
+    print(view.__repr__())
+
+    input[0].close()  # TODO
+    output[0].close()
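Note: the new import loop above stores each record's tags in typed DMS columns that are created lazily. The first value seen for a tag fixes the column's OBI type and layout; when a later record does not fit (a wider type, a longer vector, new dict keys), the column is rewritten with broader attributes via rewrite_column_with_diff_attributes(). Below is a minimal pure-Python sketch of that create-then-rewrite pattern; ToyColumn and promote() are hypothetical stand-ins for Column.new_column() and update_obitype(), not the obitools3 API.

    # Illustrative sketch only: real columns are memory-mapped C structures.
    class ToyColumn:
        def __init__(self, value_type):
            self.value_type = value_type
            self.values = []

        def append(self, value):
            # A type mismatch plays the role of the IndexError caught above.
            if value is not None and not isinstance(value, self.value_type):
                raise TypeError
            self.values.append(value)

    def promote(old_type, new_type):
        # Simplified promotion rule: int widens to float, anything else to str.
        if {old_type, new_type} == {int, float}:
            return float
        return old_type if old_type is new_type else str

    columns = {}
    records = [{"count": 1}, {"count": 2.5}, {"count": "many"}]
    for rec in records:
        for tag, value in rec.items():
            if tag not in columns:
                columns[tag] = ToyColumn(type(value))  # first sight fixes the type
            try:
                columns[tag].append(value)
            except TypeError:
                wider = promote(columns[tag].value_type, type(value))
                old = columns[tag]
                columns[tag] = ToyColumn(wider)        # "rewrite" with wider type
                columns[tag].values = [wider(v) for v in old.values]
                columns[tag].append(wider(value))

    print(columns["count"].value_type, columns["count"].values)
    # <class 'str'> ['1.0', '2.5', 'many']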
python/obitools3/commands/uniq.pxd (new file, 8 lines)

@@ -0,0 +1,8 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+
+
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
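Note on the `=*` markers in the declaration above: in a Cython .pxd file, `=*` only records that an argument carries a default value; the actual default is spelled out once in the matching .pyx definition. A minimal example, using a hypothetical foo module:

    # foo.pxd: declaration; '=*' says "this argument has a default"
    cdef int scale(int x, int factor=*)

    # foo.pyx: the actual default value lives here
    cdef int scale(int x, int factor=2):
        return x * factor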
@@ -1,59 +1,29 @@
 #cython: language_level=3
 
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
-from obitools3.dms.view.view import View  # TODO cimport doesn't work
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
-from obitools3.dms.obiseq cimport Nuc_Seq
-from obitools3.dms.column.column cimport Column
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT
-from obitools3.utils cimport tostr
+from obitools3.dms.dms cimport DMS
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.view cimport View, Line
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+from obitools3.dms.column.column cimport Column, Column_line
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
+from obitools3.apps.config import logger
 
 # TODO silence non-implemented options
 
 __title__="Groups records together"
 
-default_config = { 'inputview' : None,
-                   'outputview' : None
-                 }
 
 def addOptions(parser):
 
-    # TODO put this common group somewhere else but I don't know where
-    group=parser.add_argument_group('DMS and view options')
-
-    group.add_argument('--default-dms','-d',
-                       action="store", dest="obi:defaultdms",
-                       metavar='<DMS NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the default DMS for reading and writing data.")
-
-    group.add_argument('--input-view','-i',
-                       action="store", dest="obi:inputview",
-                       metavar='<INPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the input view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--output-view','-o',
-                       action="store", dest="obi:outputview",
-                       metavar='<OUTPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the output view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--taxo','-t',
-                       action="store", dest="obi:taxo",
-                       metavar='<TAXONOMY NAME>',
-                       default='',  # TODO not None because if it's None, the option is not entered in the option dictionary.
-                       type=str,
-                       help="Name of the taxonomy to use.")
-
+    addSequenceInputOption(parser)
+    addMinimalOutputOption(parser)
 
     group = parser.add_argument_group('obi uniq specific options')
 

@@ -89,152 +59,198 @@ def addOptions(parser):
     # TODO taxonomy
 
 
-# TODO
-COUNT_COLUMN_str = tostr(COUNT_COLUMN)
-
-
-def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
 
+    cdef int i
+    cdef int o_idx
+    cdef int u_idx
+    cdef int u_id
+    cdef int i_count
+    cdef set mergedKeys
+    cdef dict uniques
+    cdef dict merged_infos
+    cdef object iter_view
+    cdef Line i_seq
+    cdef Line o_seq
+    cdef str key
+    cdef bytes key_b
+    cdef str mkey
+    cdef str merged_col_name
+    cdef Column i_col
+    cdef Column seq_col
+    cdef object to_merge
+    cdef Column_line mcol
+    cdef Column_line i_mcol
+
     uniques = {}
 
     if categories is None:
         categories=[]
 
-    if mergedKey is not None:
-        mergedKey=set(mergedKey)
+    if mergedKeys_list is not None:
+        mergedKeys=set(mergedKeys_list)
     else:
-        mergedKey=set()
+        mergedKeys=set()
 
-    if taxonomy is not None:
-        mergedKey.add('taxid')
+    # if taxonomy is not None:
+    #     mergedKeys.add('taxid')
 
-    # Walk through the view's columns to merge, so the merged columns are created first and filled as we go
-    o_idx = 0
+    # Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and element names
+    #logger("info", "obi uniq", "First browsing through the input")
+    merged_infos = {}
     i = 0
-    seq_col = view[NUC_SEQUENCE_COLUMN]
 
     iter_view = iter(view)
-    for i_seq in iter_view :
-        pass
+    for i_seq in iter_view:
+        pb(i)
+        for key in mergedKeys:
+            mkey = "merged_%s" % key
+            if key in i_seq:  # TODO what if mkey already in i_seq?
+                if mkey not in merged_infos:
+                    merged_infos[mkey] = {}
+                    mkey_infos = merged_infos[mkey]
+                    mkey_infos['nb_elts'] = 1
+                    mkey_infos['elt_names'] = [i_seq[key]]
+                else:
+                    mkey_infos = merged_infos[mkey]
+                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
+                        mkey_infos['elt_names'].append(i_seq[key])
+                        mkey_infos['nb_elts'] += 1
+        i+=1
+
+    for key in mergedKeys:
+        merged_col_name = "merged_%s" % key
+        i_col = view[key]
+        Column.new_column(o_view,
+                          merged_col_name,
+                          OBI_INT,
+                          nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
+                          elements_names=merged_infos[merged_col_name]['elt_names'],
+                          comments=i_col.comments,
+                          alias=merged_col_name  # TODO what if it already exists
+                         )
+
+    del(merged_infos)
+
+    #logger("info", "obi uniq", "Second browsing through the input")
+    i = 0
+    o_idx = 0
+    seq_col = view[NUC_SEQUENCE_COLUMN]
 
     iter_view = iter(view)
     for i_seq in iter_view :
         pb(i)
-        # use the AVL indices, build the API
-        #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
+
+        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
         u_id = seq_col.get_line_idx(i)
 
         if u_id in uniques:
-            u_seq = uniques[u_id]
-            o_seq = o_view[u_seq['idx']]
-            if COUNT_COLUMN_str in i_seq:
-                o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
+            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
+                i_count = 1
             else:
-                o_seq[COUNT_COLUMN_str] += 1
-                # seq['COUNT']=1
+                i_count = i_seq[COUNT_COLUMN]
+
+            u_idx = uniques[u_id]
+            o_seq = o_view[u_idx]
+            o_seq[COUNT_COLUMN] += i_count
 
             # if taxonomy is not None and 'taxid' in seq:
             #     s['merged_taxid'][seq['taxid']]=
-            for key in mergedKey:
-                if key=='taxid' and mergeIds: # TODO
-                    if 'taxid_dist' in i_seq:
-                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
-                    if 'taxid' in i_seq:
-                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+            for key in mergedKeys:
+                # if key=='taxid' and mergeIds: # TODO
+                #     if 'taxid_dist' in i_seq:
+                #         u_seq["taxid_dist"].update(i_seq["taxid_dist"])
+                #     if 'taxid' in i_seq:
+                #         u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
 
                 mkey = "merged_%s" % key
                 # case where the merged_keys are updated but the incoming sequence has no merged_keys
                 if key in i_seq:
-                    u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
+                    to_merge = i_seq[key]
+                    mcol = o_seq[mkey]
+                    if mcol[to_merge] is None:
+                        mcol[to_merge] = i_count
+                    else:
+                        mcol[to_merge] = mcol[to_merge] + i_count
                 # case where the merged_keys already exist
-                else:
+                else:  # TODO is this a good else
                     if mkey in i_seq:
-                        for skey in i_seq[mkey]:
-                            u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
+                        mcol = o_seq[mkey]
+                        i_mcol = i_seq[mkey]
+                        for key_b in i_mcol:
+                            if mcol[key_b] is None:
+                                mcol[key_b] = i_mcol[key_b]
+                            else:
+                                mcol[key_b] = mcol[key_b] + i_mcol[key_b]
 
-            for key in i_seq.keys():
+            for key_b in i_seq.keys():
                 # Properly merge the merged attribute if it exists
-                if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str :  #and key[0:7]!='merged_' and key!='merged': TODO check this
-                    o_seq[key] = None
+                # TODO make special columns list?
+                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
+                    o_seq[key_b] = None
 
-            if mergeIds:
-                u_seq['merged'].append(i_seq.id)
-        else:
+            # if mergeIds:  # TODO
+            #     u_seq['merged'].append(i_seq.id)
+
+        else:
             o_view[o_idx] = i_seq
             o_seq = o_view[o_idx]
-            uniques[u_id] = {'idx':o_idx}
-            u_seq = uniques[u_id]
+            uniques[u_id] = o_idx
             o_idx += 1
 
-            if COUNT_COLUMN_str not in o_seq:
-                o_seq[COUNT_COLUMN_str] = 1
+            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
+                o_seq[COUNT_COLUMN] = 1
 
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:
-                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
-                        u_seq["taxid_dist"] = {}
-                    else :
-                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
-                    if 'taxid' in o_seq:
-                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
+            for key in mergedKeys:
+                # if key=='taxid' and mergeIds:
+                #     if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+                #         u_seq["taxid_dist"] = {}
+                #     else :
+                #         u_seq["taxid_dist"] = o_seq["taxid_dist"]
+                #     if 'taxid' in o_seq:
+                #         u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
                 mkey = "merged_%s" % key
-                if mkey not in o_seq:
-                    u_seq[mkey]={}
-                else :
-                    u_seq[mkey] = o_seq[mkey]
                 if key in o_seq:
-                    u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
-                    o_seq[key] = None
+                    to_merge = o_seq[key]
+                    mcol = o_seq[mkey]
+                    if to_merge in mcol and mcol[to_merge] is not None:
+                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
+                    else:
+                        mcol[to_merge] = o_seq[COUNT_COLUMN]
+                    o_seq[key] = None  # TODO delete column eventually -> make C function?
 
-                if mergeIds:
-                    u_seq['merged']=[o_seq.id]
+                # if mergeIds:
+                #     u_seq['merged']=[o_seq.id]
 
         i+=1
 
     #TODO
     #if taxonomy is not None:
    #    mergeTaxonomyClassification(uniqSeq, taxonomy)
 
-    # Get informations to build the columns with merged attributes
-    merged_infos = {}
-    for u_id in uniques :
-        u_seq = uniques[u_id]
-        for mkey in u_seq :
-            if mkey != 'idx' :
-                mkey_dict = u_seq[mkey]
-                if mkey not in merged_infos :
-                    merged_infos[mkey] = {}
-                    mkey_infos = merged_infos[mkey]
-                    mkey_infos['nb_elts'] = len(mkey_dict.keys())
-                    mkey_infos['elt_names'] = [k for k in mkey_dict]
-                else :
-                    mkey_infos = merged_infos[mkey]
-                    for k in mkey_dict :
-                        if k not in mkey_infos['elt_names'] :
-                            mkey_infos['elt_names'].append(k)
-                            mkey_infos['nb_elts'] += 1
-
-    keys_to_del = []
-    for k in merged_infos :
-        if merged_infos[k]['nb_elts'] == 0:
-            keys_to_del.append(k)
-    for k in keys_to_del :
-        del merged_infos[k]
-
-    return (uniques, merged_infos)
-
 
 def run(config):
 
-    # TODO declare variables
+    cdef tuple input
+    cdef tuple output
+    cdef View_NUC_SEQS entries
+    cdef View_NUC_SEQS o_view
+    cdef ProgressBar pb
+
+    logger("info","obi uniq")
 
-    # Open DMS
-    d = DMS.open(config['obi']['defaultdms'])
+    input = open_uri(config['obi']['inputURI'])
 
-    # Open input view
-    entries = View.open(d, config['obi']['inputview'])
+    if input[2] != View_NUC_SEQS:
+        raise NotImplementedError('obi uniq only works on NUC_SEQS views')
+
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      newviewtype=View_NUC_SEQS)
+
+    entries = input[1]
+    o_view = output[1]
 
     # Initialize the progress bar
     pb = ProgressBar(len(entries), config, seconde=5)

@@ -243,55 +259,20 @@ def run(config):
     # usm = uniqPrefixSequence
     # else:
     usm = uniqSequence
 
-    # Create output view
-    view_class = View.get_view_class(entries.type)
-    if view_class == View_NUC_SEQS :
-        get_quality = tostr(QUALITY_COLUMN) in entries  # TODO
-        o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
-    else :
-        o_view = view_class.new(d, config['obi']['outputview'])
-
-    (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
+    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
 
-    if 'merge' in config['uniq'] :
-        merged_keys=set(config['uniq']['merge'])
-    else:
-        merged_keys=set()
+    # if 'merge' in config['uniq'] :
+    #     merged_keys=set(config['uniq']['merge'])
+    # else:
+    #     merged_keys=set()
+    #
+    # if 'taxo' in config['obi'] :
+    #     merged_keys.add('taxid')
 
-    if 'taxo' in config['obi'] :
-        merged_keys.add('taxid')
-
-    # TODO gotta handle special merged columns
-    for k in merged_keys:
-        merged_col_name = "merged_%s" % k
-        if merged_col_name in merged_infos :
-            i_col = entries[k]
-            Column.new_column(o_view,
-                              merged_col_name,
-                              OBI_INT,
-                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
-                              elements_names=merged_infos[merged_col_name]['elt_names'],
-                              comments=i_col.comments,
-                              alias=merged_col_name
-                             )
-
-    for u_id in uniques:
-        u_dict = uniques[u_id]
-        for merged_k in u_dict :
-            if merged_k in merged_infos :  # TODO don't enter irrelevant keys to begin with, instead
-                o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
-
     print("\n")
     print(repr(o_view))
 
-    d.close()
+    input[0].close()
+    output[0].close()
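Note: the rewritten uniqSequence now makes two passes over the input view. The first pass only collects, for each merged key, the set of distinct values, which is needed up front to size the merged_* columns (elements per line and element names); the second pass folds each duplicate sequence into its first occurrence, summing COUNT and accumulating per-value counts directly in the merged columns instead of in Python dicts. A minimal pure-Python sketch of the same logic, with plain dicts standing in for views and columns (illustrative only, not the obitools3 API):

    records = [
        {"seq": "AAA", "COUNT": 2, "sample": "A"},
        {"seq": "CCC", "COUNT": 1, "sample": "A"},
        {"seq": "AAA", "COUNT": 1, "sample": "B"},
    ]
    merged_keys = ["sample"]

    # First pass: distinct values per merged key (sizes the merged_* columns).
    merged_infos = {k: sorted({r[k] for r in records if k in r}) for k in merged_keys}

    # Second pass: fold duplicates into the first occurrence of each sequence.
    uniques = {}
    output = []
    for rec in records:
        count = rec.get("COUNT") or 1
        if rec["seq"] in uniques:
            o_rec = output[uniques[rec["seq"]]]
            o_rec["COUNT"] += count
            for k in merged_keys:
                mkey = "merged_%s" % k
                o_rec[mkey][rec[k]] = o_rec[mkey].get(rec[k], 0) + count
        else:
            uniques[rec["seq"]] = len(output)
            o_rec = {"seq": rec["seq"], "COUNT": count}
            for k in merged_keys:
                o_rec["merged_%s" % k] = {rec[k]: count}
            output.append(o_rec)

    print(merged_infos)  # {'sample': ['A', 'B']}
    print(output)
    # [{'seq': 'AAA', 'COUNT': 3, 'merged_sample': {'A': 2, 'B': 1}},
    #  {'seq': 'CCC', 'COUNT': 1, 'merged_sample': {'A': 1}}]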
@@ -371,8 +371,18 @@ cdef class Column_line :
         self._column.set_item(self._index, elt_id, value)
 
 
-    def __contains__(self, object element_name):
-        return (tobytes(element_name) in self._column.elements_names)
+    def get(self, object elt_id, object default=None):  # TODO returns default if None???
+        if elt_id in self:
+            return self._column.get_item(self._index, elt_id)
+        else:
+            return default
+
+
+    def __contains__(self, object elt_id):
+        if type(elt_id) == int:
+            return elt_id < self._column.nb_elements_per_line
+        else:
+            return (tobytes(elt_id) in self._column.elements_names)
 
 
     def __repr__(self) :
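Note: __contains__ now accepts either an integer element index (checked against nb_elements_per_line) or an element name (normalized with tobytes and looked up among elements_names), and get() mirrors dict.get on top of it. A toy emulation of the new lookup semantics (not the real Column_line, which wraps a memory-mapped C column):

    class ToyLine:
        def __init__(self, names, values):
            self.names = names        # e.g. [b"A", b"B"]
            self.values = values

        def __contains__(self, elt_id):
            if type(elt_id) == int:
                return elt_id < len(self.names)
            return elt_id in self.names

        def get(self, elt_id, default=None):
            if elt_id in self:
                i = elt_id if type(elt_id) == int else self.names.index(elt_id)
                return self.values[i]
            return default

    line = ToyLine([b"A", b"B"], [1, 2])
    print(b"A" in line, 1 in line, line.get(b"C", 0))   # True True 0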
@@ -21,7 +21,7 @@ cdef class Seq(dict) :
         if tags is not None :
             for k in tags:
                 k_b = tobytes(k)
-                self[k_b] = tags[k_b]
+                self[k_b] = tags[k]
 
 
     def __contains__(self, object key):

@@ -70,11 +70,10 @@ cdef class Nuc_Seq(Seq) :
     # nuc sequence property getter and setter
     @property
     def seq(self):
-        return self._seq
+        return self[NUC_SEQUENCE_COLUMN]
 
     @seq.setter
     def seq(self, object new_seq):  # @DuplicatedSignature
-        self._seq = new_seq
         self[NUC_SEQUENCE_COLUMN] = tobytes(new_seq)
 
     # sequence quality property getter and setter
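Note: the one-character fix in Seq.__init__ matters whenever the caller passes a str-keyed tags dict: the loop variable k is the original key, while k_b is its bytes-normalized form used for storage, so indexing the source dict with k_b raised KeyError. Assuming str keys from the caller:

    tags = {"sample": "A1"}
    normalized = {}
    for k in tags:
        k_b = k.encode() if isinstance(k, str) else k   # what tobytes() does
        normalized[k_b] = tags[k]      # tags[k_b] would raise KeyError here
    print(normalized)                  # {b'sample': 'A1'}

The Nuc_Seq change is the same normalization idea applied to the sequence itself: the object no longer caches _seq separately but stores and reads the sequence under NUC_SEQUENCE_COLUMN, so getter and setter can never disagree.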
@@ -58,12 +58,25 @@ cdef class View_NUC_SEQS(View):
         return view
 
 
-    # TODO
+    # TODO test time gain without
+    @OBIWrapper.checkIsActive
     def __getitem__(self, object item) :
         if type(item) == int :
             return Nuc_Seq_Stored(self, item)
         else :  # TODO assume str or bytes for optimization?
             return self.get_column(item)  # TODO very slow in practice
 
 
+    @OBIWrapper.checkIsActive
+    def __iter__(self):
+        # Iteration on each line of all columns
+
+        # Declarations
+        cdef index_t line_nb
+
+        # Yield each line
+        for line_nb in range(self.line_count) :
+            yield Nuc_Seq_Stored(self, line_nb)
+
+
     # TODO? test if efficiency gain
@@ -6,7 +6,9 @@ Created on 30 mars 2016
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+import types
+
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 
 def fastaIterator(lineiterator,

@@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
 
     while True:
 
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :

@@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
         #                definition,
         #                tags=tags,
         #               )
-
+        # TODO
         yield { "id" : ident,
                 "definition" : definition,
                 "sequence" : sequence,

@@ -105,65 +107,65 @@ def fastaNucIterator(lineiterator,
     cdef list s
     cdef bytes sequence
     cdef int lines_to_skip, ionly, read
-    # cdef OBI_Seq seq
+    cdef Nuc_Seq seq
 
     if only is None:
-        ionly=-1
+        ionly = -1
     else:
-        ionly=int(only)
+        ionly = int(only)
 
-    if isinstance(lineiterator,(str,bytes)):
+    if isinstance(lineiterator, (str, bytes)):
         lineiterator=uopen(lineiterator)
 
+    if isinstance(lineiterator, types.GeneratorType):
+        iterator = lineiterator
     if isinstance(lineiterator, LineBuffer):
-        lb=lineiterator
+        iterator = iter(lineiterator)
     else:
-        lb=LineBuffer(lineiterator,buffersize)
+        iterator = iter(LineBuffer(lineiterator, buffersize))
 
     skipped = 0
     read = 0
-    i = iter(lb)
 
     if firstline is None:
-        line = next(i)
+        line = next(iterator)
     else:
         line = firstline
 
     while True:
 
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :
-            line = next(i)
+            line = next(iterator)
             try:
                 while line[0]!='>':
-                    line = next(i)
+                    line = next(iterator)
            except StopIteration:
                pass
            skipped += 1
 
        ident,tags,definition = parseHeader(line)
        s = []
-        line = next(i)
+        line = next(iterator)
 
        try:
            while line[0]!='>':
                s.append(str2bytes(line)[0:-1])
-                line = next(i)
+                line = next(iterator)
        except StopIteration:
            pass
 
        sequence = b"".join(s)
 
-        # seq =
        seq = Nuc_Seq(ident,
                      sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
        yield seq
 
        # yield { "id" : ident,
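Note: the ionly guard fix matters because only=None is encoded as -1. With the old test, read >= ionly was already true on the very first iteration, so the iterator yielded nothing unless a positive limit was given; the new form reserves negative values to mean "no limit". A two-line check:

    read, ionly = 0, -1
    print(read >= ionly)                  # True: old guard exits before the first record
    print(ionly >= 0 and read >= ionly)   # False: -1 now means "no limit"

One caveat in the new input dispatch: the generator branch assigns iterator and then falls into the following if/else, whose else re-wraps the generator in a LineBuffer; presumably the second test was meant to be an elif for the generator case to survive.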
@@ -6,7 +6,7 @@ Created on 30 mars 2016
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 
 def fastqIterator(lineiterator,

@@ -74,12 +74,11 @@ def fastqWithQualityIterator(lineiterator,
     else:
         hline = firstline
 
-
     for line in i:
 
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)

@@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      quality,qualityoffset,
-                      tags)
+                      definition=definition,
+                      quality=quality,
+                      offset=qualityoffset,
+                      tags=tags)
 
         yield seq
 

@@ -149,22 +149,23 @@ def fastqWithoutQualityIterator(lineiterator,
         hline = next(i)
     else:
         hline = firstline
 
     for line in i:
 
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
         next(i)
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
         yield seq
 
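Note: passing definition, quality, offset, and tags to Nuc_Seq by keyword removes the dependence on positional order that previously packed quality and qualityoffset into adjacent slots. The FASTA and FASTQ call sites now read identically, carry the same ionly fix, and cannot silently misbind arguments if the constructor signature grows.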
@@ -41,11 +41,11 @@ def entryIteratorFactory(lineiterator,
     if isinstance(lineiterator, LineBuffer):
         lb=lineiterator
     else:
-        lb=LineBuffer(lineiterator,buffersize)
+        lb=LineBuffer(lineiterator, buffersize)
 
     i = iter(lb)
 
     first=next(i)
 
     format=b"tabular"
 

@@ -61,26 +61,29 @@ def entryIteratorFactory(lineiterator,
         format=b"ecopcrfile"
     elif is_ngsfilter_line(first):
         format=b"ngsfilter"
 
+    # TODO Temporary fix
+    first=None
+    lineiterator.seek(0)
 
     if format==b'fasta':
         if seqtype == b'nuc':
             return (fastaNucIterator(lineiterator,
-                                     skip,only,
-                                     first),
+                                     skip=skip,only=only,
+                                     firstline=first,
+                                     buffersize=buffersize),
                     Nuc_Seq)
         else:
             raise NotImplementedError()
     elif format==b'fastq':
         return (fastqIterator(lineiterator,
-                              skip,only,
-                              qualityoffset,
-                              first),
+                              skip=skip,only=only,
+                              qualityoffset=qualityoffset,
+                              noquality=noquality,
+                              firstline=first,
+                              buffersize=buffersize),
                 Nuc_Seq)
 
 
     raise NotImplementedError('File format not yet implemented')
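Note: the "temporary fix" after format sniffing rewinds the underlying stream with lineiterator.seek(0) and discards the inspected first line instead of threading it through as firstline. That keeps the parser call sites simple, but it only works for seekable inputs; pipes and compressed streams have no seek(0), which is presumably why the comment flags it as temporary.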
@@ -364,19 +364,22 @@ def open_uri(uri,
         if qualifiers[b"seqtype"]==b"nuc":
             objclass = Nuc_Seq
             if format==b"fasta":
-                iseq = fastaNucIterator(file,skip,only)
+                iseq = fastaNucIterator(file,
+                                        skip=skip,
+                                        only=only)
             elif format==b"fastq":
                 iseq = fastqIterator(file,
-                                     skip,only,
-                                     offset,
-                                     noquality)
+                                     skip=skip,
+                                     only=only,
+                                     offset=offset,
+                                     noquality=noquality)
             else:
                 raise NotImplementedError('Sequence file format not implemented')
         elif qualifiers[b"seqtype"]==b"prot":
             raise NotImplementedError()
         else:
             iseq,objclass = entryIteratorFactory(file,
-                                                 skip,only,
+                                                 skip, only,
                                                  seqtype,
                                                  offset,
                                                  noquality,

@@ -388,13 +391,12 @@ def open_uri(uri,
                                                  stripwhite,
                                                  blanklineskip,
                                                  commentchar)
 
-    tmpdms = get_temp_dms()
+    #tmpdms = get_temp_dms()
 
-    return (file,iseq,objclass,urib)
+    return (file, iseq, objclass, urib)
 
@@ -136,7 +136,7 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
 
 /** ***************************************************************************
  * Check if the given element is in the bloom filter. Remember this may
- * return false positive if a collision occured.
+ * return false positive if a collision occurred.
  *
  * Parameters:
  * -----------
@@ -2463,7 +2463,7 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Check if the AVL group is writable
 	if (!(avl_group->writable))
 	{
-		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);
+		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);	// Note: this error is read by the calling functions to clone the AVL group if needed
 		return -1;
 	}
 

@@ -2476,6 +2476,9 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 
 	// Add in the current AVL
 	index_in_avl = (int32_t) obi_avl_add((avl_group->sub_avls)[avl_group->last_avl_idx], value);
+	if (index_in_avl < 0)
+		return -1;
+
 	bloom_add(&((((avl_group->sub_avls)[avl_group->last_avl_idx])->header)->bloom_filter), value, obi_blob_sizeof(value));
 
 	// Build the index containing the AVL index
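Note: the added guard propagates a failed obi_avl_add() immediately instead of feeding a negative index into bloom_add() and the index arithmetic that follows. Together with the new comment on obi_set_errno(), documenting that callers watch for OBI_READ_ONLY_INDEXER_ERROR to decide when to clone a read-only AVL group, both ends of the error path are now explicit.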
@@ -32,7 +32,7 @@ Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_
 	Obi_blob_p blob;
 
 	// Allocate the memory for the blob structure
-	blob = (Obi_blob_p) malloc(sizeof(Obi_blob_t) + length_encoded_value);
+	blob = (Obi_blob_p) calloc(sizeof(Obi_blob_t) + length_encoded_value, sizeof(byte_t));
 	if (blob == NULL)
 	{
 		obi_set_errno(OBI_MALLOC_ERROR);
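Note: switching from malloc() to calloc() zero-fills the blob, including any trailing bytes the writer never touches. A plausible motivation, given how blobs are indexed above, is that blobs are compared and hashed byte-for-byte (AVL lookup, bloom filter), where uninitialized padding would make two equal values look different; calloc makes the byte image deterministic.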
@@ -240,7 +240,6 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	char*	directory_name;
 	DIR*	dms_dir;
 	int		dms_file_descriptor;
-	size_t	i, j;
 
 	// Build and check the directory name
 	directory_name = build_directory_name(dms_path);

@@ -318,7 +317,7 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	*/
 
 	// Create the informations file
-	if (create_dms_infos_file(dms_file_descriptor, basename(dms_path)) < 0)
+	if (create_dms_infos_file(dms_file_descriptor, basename((char*)dms_path)) < 0)
 		return NULL;
 
 	return obi_open_dms(dms_path);

@@ -333,7 +332,6 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 	int		infos_file_descriptor;
 	bool	little_endian_dms;
 	bool	little_endian_platform;
-	size_t	i, j;
 
 	dms = NULL;
 

@@ -356,7 +354,7 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 		i++;
 	}
 	*/
-	strcpy(dms->dms_name, basename(dms_path));
+	strcpy(dms->dms_name, basename((char*)dms_path));
 
 	// Build and check the directory name including the relative path
 	complete_dms_path = build_directory_name(dms_path);
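Note: the basename((char*)dms_path) casts are needed because POSIX declares basename() as taking a non-const char* (and allows it to modify the buffer), while dms_path is a const char*. The cast silences the const-qualification warning but keeps the usual caveat that basename() may write into the string. The unused size_t i, j; declarations dropped from both functions are leftovers of the commented-out name-copying loops visible above.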
@@ -703,7 +703,7 @@ static int get_formatted_elt_names_length(const char* elements_names)
 
 static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line)
 {
-	return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line);
+	return getpagesize() / obi_sizeof(data_type) / nb_elements_per_line;
 }
 
 

@@ -919,6 +919,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 
 	// The initial line count should be between the minimum (corresponding to the page size) and the maximum allowed
 	minimum_line_count = get_line_count_per_page(stored_data_type, nb_elements_per_line);
+	if (minimum_line_count == 0)	// Happens if high number of elements per line
+		minimum_line_count = 1;
 	if (nb_lines > MAXIMUM_LINE_COUNT)
 	{
 		obidebug(1, "\nCan't create column because of line count greater than the maximum allowed (%d)", MAXIMUM_LINE_COUNT);

@@ -1023,7 +1025,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	if (new_column->data == MAP_FAILED)
 	{
 		obi_set_errno(OBICOL_UNKNOWN_ERROR);
-		obidebug(1, "\nError mmapping the data of a column");
+		obidebug(1, "\nError mmapping the data of a column.\nArguments: data_size=%lu, column_file_descriptor=%d, header_size=%lu",
+				 data_size, column_file_descriptor, header_size);
 		munmap(new_column->header, header_size);
 		close(column_file_descriptor);
 		free(new_column);
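Note: in get_line_count_per_page(), dividing twice instead of dividing by the product gives the same floor for positive integers (floor(floor(x/a)/b) == floor(x/(a*b))) but never computes obi_sizeof(data_type) * nb_elements_per_line, which can get large for lines with many elements. Either way the quotient drops to 0 once a single line no longer fits in one page, which is exactly the case the new minimum_line_count guard promotes to 1. A quick check of the zero case, in Python for brevity:

    x, a, b = 4096, 8, 1000          # page size, element size, elements per line
    print(x // a // b, x // (a * b)) # 0 0 -> the guard bumps this to 1

The richer obidebug() message in the mmap failure path serves the same diagnostic goal: it records the data size, file descriptor, and header size that produced the failure.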