Merge branch 'master' of git@git.metabarcoding.org:obitools/obitools3.git

Conflicts:
	python/obitools3/commands/import.pyx
2017-09-05 08:59:45 +02:00
15 changed files with 450 additions and 433 deletions

View File

@@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.utils cimport tobytes, \
get_obitype, \
@@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
@@ -50,6 +49,8 @@ def addOptions(parser):
def run(config):
cdef tuple input
cdef tuple output
cdef int i
cdef type value_type
cdef obitype_t value_obitype
@@ -62,7 +63,6 @@ def run(config):
cdef View view
cdef object iseq
cdef object seq
cdef object inputs
cdef Column id_col
cdef Column def_col
cdef Column seq_col
@@ -71,7 +71,7 @@ def run(config):
cdef bint rewrite
cdef dict dcols
cdef int skipping
cdef str tag
cdef bytes tag
cdef object value
cdef list elt_names
cdef int old_nb_elements_per_line
@@ -84,163 +84,157 @@ def run(config):
logger("info","obi import : imports file into an DMS")
inputs = open_uri(config['obi']['inputURI'])
input = open_uri(config['obi']['inputURI'])
if inputs[2]==Nuc_Seq:
if input[2]==Nuc_Seq:
v = View_NUC_SEQS
else:
v= View
v = View
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=v)
print(input)
print(output)
#print(input)
#print(output)
sys.exit()
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
#
# inputs = uopen(config['import']['filename'])
#
# # Create or open DMS
# d = DMS.open_or_new(config['obi']['defaultdms'])
#
# get_quality = False
# NUC_SEQS_view = False
# if config['import']['seqinformat']=='fasta':
# get_quality = False
# NUC_SEQS_view = True
# iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# elif config['import']['seqinformat']=='fastq':
# get_quality = True
# NUC_SEQS_view = True
# iseq = fastqIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# else:
# raise RuntimeError('File format not handled')
#
# # Save basic columns in variables for optimization
# if NUC_SEQS_view :
# id_col = view["ID"]
# def_col = view["DEFINITION"]
# seq_col = view["NUC_SEQ"]
# if get_quality :
# qual_col = view["QUALITY"]
#
# dcols = {}
#
# i = 0
# for seq in iseq :
# if i == config['import']['only'] :
# break
# else :
# pb(i)
# if NUC_SEQS_view :
# id_col[i] = seq['id']
# def_col[i] = seq['definition']
# seq_col[i] = seq['sequence']
# if get_quality :
# qual_col[i] = seq['quality']
#
# for tag in seq['tags'] :
#
# value = seq['tags'][tag]
#
# # Check NA value
# if value == config['import']['NA'] :
# value = None
#
# if tag not in dcols :
#
# value_type = type(value)
# nb_elts = 1
# value_obitype = OBI_VOID
#
# if value_type == dict or value_type == list :
# nb_elts = len(value)
# elt_names = list(value)
# else :
# nb_elts = 1
# elt_names = None
#
# value_obitype = get_obitype(value)
#
# if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
#
# # Fill value
# dcols[tag][0][i] = value
#
# # TODO else log error?
#
# else :
#
# rewrite = False
#
# # Check type adequation
# old_type = dcols[tag][1]
# new_type = OBI_VOID
# new_type = update_obitype(old_type, value)
# if old_type != new_type :
# rewrite = True
#
# try:
# # Fill value
# dcols[tag][0][i] = value
#
# except IndexError :
#
# value_type = type(value)
# old_column = dcols[tag][0]
# old_nb_elements_per_line = old_column.nb_elements_per_line
# new_nb_elements_per_line = 0
# old_elements_names = old_column.elements_names
# new_elements_names = None
#
# #####################################################################
#
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys
# for k in value :
# if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# rewrite = True
# break
#
# elif value_type == list or value_type == tuple : # Check vector length
# if old_nb_elements_per_line < len(value) :
# new_nb_elements_per_line = len(value)
# rewrite = True
#
# #####################################################################
#
# if rewrite :
# if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(new_elements_names)
#
# # Reset obierrno
# obi_errno = 0
#
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# new_data_type=new_type,
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# value_obitype)
#
# # Update the dictionary:
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1])
#
# # Fill value
# dcols[tag][0][i] = value
#
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()
iseq = input[1]
get_quality = False
NUC_SEQS_view = False
if isinstance(output[1], View) :
view = output[1]
if output[2] == View_NUC_SEQS :
NUC_SEQS_view = True
if "QUALITY" in view : # TODO
get_quality = True
else:
raise NotImplementedError()
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view[b"ID"]
def_col = view[b"DEFINITION"]
seq_col = view[b"NUC_SEQ"]
if get_quality :
qual_col = view[b"QUALITY"]
dcols = {}
i = 0
for seq in iseq :
pb(i)
if NUC_SEQS_view :
id_col[i] = seq.id
def_col[i] = seq.definition
seq_col[i] = seq.seq
if get_quality :
qual_col[i] = seq.quality
for tag in seq :
if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
value = seq[tag]
# Check NA value
if value == config['obi']['nastring'] :
value = None
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Fill value
dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names),
value_obitype)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
i+=1
print("\n")
print(view.__repr__())
input[0].close() # TODO
output[0].close()
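The import loop above infers an OBIType for each tag with get_obitype(), widens it with update_obitype() when later values disagree, and rewrites the column when a value no longer fits its type or width. A minimal sketch of that widening pattern in plain Python (widen, store and columns are illustrative names, not the obitools3 API):

def widen(old_type, value):
    # stand-in for update_obitype(): int widens to float, anything else to str
    new_type = type(value)
    if old_type is new_type:
        return old_type
    if {old_type, new_type} == {int, float}:
        return float
    return str

columns = {}  # tag -> (list of stored values, current type)

def store(tag, i, value):
    if tag not in columns:
        columns[tag] = ([], type(value))
    values, old_type = columns[tag]
    new_type = widen(old_type, value)
    if new_type is not old_type:
        # "rewrite the column": convert already-stored values to the wider type
        values = [None if v is None else new_type(v) for v in values]
    while len(values) <= i:
        values.append(None)
    values[i] = None if value is None else new_type(value)
    columns[tag] = (values, new_type)

store("count", 0, 3)
store("count", 1, 2.5)    # widens the whole column from int to float
print(columns["count"])   # ([3.0, 2.5], <class 'float'>)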

View File

@@ -0,0 +1,8 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)

View File

@@ -1,58 +1,28 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view import View # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.column.column cimport Column
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT
from obitools3.utils cimport tostr
from obitools3.dms.dms cimport DMS
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.view cimport View, Line
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
# TODO silence non-implemented options
__title__="Groups records together"
default_config = { 'inputview' : None,
'outputview' : None
}
def addOptions(parser):
# TODO put this common group somewhere else but I don't know where
group=parser.add_argument_group('DMS and view options')
group.add_argument('--default-dms','-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data.")
group.add_argument('--input-view','-i',
action="store", dest="obi:inputview",
metavar='<INPUT VIEW NAME>',
default=None,
type=str,
help="Name of the input view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group.add_argument('--output-view','-o',
action="store", dest="obi:outputview",
metavar='<OUTPUT VIEW NAME>',
default=None,
type=str,
help="Name of the output view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group.add_argument('--taxo','-t',
action="store", dest="obi:taxo",
metavar='<TAXONOMY NAME>',
default='', # TODO not None because if it's None, the option is not entered in the option dictionary.
type=str,
help="Name of the taxonomy to use.")
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
group = parser.add_argument_group('obi uniq specific options')
@@ -89,151 +59,197 @@ def addOptions(parser):
# TODO taxonomy
# TODO
COUNT_COLUMN_str = tostr(COUNT_COLUMN)
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
cdef int i
cdef int o_idx
cdef int u_idx
cdef int u_id
cdef int i_count
cdef set mergedKeys
cdef dict uniques
cdef dict merged_infos
cdef object iter_view
cdef Line i_seq
cdef Line o_seq
cdef str key
cdef bytes key_b
cdef str mkey
cdef str merged_col_name
cdef Column i_col
cdef Column seq_col
cdef object to_merge
cdef Column_line mcol
cdef Column_line i_mcol
uniques = {}
if categories is None:
categories=[]
if mergedKey is not None:
mergedKey=set(mergedKey)
if mergedKeys_list is not None:
mergedKeys=set(mergedKeys_list)
else:
mergedKey=set()
mergedKeys=set()
if taxonomy is not None:
mergedKey.add('taxid')
# if taxonomy is not None:
# mergedKeys.add('taxid')
# First walk through the view's columns to be merged, so the merged_col columns can be created beforehand and filled as we go
o_idx = 0
# Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and the right element names
#logger("info", "obi uniq", "First browsing through the input")
merged_infos = {}
i = 0
iter_view = iter(view)
for i_seq in iter_view:
pb(i)
for key in mergedKeys:
mkey = "merged_%s" % key
if key in i_seq: # TODO what if mkey already in i_seq?
if mkey not in merged_infos:
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = 1
mkey_infos['elt_names'] = [i_seq[key]]
else:
mkey_infos = merged_infos[mkey]
if i_seq[key] not in mkey_infos['elt_names']: # TODO make faster? but how?
mkey_infos['elt_names'].append(i_seq[key])
mkey_infos['nb_elts'] += 1
i+=1
for key in mergedKeys:
merged_col_name = "merged_%s" % key
i_col = view[key]
Column.new_column(o_view,
merged_col_name,
OBI_INT,
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=merged_infos[merged_col_name]['elt_names'],
comments=i_col.comments,
alias=merged_col_name # TODO what if it already exists
)
del(merged_infos)
#logger("info", "obi uniq", "Second browsing through the input")
i = 0
o_idx = 0
seq_col = view[NUC_SEQUENCE_COLUMN]
iter_view = iter(view)
for i_seq in iter_view :
pass
iter_view = iter(view)
for i_seq in iter_view :
pb(i)
# use the AVL index, build the API for it
#u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
u_id = seq_col.get_line_idx(i)
if u_id in uniques:
u_seq = uniques[u_id]
o_seq = o_view[u_seq['idx']]
if COUNT_COLUMN_str in i_seq:
o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
i_count = 1
else:
o_seq[COUNT_COLUMN_str] += 1
# seq['COUNT']=1
i_count = i_seq[COUNT_COLUMN]
u_idx = uniques[u_id]
o_seq = o_view[u_idx]
o_seq[COUNT_COLUMN] += i_count
# if taxonomy is not None and 'taxid' in seq:
# s['merged_taxid'][seq['taxid']]=
for key in mergedKey:
if key=='taxid' and mergeIds: # TODO
if 'taxid_dist' in i_seq:
u_seq["taxid_dist"].update(i_seq["taxid_dist"])
if 'taxid' in i_seq:
u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
for key in mergedKeys:
# if key=='taxid' and mergeIds: # TODO
# if 'taxid_dist' in i_seq:
# u_seq["taxid_dist"].update(i_seq["taxid_dist"])
# if 'taxid' in i_seq:
# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
mkey = "merged_%s" % key
# case where the merged keys are updated but the incoming sequence has no merged keys
if key in i_seq:
u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
to_merge = i_seq[key]
mcol = o_seq[mkey]
if mcol[to_merge] is None:
mcol[to_merge] = i_count
else:
mcol[to_merge] = mcol[to_merge] + i_count
# case where merged_keys already exists
else:
else: # TODO is this a good else
if mkey in i_seq:
for skey in i_seq[mkey]:
u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
mcol = o_seq[mkey]
i_mcol = i_seq[mkey]
for key_b in i_mcol:
if mcol[key_b] is None:
mcol[key_b] = i_mcol[key_b]
else:
mcol[key_b] = mcol[key_b] + i_mcol[key_b]
for key in i_seq.keys():
for key_b in i_seq.keys():
# Properly merge the 'merged' attribute if it exists
if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this
o_seq[key] = None
# TODO make special columns list?
if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
o_seq[key_b] = None
if mergeIds:
u_seq['merged'].append(i_seq.id)
# if mergeIds: # TODO
# u_seq['merged'].append(i_seq.id)
else:
o_view[o_idx] = i_seq
o_seq = o_view[o_idx]
uniques[u_id] = {'idx':o_idx}
u_seq = uniques[u_id]
uniques[u_id] = o_idx
o_idx += 1
if COUNT_COLUMN_str not in o_seq:
o_seq[COUNT_COLUMN_str] = 1
if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
o_seq[COUNT_COLUMN] = 1
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
u_seq["taxid_dist"] = {}
else :
u_seq["taxid_dist"] = o_seq["taxid_dist"]
if 'taxid' in o_seq:
u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
for key in mergedKeys:
# if key=='taxid' and mergeIds:
# if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
# u_seq["taxid_dist"] = {}
# else :
# u_seq["taxid_dist"] = o_seq["taxid_dist"]
# if 'taxid' in o_seq:
# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
mkey = "merged_%s" % key
if mkey not in o_seq:
u_seq[mkey]={}
else :
u_seq[mkey] = o_seq[mkey]
if key in o_seq:
u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
o_seq[key] = None
if mergeIds:
u_seq['merged']=[o_seq.id]
to_merge = o_seq[key]
mcol = o_seq[mkey]
if to_merge in mcol and mcol[to_merge] is not None:
mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
else:
mcol[to_merge] = o_seq[COUNT_COLUMN]
o_seq[key] = None # TODO delete column eventually -> make C function?
# if mergeIds:
# u_seq['merged']=[o_seq.id]
i+=1
#TODO
#if taxonomy is not None:
# mergeTaxonomyClassification(uniqSeq, taxonomy)
# Get informations to build the columns with merged attributes
merged_infos = {}
for u_id in uniques :
u_seq = uniques[u_id]
for mkey in u_seq :
if mkey != 'idx' :
mkey_dict = u_seq[mkey]
if mkey not in merged_infos :
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = len(mkey_dict.keys())
mkey_infos['elt_names'] = [k for k in mkey_dict]
else :
mkey_infos = merged_infos[mkey]
for k in mkey_dict :
if k not in mkey_infos['elt_names'] :
mkey_infos['elt_names'].append(k)
mkey_infos['nb_elts'] += 1
keys_to_del = []
for k in merged_infos :
if merged_infos[k]['nb_elts'] == 0:
keys_to_del.append(k)
for k in keys_to_del :
del merged_infos[k]
return (uniques, merged_infos)
def run(config):
# TODO declare variables
cdef tuple input
cdef tuple output
cdef View_NUC_SEQS entries
cdef View_NUC_SEQS o_view
cdef ProgressBar pb
# Open DMS
d = DMS.open(config['obi']['defaultdms'])
logger("info","obi uniq")
# Open input view
entries = View.open(d, config['obi']['inputview'])
input = open_uri(config['obi']['inputURI'])
if input[2] != View_NUC_SEQS:
raise NotImplementedError('obi uniq only works on NUC_SEQS views')
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=View_NUC_SEQS)
entries = input[1]
o_view = output[1]
# Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5)
@@ -244,54 +260,19 @@ def run(config):
# else:
usm = uniqSequence
# Create output view
view_class = View.get_view_class(entries.type)
if view_class == View_NUC_SEQS :
get_quality = tostr(QUALITY_COLUMN) in entries # TODO
o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
else :
o_view = view_class.new(d, config['obi']['outputview'])
usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
(uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
if 'merge' in config['uniq'] :
merged_keys=set(config['uniq']['merge'])
else:
merged_keys=set()
if 'taxo' in config['obi'] :
merged_keys.add('taxid')
# TODO gotta handle special merged columns
for k in merged_keys:
merged_col_name = "merged_%s" % k
if merged_col_name in merged_infos :
i_col = entries[k]
Column.new_column(o_view,
merged_col_name,
OBI_INT,
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=merged_infos[merged_col_name]['elt_names'],
comments=i_col.comments,
alias=merged_col_name
)
for u_id in uniques:
u_dict = uniques[u_id]
for merged_k in u_dict :
if merged_k in merged_infos : # TODO don't enter irrelevant keys to begin with, instead
o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
# if 'merge' in config['uniq'] :
# merged_keys=set(config['uniq']['merge'])
# else:
# merged_keys=set()
#
# if 'taxo' in config['obi'] :
# merged_keys.add('taxid')
print("\n")
print(repr(o_view))
d.close()
input[0].close()
output[0].close()
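uniqSequence() above works in two passes over the input view: a first pass sizes the merged_* columns (number of elements per line and their names), a second pass groups records by sequence, sums COUNT, and accumulates one {value: count} distribution per merged key. The bookkeeping reduces to the following compact sketch with plain dicts (the record layout is hypothetical):

def uniq(records, merged_keys=("sample",)):
    uniques = {}  # sequence -> merged output record
    for rec in records:
        count = rec.get("count", 1)
        out = uniques.setdefault(rec["seq"], {"seq": rec["seq"], "count": 0})
        out["count"] += count
        for key in merged_keys:
            dist = out.setdefault("merged_%s" % key, {})
            if key in rec:
                dist[rec[key]] = dist.get(rec[key], 0) + count
    return list(uniques.values())

recs = [{"seq": "acgt", "sample": "A"},
        {"seq": "acgt", "sample": "B", "count": 3},
        {"seq": "tttt", "sample": "A"}]
for r in uniq(recs):
    print(r)
# {'seq': 'acgt', 'count': 4, 'merged_sample': {'A': 1, 'B': 3}}
# {'seq': 'tttt', 'count': 1, 'merged_sample': {'A': 1}}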

View File

@@ -371,8 +371,18 @@ cdef class Column_line :
self._column.set_item(self._index, elt_id, value)
def __contains__(self, object element_name):
return (tobytes(element_name) in self._column.elements_names)
def get(self, object elt_id, object default=None): # TODO returns default if None???
if elt_id in self:
return self._column.get_item(self._index, elt_id)
else:
return default
def __contains__(self, object elt_id):
if type(elt_id) == int:
return elt_id < self._column.nb_elements_per_line
else:
return (tobytes(elt_id) in self._column.elements_names)
def __repr__(self) :
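With the get() method and the int-or-name __contains__ added above, Column_line behaves like a small mapping over one line's elements. A runnable stand-in showing the same protocol (LineSketch is hypothetical, not the real class):

class LineSketch:
    def __init__(self, names, values):
        self._names = names      # the column's element names
        self._values = values    # this line's values
    def __contains__(self, elt_id):
        if isinstance(elt_id, int):
            return elt_id < len(self._values)
        return elt_id in self._names
    def __getitem__(self, elt_id):
        if isinstance(elt_id, int):
            return self._values[elt_id]
        return self._values[self._names.index(elt_id)]
    def get(self, elt_id, default=None):
        return self[elt_id] if elt_id in self else default

line = LineSketch([b"sample", b"count"], [b"A", 4])
print(b"sample" in line, 1 in line)  # True True
print(line.get(b"taxid", -1))        # -1: missing element falls back to the default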

View File

@@ -21,7 +21,7 @@ cdef class Seq(dict) :
if tags is not None :
for k in tags:
k_b = tobytes(k)
self[k_b] = tags[k_b]
self[k_b] = tags[k]
def __contains__(self, object key):
@@ -70,11 +70,10 @@ cdef class Nuc_Seq(Seq) :
# nuc sequence property getter and setter
@property
def seq(self):
return self._seq
return self[NUC_SEQUENCE_COLUMN]
@seq.setter
def seq(self, object new_seq): # @DuplicatedSignature
self._seq = new_seq
self[NUC_SEQUENCE_COLUMN] = tobytes(new_seq)
# sequence quality property getter and setter
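Because Seq subclasses dict, the property above now stores the sequence in the mapping itself, under the NUC_SEQUENCE_COLUMN key, instead of in a private _seq attribute, so it travels with the other tags. A minimal sketch of the pattern (NucSeqSketch is hypothetical):

NUC_SEQUENCE_COLUMN = b"NUC_SEQ"

class NucSeqSketch(dict):
    @property
    def seq(self):
        return self[NUC_SEQUENCE_COLUMN]
    @seq.setter
    def seq(self, new_seq):
        # the real code goes through tobytes(); a plain encode stands in here
        self[NUC_SEQUENCE_COLUMN] = new_seq.encode() if isinstance(new_seq, str) else new_seq

s = NucSeqSketch()
s.seq = "acgt"
print(s)  # {b'NUC_SEQ': b'acgt'}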

View File

@@ -58,7 +58,8 @@ cdef class View_NUC_SEQS(View):
return view
# TODO
# TODO test time gain without
@OBIWrapper.checkIsActive
def __getitem__(self, object item) :
if type(item) == int :
return Nuc_Seq_Stored(self, item)
@@ -66,6 +67,18 @@ cdef class View_NUC_SEQS(View):
return self.get_column(item) # TODO very slow in practice
@OBIWrapper.checkIsActive
def __iter__(self):
# Iteration on each line of all columns
# Declarations
cdef index_t line_nb
# Yield each line
for line_nb in range(self.line_count) :
yield Nuc_Seq_Stored(self, line_nb)
# TODO? test if efficiency gain
# def __setitem__(self, index_t line_idx, Nuc_Seq sequence_obj) :
# for key in sequence_obj :
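The new __iter__ above is a generator that yields one Nuc_Seq_Stored wrapper per line index instead of materializing the view. The same pattern in plain Python (ViewSketch stands in for the real view class):

class ViewSketch:
    def __init__(self, line_count):
        self.line_count = line_count
    def __getitem__(self, idx):
        return ("line", idx)  # stands in for Nuc_Seq_Stored(self, idx)
    def __iter__(self):
        # one wrapper per line, produced lazily
        for line_nb in range(self.line_count):
            yield self[line_nb]

for rec in ViewSketch(3):
    print(rec)  # ('line', 0) ('line', 1) ('line', 2)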

View File

@@ -6,7 +6,9 @@ Created on 30 mars 2016
@author: coissac
'''
from obitools3.dms.obiseq import Nuc_Seq
import types
from obitools3.dms.obiseq cimport Nuc_Seq
def fastaIterator(lineiterator,
@@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
while True:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
while skipped < skip :
@@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
# definition,
# tags=tags,
# )
# TODO
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
@@ -105,64 +107,64 @@ def fastaNucIterator(lineiterator,
cdef list s
cdef bytes sequence
cdef int lines_to_skip, ionly, read
# cdef OBI_Seq seq
cdef Nuc_Seq seq
if only is None:
ionly=-1
ionly = -1
else:
ionly=int(only)
ionly = int(only)
if isinstance(lineiterator,(str,bytes)):
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, types.GeneratorType):
iterator = lineiterator
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
iterator = iter(lineiterator)
else:
lb=LineBuffer(lineiterator,buffersize)
iterator = iter(LineBuffer(lineiterator, buffersize))
skipped = 0
read = 0
i = iter(lb)
if firstline is None:
line = next(i)
line = next(iterator)
else:
line = firstline
while True:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
while skipped < skip :
line = next(i)
line = next(iterator)
try:
while line[0]!='>':
line = next(i)
line = next(iterator)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line)
s = []
line = next(i)
line = next(iterator)
try:
while line[0]!='>':
s.append(str2bytes(line)[0:-1])
line = next(i)
line = next(iterator)
except StopIteration:
pass
sequence = b"".join(s)
# seq =
seq = Nuc_Seq(ident,
sequence,
definition,
None,-1,
tags)
definition=definition,
quality=None,
offset=-1,
tags=tags)
yield seq
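Several hunks in this commit replace "read >= ionly" with "ionly >= 0 and read >= ionly". The reason is that only=None is mapped to the sentinel ionly = -1, and without the sign check the old guard was immediately true, so an unlimited iterator stopped before yielding anything. A reduced, runnable version of the corrected guard:

def take(iterable, only=None):
    ionly = -1 if only is None else int(only)  # -1 means "no limit"
    read = 0
    for item in iterable:
        if ionly >= 0 and read >= ionly:
            break
        yield item
        read += 1

print(list(take("abcde")))     # ['a', 'b', 'c', 'd', 'e']
print(list(take("abcde", 2)))  # ['a', 'b']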

View File

@@ -6,7 +6,7 @@ Created on 30 mars 2016
@author: coissac
'''
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.obiseq cimport Nuc_Seq
def fastqIterator(lineiterator,
@@ -74,10 +74,9 @@ def fastqWithQualityIterator(lineiterator,
else:
hline = firstline
for line in i:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
@@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
seq = Nuc_Seq(ident,
sequence,
definition,
quality,qualityoffset,
tags)
definition=definition,
quality=quality,
offset=qualityoffset,
tags=tags)
yield seq
@@ -152,7 +152,7 @@ def fastqWithoutQualityIterator(lineiterator,
for line in i:
if read >= ionly:
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
@@ -162,9 +162,10 @@ def fastqWithoutQualityIterator(lineiterator,
seq = Nuc_Seq(ident,
sequence,
definition,
None,-1,
tags)
definition=definition,
quality=None,
offset=-1,
tags=tags)
yield seq

View File

@@ -41,7 +41,7 @@ def entryIteratorFactory(lineiterator,
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
else:
lb=LineBuffer(lineiterator,buffersize)
lb=LineBuffer(lineiterator, buffersize)
i = iter(lb)
@@ -62,25 +62,28 @@ def entryIteratorFactory(lineiterator,
elif is_ngsfilter_line(first):
format=b"ngsfilter"
# TODO Temporary fix
first=None
lineiterator.seek(0)
if format==b'fasta':
if seqtype == b'nuc':
return (fastaNucIterator(lineiterator,
skip,only,
first),
skip=skip,only=only,
firstline=first,
buffersize=buffersize),
Nuc_Seq)
else:
raise NotImplementedError()
elif format==b'fastq':
return (fastqIterator(lineiterator,
skip,only,
qualityoffset,
first),
skip=skip,only=only,
qualityoffset=qualityoffset,
noquality=noquality,
firstline=first,
buffersize=buffersize),
Nuc_Seq)
raise NotImplementedError('File format not yet implemented')
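entryIteratorFactory() peeks at the first line to pick a parser, and the temporary fix above rewinds the stream with seek(0) rather than threading the consumed first line through every parser. A small sketch of that peek-and-dispatch idea (detect_format is illustrative and much cruder than is_ngsfilter_line and friends):

import io

def detect_format(first_line):
    if first_line.startswith(b">"):
        return b"fasta"
    if first_line.startswith(b"@"):
        return b"fastq"
    return None  # unknown: caller rewinds and tries something else

stream = io.BytesIO(b">seq1\nacgt\n")
first = stream.readline()
fmt = detect_format(first)
if fmt is None:
    stream.seek(0)  # the "temporary fix": let the next reader start over
    first = None
print(fmt)  # b'fasta'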

View File

@@ -364,19 +364,22 @@ def open_uri(uri,
if qualifiers[b"seqtype"]==b"nuc":
objclass = Nuc_Seq
if format==b"fasta":
iseq = fastaNucIterator(file,skip,only)
iseq = fastaNucIterator(file,
skip=skip,
only=only)
elif format==b"fastq":
iseq = fastqIterator(file,
skip,only,
offset,
noquality)
skip=skip,
only=only,
offset=offset,
noquality=noquality)
else:
raise NotImplementedError('Sequence file format not implemented')
elif qualifiers[b"seqtype"]==b"prot":
raise NotImplementedError()
else:
iseq,objclass = entryIteratorFactory(file,
skip,only,
skip, only,
seqtype,
offset,
noquality,
@@ -389,10 +392,9 @@ def open_uri(uri,
blanklineskip,
commentchar)
#tmpdms = get_temp_dms()
tmpdms = get_temp_dms()
return (file,iseq,objclass,urib)
return (file, iseq, objclass, urib)
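Callers such as the import and uniq commands above unpack this return value positionally as (file, iseq, objclass, urib) and close the first element when done. A runnable stand-in of that calling convention (open_uri_sketch is mine, not the real function):

import io

def open_uri_sketch(uri):
    stream = io.BytesIO(b">s1\nacgt\n")  # pretend `uri` was opened
    records = iter([{"id": b"s1", "sequence": b"acgt"}])
    return (stream, records, dict, uri)  # (file, iseq, objclass, urib)

stream, records, record_class, parsed_uri = open_uri_sketch(b"myfile.fasta")
for rec in records:
    print(record_class is dict, rec["id"])  # True b's1'
stream.close()  # mirrors input[0].close() in the commands above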

View File

@@ -136,7 +136,7 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
/** ***************************************************************************
* Check if the given element is in the bloom filter. Remember this may
* return false positive if a collision occured.
* return false positive if a collision occurred.
*
* Parameters:
* -----------

View File

@@ -2463,7 +2463,7 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
// Check if the AVL group is writable
if (!(avl_group->writable))
{
obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);
obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR); // Note: this error is read by the calling functions to clone the AVL group if needed
return -1;
}
@@ -2476,6 +2476,9 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
// Add in the current AVL
index_in_avl = (int32_t) obi_avl_add((avl_group->sub_avls)[avl_group->last_avl_idx], value);
if (index_in_avl < 0)
return -1;
bloom_add(&((((avl_group->sub_avls)[avl_group->last_avl_idx])->header)->bloom_filter), value, obi_blob_sizeof(value));
// Build the index containing the AVL index
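The bloom_add() call above keeps each sub-AVL's Bloom filter in sync with its contents, so a lookup can skip any sub-AVL whose filter rules the value out, at the cost of rare false positives. A toy Python analogue of the add / might-contain pair (not the C implementation used here):

from hashlib import blake2b

class BloomSketch:
    def __init__(self, bits=1024, hashes=3):
        self.bits, self.hashes = bits, hashes
        self.array = bytearray(bits // 8)
    def _positions(self, value):
        # derive `hashes` bit positions from salted hashes of the value
        for seed in range(self.hashes):
            h = blake2b(value, salt=bytes([seed]) * 8).digest()
            yield int.from_bytes(h[:8], "big") % self.bits
    def add(self, value):
        for p in self._positions(value):
            self.array[p // 8] |= 1 << (p % 8)
    def might_contain(self, value):
        return all(self.array[p // 8] & (1 << (p % 8)) for p in self._positions(value))

bf = BloomSketch()
bf.add(b"acgt")
print(bf.might_contain(b"acgt"))  # True
print(bf.might_contain(b"tttt"))  # False with high probability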

View File

@@ -32,7 +32,7 @@ Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_
Obi_blob_p blob;
// Allocate the memory for the blob structure
blob = (Obi_blob_p) malloc(sizeof(Obi_blob_t) + length_encoded_value);
blob = (Obi_blob_p) calloc(sizeof(Obi_blob_t) + length_encoded_value, sizeof(byte_t));
if (blob == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);

View File

@@ -240,7 +240,6 @@ OBIDMS_p obi_create_dms(const char* dms_path)
char* directory_name;
DIR* dms_dir;
int dms_file_descriptor;
size_t i, j;
// Build and check the directory name
directory_name = build_directory_name(dms_path);
@@ -318,7 +317,7 @@ OBIDMS_p obi_create_dms(const char* dms_path)
*/
// Create the information file
if (create_dms_infos_file(dms_file_descriptor, basename(dms_path)) < 0)
if (create_dms_infos_file(dms_file_descriptor, basename((char*)dms_path)) < 0)
return NULL;
return obi_open_dms(dms_path);
@@ -333,7 +332,6 @@ OBIDMS_p obi_open_dms(const char* dms_path)
int infos_file_descriptor;
bool little_endian_dms;
bool little_endian_platform;
size_t i, j;
dms = NULL;
@@ -356,7 +354,7 @@ OBIDMS_p obi_open_dms(const char* dms_path)
i++;
}
*/
strcpy(dms->dms_name, basename(dms_path));
strcpy(dms->dms_name, basename((char*)dms_path));
// Build and check the directory name including the relative path
complete_dms_path = build_directory_name(dms_path);

View File

@@ -703,7 +703,7 @@ static int get_formatted_elt_names_length(const char* elements_names)
static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line)
{
return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line);
return getpagesize() / obi_sizeof(data_type) / nb_elements_per_line;
}
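For positive integers the two expressions agree (floor(a / (b * c)) equals floor(floor(a / b) / c)), so dividing twice changes no results; it avoids computing the intermediate product obi_sizeof(data_type) * nb_elements_per_line, which could overflow for very wide lines, the same situation the new minimum_line_count fallback below catches when the quotient reaches 0. A quick numeric check (assuming a 4 KiB page and 4-byte elements):

page, elt = 4096, 4
for nb in (1, 100, 5000):
    old = page // (elt * nb)   # previous form
    new = page // elt // nb    # rewritten form
    print(nb, old, new)
# 1    1024 1024
# 100  10   10
# 5000 0    0   -> hence the new minimum_line_count fallback to 1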
@@ -919,6 +919,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
// The initial line count should be between the minimum (corresponding to the page size) and the maximum allowed
minimum_line_count = get_line_count_per_page(stored_data_type, nb_elements_per_line);
if (minimum_line_count == 0) // Happens if high number of elements per line
minimum_line_count = 1;
if (nb_lines > MAXIMUM_LINE_COUNT)
{
obidebug(1, "\nCan't create column because of line count greater than the maximum allowed (%d)", MAXIMUM_LINE_COUNT);
@@ -1023,7 +1025,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
if (new_column->data == MAP_FAILED)
{
obi_set_errno(OBICOL_UNKNOWN_ERROR);
obidebug(1, "\nError mmapping the data of a column");
obidebug(1, "\nError mmapping the data of a column.\nArguments: data_size=%lu, column_file_descriptor=%d, header_size=%lu",
data_size, column_file_descriptor, header_size);
munmap(new_column->header, header_size);
close(column_file_descriptor);
free(new_column);