Merge branch 'master' of git@git.metabarcoding.org:obitools/obitools3.git

Conflicts:
	python/obitools3/commands/import.pyx
2017-09-05 08:59:45 +02:00
15 changed files with 450 additions and 433 deletions

View File

@@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
 from obitools3.parsers.fastq import fastqIterator
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
 from obitools3.dms.view.view cimport View
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS  # TODO cimport doesn't work
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
-from obitools3.dms.obiseq cimport Nuc_Seq
+from obitools3.dms.obiseq import Nuc_Seq
 from obitools3.utils cimport tobytes, \
                              get_obitype, \
@@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
 from obitools3.dms.capi.obierrno cimport obi_errno
 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
@@ -50,6 +49,8 @@ def addOptions(parser):
 def run(config):
+    cdef tuple input
+    cdef tuple output
     cdef int i
     cdef type value_type
     cdef obitype_t value_obitype
@@ -62,7 +63,6 @@ def run(config):
     cdef View view
     cdef object iseq
     cdef object seq
-    cdef object inputs
     cdef Column id_col
     cdef Column def_col
     cdef Column seq_col
@@ -71,7 +71,7 @@ def run(config):
     cdef bint rewrite
     cdef dict dcols
     cdef int skipping
-    cdef str tag
+    cdef bytes tag
     cdef object value
     cdef list elt_names
     cdef int old_nb_elements_per_line
@@ -84,163 +84,157 @@ def run(config):
     logger("info","obi import : imports file into an DMS")
 
-    inputs = open_uri(config['obi']['inputURI'])
-    if inputs[2]==Nuc_Seq:
-        v = View_NUC_SEQS
-    else:
-        v= View
-    output = open_uri(config['obi']['outputURI'],
-                      input=False,
-                      newviewtype=v)
-    print(input)
-    print(output)
-    sys.exit()
-
-#     pb = ProgressBar(1000000, config, seconde=5)     # TODO should be number of records in file
-#
-#     inputs = uopen(config['import']['filename'])
-#
-#     # Create or open DMS
-#     d = DMS.open_or_new(config['obi']['defaultdms'])
-#
-#     get_quality = False
-#     NUC_SEQS_view = False
-#     if config['import']['seqinformat']=='fasta':
-#         get_quality = False
-#         NUC_SEQS_view = True
-#         iseq = fastaIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     elif config['import']['seqinformat']=='fastq':
-#         get_quality = True
-#         NUC_SEQS_view = True
-#         iseq = fastqIterator(inputs, skip=config['import']['skip'])
-#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-#     else:
-#         raise RuntimeError('File format not handled')
-#
-#     # Save basic columns in variables for optimization
-#     if NUC_SEQS_view :
-#         id_col = view["ID"]
-#         def_col = view["DEFINITION"]
-#         seq_col = view["NUC_SEQ"]
-#         if get_quality :
-#             qual_col = view["QUALITY"]
-#
-#     dcols = {}
-#
-#     i = 0
-#     for seq in iseq :
-#         if i == config['import']['only'] :
-#             break
-#         else :
-#             pb(i)
-#             if NUC_SEQS_view :
-#                 id_col[i] = seq['id']
-#                 def_col[i] = seq['definition']
-#                 seq_col[i] = seq['sequence']
-#                 if get_quality :
-#                     qual_col[i] = seq['quality']
-#
-#             for tag in seq['tags'] :
-#
-#                 value = seq['tags'][tag]
-#
-#                 # Check NA value
-#                 if value == config['import']['NA'] :
-#                     value = None
-#
-#                 if tag not in dcols :
-#
-#                     value_type = type(value)
-#                     nb_elts = 1
-#                     value_obitype = OBI_VOID
-#
-#                     if value_type == dict or value_type == list :
-#                         nb_elts = len(value)
-#                         elt_names = list(value)
-#                     else :
-#                         nb_elts = 1
-#                         elt_names = None
-#
-#                     value_obitype = get_obitype(value)
-#
-#                     if value_obitype != OBI_VOID :
-#                         dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-#
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#
-#                     # TODO else log error?
-#
-#                 else :
-#
-#                     rewrite = False
-#
-#                     # Check type adequation
-#                     old_type = dcols[tag][1]
-#                     new_type = OBI_VOID
-#                     new_type = update_obitype(old_type, value)
-#                     if old_type != new_type :
-#                         rewrite = True
-#
-#                     try:
-#                         # Fill value
-#                         dcols[tag][0][i] = value
-#
-#                     except IndexError :
-#
-#                         value_type = type(value)
-#                         old_column = dcols[tag][0]
-#                         old_nb_elements_per_line = old_column.nb_elements_per_line
-#                         new_nb_elements_per_line = 0
-#                         old_elements_names = old_column.elements_names
-#                         new_elements_names = None
-#
-#                         #####################################################################
-#
-#                         # Check the length and keys of column lines if needed
-#                         if value_type == dict :    # Check dictionary keys
-#                             for k in value :
-#                                 if k not in old_elements_names :
-#                                     new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
-#                                     rewrite = True
-#                                     break
-#
-#                         elif value_type == list or value_type == tuple :    # Check vector length
-#                             if old_nb_elements_per_line < len(value) :
-#                                 new_nb_elements_per_line = len(value)
-#                                 rewrite = True
-#
-#                         #####################################################################
-#
-#                         if rewrite :
-#                             if new_nb_elements_per_line == 0 and new_elements_names is not None :
-#                                 new_nb_elements_per_line = len(new_elements_names)
-#
-#                             # Reset obierrno
-#                             obi_errno = 0
-#
-#                             dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
-#                                                                                    new_data_type=new_type,
-#                                                                                    new_nb_elements_per_line=new_nb_elements_per_line,
-#                                                                                    new_elements_names=new_elements_names),
-#                                           value_obitype)
-#
-#                             # Update the dictionary:
-#                             for t in dcols :
-#                                 dcols[t] = (view[t], dcols[t][1])
-#
-#                             # Fill value
-#                             dcols[tag][0][i] = value
-#
-#             i+=1
-#
-#     print("\n")
-#     print(view.__repr__())
-#
-#     d.close()
+    input = open_uri(config['obi']['inputURI'])
+    if input[2]==Nuc_Seq:
+        v = View_NUC_SEQS
+    else:
+        v = View
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      newviewtype=v)
+    #print(input)
+    #print(output)
+
+    pb = ProgressBar(1000000, config, seconde=5)     # TODO should be number of records in file
+    iseq = input[1]
+
+    get_quality = False
+    NUC_SEQS_view = False
+    if isinstance(output[1], View) :
+        view = output[1]
+        if output[2] == View_NUC_SEQS :
+            NUC_SEQS_view = True
+            if "QUALITY" in view :  # TODO
+                get_quality = True
+    else:
+        raise NotImplementedError()
+
+    # Save basic columns in variables for optimization
+    if NUC_SEQS_view :
+        id_col = view[b"ID"]
+        def_col = view[b"DEFINITION"]
+        seq_col = view[b"NUC_SEQ"]
+        if get_quality :
+            qual_col = view[b"QUALITY"]
+
+    dcols = {}
+
+    i = 0
+    for seq in iseq :
+        pb(i)
+        if NUC_SEQS_view :
+            id_col[i] = seq.id
+            def_col[i] = seq.definition
+            seq_col[i] = seq.seq
+            if get_quality :
+                qual_col[i] = seq.quality
+
+        for tag in seq :
+            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm...
+
+                value = seq[tag]
+
+                # Check NA value
+                if value == config['obi']['nastring'] :
+                    value = None
+
+                if tag not in dcols :
+
+                    value_type = type(value)
+                    nb_elts = 1
+                    value_obitype = OBI_VOID
+
+                    if value_type == dict or value_type == list :
+                        nb_elts = len(value)
+                        elt_names = list(value)
+                    else :
+                        nb_elts = 1
+                        elt_names = None
+
+                    value_obitype = get_obitype(value)
+
+                    if value_obitype != OBI_VOID :
+                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    # TODO else log error?
+
+                else :
+
+                    rewrite = False
+
+                    # Check type adequation
+                    old_type = dcols[tag][1]
+                    new_type = OBI_VOID
+                    new_type = update_obitype(old_type, value)
+                    if old_type != new_type :
+                        rewrite = True
+
+                    try:
+                        # Fill value
+                        dcols[tag][0][i] = value
+
+                    except IndexError :
+
+                        value_type = type(value)
+                        old_column = dcols[tag][0]
+                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        new_nb_elements_per_line = 0
+                        old_elements_names = old_column.elements_names
+                        new_elements_names = None
+
+                        #####################################################################
+
+                        # Check the length and keys of column lines if needed
+                        if value_type == dict :    # Check dictionary keys
+                            for k in value :
+                                if k not in old_elements_names :
+                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+                                    rewrite = True
+                                    break
+
+                        elif value_type == list or value_type == tuple :    # Check vector length
+                            if old_nb_elements_per_line < len(value) :
+                                new_nb_elements_per_line = len(value)
+                                rewrite = True
+
+                        #####################################################################
+
+                        if rewrite :
+                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+                                new_nb_elements_per_line = len(new_elements_names)
+
+                            # Reset obierrno
+                            obi_errno = 0
+
+                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
+                                                                                   new_data_type=new_type,
+                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+                                                                                   new_elements_names=new_elements_names),
+                                          value_obitype)
+
+                            # Update the dictionary:
+                            for t in dcols :
+                                dcols[t] = (view[t], dcols[t][1])
+
+                            # Fill value
+                            dcols[tag][0][i] = value
+
+        i+=1
+
+    print("\n")
+    print(view.__repr__())
+
+    input[0].close()  # TODO
+    output[0].close()
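
The column-typing logic in this hunk is easier to read outside the diff. Below is a minimal plain-Python sketch of the pattern; the get_obitype/update_obitype stand-ins are hypothetical simplifications of the helpers in obitools3.utils, not the real implementations. The idea: the first value seen for a tag fixes the column's type, and a later value that does not fit triggers a rewrite to a wider type.

    OBI_VOID, OBI_INT, OBI_FLOAT, OBI_STR = "OBI_VOID", "OBI_INT", "OBI_FLOAT", "OBI_STR"

    def get_obitype(value):
        # Hypothetical stand-in: map a Python value to an OBIType tag
        if value is None:
            return OBI_VOID
        if isinstance(value, int):
            return OBI_INT
        if isinstance(value, float):
            return OBI_FLOAT
        return OBI_STR

    def update_obitype(old_type, value):
        # Hypothetical stand-in: promote OBI_INT to OBI_FLOAT, otherwise fall back to OBI_STR
        new_type = get_obitype(value)
        if old_type == new_type or new_type == OBI_VOID:
            return old_type
        if {old_type, new_type} == {OBI_INT, OBI_FLOAT}:
            return OBI_FLOAT
        return OBI_STR

    dcols = {}
    for i, record in enumerate([{b"count": 1}, {b"count": 2.5}]):
        for tag, value in record.items():
            if tag not in dcols:
                dcols[tag] = ([None] * 10, get_obitype(value))   # create the column
            else:
                new_type = update_obitype(dcols[tag][1], value)
                if new_type != dcols[tag][1]:                    # would rewrite the column on disk
                    dcols[tag] = (dcols[tag][0], new_type)
            dcols[tag][0][i] = value

    print(dcols[b"count"][1])   # OBI_FLOAT: the int column was promoted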

View File

@@ -0,0 +1,8 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+
+
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
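
A note on the `=*` markers, since they look odd next to the `.pyx` code: in a Cython `.pxd` declaration, `=*` only records that the argument has a default; the actual default value is supplied by the matching definition in the `.pyx` file. A sketch of the pairing, with hypothetical abbreviated signatures:

    # uniq.pxd -- declaration side: '=*' means "this argument has a default"
    # cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb,
    #                   Taxonomy taxonomy=*, bint mergeIds=*)

    # uniq.pyx -- definition side: the concrete default values live here
    # cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb,
    #                   Taxonomy taxonomy=None, bint mergeIds=False):
    #     ...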

View File

@@ -1,59 +1,29 @@
 #cython: language_level=3
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.dms.dms import DMS  # TODO cimport doesn't work
-from obitools3.dms.view.view import View  # TODO cimport doesn't work
-from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
-from obitools3.dms.obiseq cimport Nuc_Seq
-from obitools3.dms.column.column cimport Column
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT
-from obitools3.utils cimport tostr
+from obitools3.dms.dms cimport DMS
+from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.dms.view.view cimport View, Line
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+from obitools3.dms.column.column cimport Column, Column_line
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
+from obitools3.apps.config import logger
 
 # TODO silence non-implemented options
 
 __title__="Groups records together"
 
-default_config = { 'inputview' : None,
-                   'outputview' : None
-                 }
-
 def addOptions(parser):
-    # TODO put this common group somewhere else but I don't know where
-    group=parser.add_argument_group('DMS and view options')
-
-    group.add_argument('--default-dms','-d',
-                       action="store", dest="obi:defaultdms",
-                       metavar='<DMS NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the default DMS for reading and writing data.")
-
-    group.add_argument('--input-view','-i',
-                       action="store", dest="obi:inputview",
-                       metavar='<INPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the input view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--output-view','-o',
-                       action="store", dest="obi:outputview",
-                       metavar='<OUTPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the output view, either raw if the view is in the default DMS,"
-                            " or in the form 'dms:view' if it is in another DMS.")
-
-    group.add_argument('--taxo','-t',
-                       action="store", dest="obi:taxo",
-                       metavar='<TAXONOMY NAME>',
-                       default='',  # TODO not None because if it's None, the option is not entered in the option dictionary.
-                       type=str,
-                       help="Name of the taxonomy to use.")
+    addSequenceInputOption(parser)
+    addMinimalOutputOption(parser)
 
     group = parser.add_argument_group('obi uniq specific options')
@@ -89,152 +59,198 @@ def addOptions(parser):
 # TODO taxonomy
-# TODO
-COUNT_COLUMN_str = tostr(COUNT_COLUMN)
-
-def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
+
+    cdef int i
+    cdef int o_idx
+    cdef int u_idx
+    cdef int u_id
+    cdef int i_count
+    cdef set mergedKeys
+    cdef dict uniques
+    cdef dict merged_infos
+    cdef object iter_view
+    cdef Line i_seq
+    cdef Line o_seq
+    cdef str key
+    cdef bytes key_b
+    cdef str mkey
+    cdef str merged_col_name
+    cdef Column i_col
+    cdef Column seq_col
+    cdef object to_merge
+    cdef Column_line mcol
+    cdef Column_line i_mcol
 
     uniques = {}
 
     if categories is None:
         categories=[]
 
-    if mergedKey is not None:
-        mergedKey=set(mergedKey)
+    if mergedKeys_list is not None:
+        mergedKeys=set(mergedKeys_list)
     else:
-        mergedKey=set()
+        mergedKeys=set()
 
-    if taxonomy is not None:
-        mergedKey.add('taxid')
+#    if taxonomy is not None:
+#        mergedKeys.add('taxid')
 
-    # Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure
+    # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
+    #logger("info", "obi uniq", "First browsing through the input")
-    o_idx = 0
+    merged_infos = {}
     i = 0
-    seq_col = view[NUC_SEQUENCE_COLUMN]
     iter_view = iter(view)
-    for i_seq in iter_view :
-        pass
+    for i_seq in iter_view:
+        pb(i)
+        for key in mergedKeys:
+            mkey = "merged_%s" % key
+            if key in i_seq:  # TODO what if mkey already in i_seq?
+                if mkey not in merged_infos:
+                    merged_infos[mkey] = {}
+                    mkey_infos = merged_infos[mkey]
+                    mkey_infos['nb_elts'] = 1
+                    mkey_infos['elt_names'] = [i_seq[key]]
+                else:
+                    mkey_infos = merged_infos[mkey]
+                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
+                        mkey_infos['elt_names'].append(i_seq[key])
+                        mkey_infos['nb_elts'] += 1
+        i+=1
+
+    for key in mergedKeys:
+        merged_col_name = "merged_%s" % key
+        i_col = view[key]
+        Column.new_column(o_view,
+                          merged_col_name,
+                          OBI_INT,
+                          nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
+                          elements_names=merged_infos[merged_col_name]['elt_names'],
+                          comments=i_col.comments,
+                          alias=merged_col_name  # TODO what if it already exists
+                          )
+
+    del(merged_infos)
+
+    #logger("info", "obi uniq", "Second browsing through the input")
+    i = 0
+    o_idx = 0
+    seq_col = view[NUC_SEQUENCE_COLUMN]
     iter_view = iter(view)
     for i_seq in iter_view :
         pb(i)
-        # utiliser l'index des AVLs, faire l'API
-        #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
+        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
         u_id = seq_col.get_line_idx(i)
 
         if u_id in uniques:
-            u_seq = uniques[u_id]
-            o_seq = o_view[u_seq['idx']]
-            if COUNT_COLUMN_str in i_seq:
-                o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
+            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
+                i_count = 1
             else:
-                o_seq[COUNT_COLUMN_str] += 1
-                # seq['COUNT']=1
+                i_count = i_seq[COUNT_COLUMN]
+
+            u_idx = uniques[u_id]
+            o_seq = o_view[u_idx]
+            o_seq[COUNT_COLUMN] += i_count
 
 #            if taxonomy is not None and 'taxid' in seq:
 #                s['merged_taxid'][seq['taxid']]=
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:  # TODO
-                    if 'taxid_dist' in i_seq:
-                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
-                    if 'taxid' in i_seq:
-                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+            for key in mergedKeys:
+#                if key=='taxid' and mergeIds:  # TODO
+#                    if 'taxid_dist' in i_seq:
+#                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
+#                    if 'taxid' in i_seq:
+#                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
 
                 mkey = "merged_%s" % key
                 #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
                 if key in i_seq:
-                    u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
+                    to_merge = i_seq[key]
+                    mcol = o_seq[mkey]
+                    if mcol[to_merge] is None:
+                        mcol[to_merge] = i_count
+                    else:
+                        mcol[to_merge] = mcol[to_merge] + i_count
                 #cas ou merged_keys existe deja
-                else:
+                else:  # TODO is this a good else
                     if mkey in i_seq:
-                        for skey in i_seq[mkey]:
-                            u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
-
-            for key in i_seq.keys():
+                        mcol = o_seq[mkey]
+                        i_mcol = i_seq[mkey]
+                        for key_b in i_mcol:
+                            if mcol[key_b] is None:
+                                mcol[key_b] = i_mcol[key_b]
+                            else:
+                                mcol[key_b] = mcol[key_b] + i_mcol[key_b]
+
+            for key_b in i_seq.keys():
                 # Merger proprement l'attribut merged s'il existe
-                if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str :  #and key[0:7]!='merged_' and key!='merged': TODO check this
-                    o_seq[key] = None
-
-            if mergeIds:
-                u_seq['merged'].append(i_seq.id)
+                # TODO make special columns list?
+                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
+                    o_seq[key_b] = None
+
+#            if mergeIds:  # TODO
+#                u_seq['merged'].append(i_seq.id)
 
         else:
             o_view[o_idx] = i_seq
             o_seq = o_view[o_idx]
-            uniques[u_id] = {'idx':o_idx}
-            u_seq = uniques[u_id]
+            uniques[u_id] = o_idx
             o_idx += 1
 
-            if COUNT_COLUMN_str not in o_seq:
-                o_seq[COUNT_COLUMN_str] = 1
+            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
+                o_seq[COUNT_COLUMN] = 1
 
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:
-                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
-                        u_seq["taxid_dist"] = {}
-                    else :
-                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
-                    if 'taxid' in o_seq:
-                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
+            for key in mergedKeys:
+#                if key=='taxid' and mergeIds:
+#                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+#                        u_seq["taxid_dist"] = {}
+#                    else :
+#                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
+#                    if 'taxid' in o_seq:
+#                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
 
                 mkey = "merged_%s" % key
-                if mkey not in o_seq:
-                    u_seq[mkey]={}
-                else :
-                    u_seq[mkey] = o_seq[mkey]
                 if key in o_seq:
-                    u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
-                    o_seq[key] = None
-
-            if mergeIds:
-                u_seq['merged']=[o_seq.id]
+                    to_merge = o_seq[key]
+                    mcol = o_seq[mkey]
+                    if to_merge in mcol and mcol[to_merge] is not None:
+                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
+                    else:
+                        mcol[to_merge] = o_seq[COUNT_COLUMN]
+                    o_seq[key] = None  # TODO delete column eventually -> make C function?
+
+#            if mergeIds:
+#                u_seq['merged']=[o_seq.id]
 
         i+=1
 
     #TODO
     #if taxonomy is not None:
     #    mergeTaxonomyClassification(uniqSeq, taxonomy)
 
-    # Get informations to build the columns with merged attributes
-    merged_infos = {}
-    for u_id in uniques :
-        u_seq = uniques[u_id]
-        for mkey in u_seq :
-            if mkey != 'idx' :
-                mkey_dict = u_seq[mkey]
-                if mkey not in merged_infos :
-                    merged_infos[mkey] = {}
-                    mkey_infos = merged_infos[mkey]
-                    mkey_infos['nb_elts'] = len(mkey_dict.keys())
-                    mkey_infos['elt_names'] = [k for k in mkey_dict]
-                else :
-                    mkey_infos = merged_infos[mkey]
-                    for k in mkey_dict :
-                        if k not in mkey_infos['elt_names'] :
-                            mkey_infos['elt_names'].append(k)
-                            mkey_infos['nb_elts'] += 1
-
-    keys_to_del = []
-    for k in merged_infos :
-        if merged_infos[k]['nb_elts'] == 0:
-            keys_to_del.append(k)
-    for k in keys_to_del :
-        del merged_infos[k]
-
-    return (uniques, merged_infos)
-
 def run(config):
-    # TODO declare variables
+    cdef tuple input
+    cdef tuple output
+    cdef View_NUC_SEQS entries
+    cdef View_NUC_SEQS o_view
+    cdef ProgressBar pb
 
-    # Open DMS
-    d = DMS.open(config['obi']['defaultdms'])
+    logger("info","obi uniq")
 
-    # Open input view
-    entries = View.open(d, config['obi']['inputview'])
+    input = open_uri(config['obi']['inputURI'])
+    if input[2] != View_NUC_SEQS:
+        raise NotImplementedError('obi uniq only works on NUC_SEQS views')
+
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      newviewtype=View_NUC_SEQS)
+
+    entries = input[1]
+    o_view = output[1]
 
     # Initialize the progress bar
     pb = ProgressBar(len(entries), config, seconde=5)
@@ -243,55 +259,20 @@ def run(config):
     # usm = uniqPrefixSequence
     # else:
     usm = uniqSequence
 
-    # Create output view
-    view_class = View.get_view_class(entries.type)
-    if view_class == View_NUC_SEQS :
-        get_quality = tostr(QUALITY_COLUMN) in entries  # TODO
-        o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
-    else :
-        o_view = view_class.new(d, config['obi']['outputview'])
-
-    (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
-
-    if 'merge' in config['uniq'] :
-        merged_keys=set(config['uniq']['merge'])
-    else:
-        merged_keys=set()
-
-    if 'taxo' in config['obi'] :
-        merged_keys.add('taxid')
-
-    # TODO gotta handle special merged columns
-    for k in merged_keys:
-        merged_col_name = "merged_%s" % k
-        if merged_col_name in merged_infos :
-            i_col = entries[k]
-            Column.new_column(o_view,
-                              merged_col_name,
-                              OBI_INT,
-                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
-                              elements_names=merged_infos[merged_col_name]['elt_names'],
-                              comments=i_col.comments,
-                              alias=merged_col_name
-                              )
-
-    for u_id in uniques:
-        u_dict = uniques[u_id]
-        for merged_k in u_dict :
-            if merged_k in merged_infos :  # TODO don't enter irrelevant keys to begin with, instead
-                o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
+    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
+
+#    if 'merge' in config['uniq'] :
+#        merged_keys=set(config['uniq']['merge'])
+#    else:
+#        merged_keys=set()
+#
+#    if 'taxo' in config['obi'] :
+#        merged_keys.add('taxid')
 
     print("\n")
     print(repr(o_view))
 
-    d.close()
+    input[0].close()
+    output[0].close()
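
The two-pass structure of the new uniqSequence is easier to see on toy data. A plain-Python sketch with hypothetical records (not the real Line/Column_line API): pass 1 collects the distinct values of each merged key so the merged_* columns can be created with the right element names, pass 2 folds duplicate sequences together and accumulates counts per element.

    records = [
        {"seq": b"AACC", "COUNT": 2, "sample": "A"},
        {"seq": b"AACC", "COUNT": 1, "sample": "B"},
        {"seq": b"GGTT", "COUNT": 1, "sample": "A"},
    ]

    # Pass 1: element names for the merged_sample column
    elt_names = []
    for r in records:
        if r["sample"] not in elt_names:
            elt_names.append(r["sample"])          # -> ['A', 'B']

    # Pass 2: one output line per distinct sequence, counts accumulated per sample
    uniques = {}   # sequence -> output line
    for r in records:
        o = uniques.setdefault(r["seq"], {"COUNT": 0,
                                          "merged_sample": dict.fromkeys(elt_names, 0)})
        o["COUNT"] += r["COUNT"]
        o["merged_sample"][r["sample"]] += r["COUNT"]

    print(uniques[b"AACC"])   # {'COUNT': 3, 'merged_sample': {'A': 2, 'B': 1}}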

View File

@@ -371,8 +371,18 @@ cdef class Column_line :
         self._column.set_item(self._index, elt_id, value)
 
-    def __contains__(self, object element_name):
-        return (tobytes(element_name) in self._column.elements_names)
+    def get(self, object elt_id, object default=None):  # TODO returns default if None???
+        if elt_id in self:
+            return self._column.get_item(self._index, elt_id)
+        else:
+            return default
+
+    def __contains__(self, object elt_id):
+        if type(elt_id) == int:
+            return elt_id < self._column.nb_elements_per_line
+        else:
+            return (tobytes(elt_id) in self._column.elements_names)
 
     def __repr__(self) :
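
The new methods give Column_line a dict-like read API. A sketch of the intended semantics, with a plain dict standing in for a line and hypothetical values; note the TODO in the diff: because get() tests membership rather than the stored value, an element explicitly set to None comes back as None, not as the default.

    line = {b"sample": 3, b"flag": None}      # stand-in for a Column_line

    print(line.get(b"sample", 0))             # 3
    print(line.get(b"missing", 0))            # 0 -- absent element falls back to the default
    print(line.get(b"flag", 0))               # None -- present-but-None is NOT replaced

    # __contains__ also accepts an element index: for a real Column_line,
    # `2 in line` asks whether index 2 < nb_elements_per_line.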

View File

@@ -21,7 +21,7 @@ cdef class Seq(dict) :
         if tags is not None :
             for k in tags:
                 k_b = tobytes(k)
-                self[k_b] = tags[k_b]
+                self[k_b] = tags[k]
 
     def __contains__(self, object key):
@@ -70,11 +70,10 @@ cdef class Nuc_Seq(Seq) :
     # nuc sequence property getter and setter
     @property
     def seq(self):
-        return self._seq
+        return self[NUC_SEQUENCE_COLUMN]
 
     @seq.setter
     def seq(self, object new_seq):  # @DuplicatedSignature
-        self._seq = new_seq
         self[NUC_SEQUENCE_COLUMN] = tobytes(new_seq)
 
     # sequence quality property getter and setter
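
This hunk removes a duplicated storage location: the sequence used to be kept both in a private _seq attribute and under the NUC_SEQ key, with the getter reading the attribute while the dict held the canonical value. A minimal sketch of the new single-storage behaviour (plain Python, with a hypothetical key name and a simplified tobytes):

    NUC_SEQUENCE_COLUMN = b"NUC_SEQ"

    class Seq(dict):
        @property
        def seq(self):
            return self[NUC_SEQUENCE_COLUMN]          # read through the dict key

        @seq.setter
        def seq(self, new_seq):
            # tobytes() stand-in: store everything as bytes
            self[NUC_SEQUENCE_COLUMN] = new_seq if isinstance(new_seq, bytes) else new_seq.encode()

    s = Seq()
    s.seq = "aacct"
    print(s.seq is s[NUC_SEQUENCE_COLUMN])   # True -- one storage location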

View File

@@ -58,12 +58,25 @@ cdef class View_NUC_SEQS(View):
 
         return view
 
-    # TODO
+    # TODO test time gain without
+    @OBIWrapper.checkIsActive
     def __getitem__(self, object item) :
         if type(item) == int :
             return Nuc_Seq_Stored(self, item)
         else :  # TODO assume str or bytes for optimization?
             return self.get_column(item)  # TODO hyper lent dans la pratique
 
+    @OBIWrapper.checkIsActive
+    def __iter__(self):
+        # Iteration on each line of all columns
+        # Declarations
+        cdef index_t line_nb
+        # Yield each line
+        for line_nb in range(self.line_count) :
+            yield Nuc_Seq_Stored(self, line_nb)
+
     # TODO? test if efficiency gain
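
The added __iter__ makes a view directly iterable: it yields one Nuc_Seq_Stored wrapper per line index rather than materializing lines up front. A reduced sketch of the pattern with stand-in classes:

    class LineProxy:                     # stand-in for Nuc_Seq_Stored
        def __init__(self, view, idx):
            self.view, self.idx = view, idx

    class ViewSketch:                    # stand-in for View_NUC_SEQS
        def __init__(self, line_count):
            self.line_count = line_count
        def __iter__(self):
            for line_nb in range(self.line_count):
                yield LineProxy(self, line_nb)   # one lazy wrapper per line

    print([line.idx for line in ViewSketch(3)])   # [0, 1, 2]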

View File

@@ -6,7 +6,9 @@ Created on 30 mars 2016
 
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+import types
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 def fastaIterator(lineiterator,
@@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
     while True:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :
@@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
 #                     definition,
 #                     tags=tags,
 #                    )
-
+        # TODO
         yield { "id" : ident,
                 "definition" : definition,
                 "sequence" : sequence,
@@ -105,65 +107,65 @@ def fastaNucIterator(lineiterator,
     cdef list s
     cdef bytes sequence
     cdef int lines_to_skip, ionly, read
-#    cdef OBI_Seq seq
+    cdef Nuc_Seq seq
 
     if only is None:
-        ionly=-1
+        ionly = -1
     else:
-        ionly=int(only)
+        ionly = int(only)
 
-    if isinstance(lineiterator,(str,bytes)):
+    if isinstance(lineiterator, (str, bytes)):
         lineiterator=uopen(lineiterator)
+
+    if isinstance(lineiterator, types.GeneratorType):
+        iterator = lineiterator
     if isinstance(lineiterator, LineBuffer):
-        lb=lineiterator
+        iterator = iter(lineiterator)
     else:
-        lb=LineBuffer(lineiterator,buffersize)
+        iterator = iter(LineBuffer(lineiterator, buffersize))
 
     skipped = 0
     read = 0
-    i = iter(lb)
 
     if firstline is None:
-        line = next(i)
+        line = next(iterator)
     else:
         line = firstline
 
     while True:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         while skipped < skip :
-            line = next(i)
+            line = next(iterator)
             try:
                 while line[0]!='>':
-                    line = next(i)
+                    line = next(iterator)
             except StopIteration:
                 pass
             skipped += 1
 
         ident,tags,definition = parseHeader(line)
         s = []
-        line = next(i)
+        line = next(iterator)
 
         try:
             while line[0]!='>':
                 s.append(str2bytes(line)[0:-1])
-                line = next(i)
+                line = next(iterator)
         except StopIteration:
             pass
 
         sequence = b"".join(s)
 
-#        seq =
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
         yield seq
 
 #    yield { "id" : ident,
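
The repeated `ionly >= 0 and read >= ionly` change across these parsers fixes a real bug: with only=None the iterators use ionly = -1 as a sentinel, and the old test `read >= ionly` was already true on the first record (0 >= -1), so nothing was ever yielded. A minimal reproduction of the fix:

    def take(items, only=None):
        ionly = -1 if only is None else int(only)
        read = 0
        for item in items:
            if ionly >= 0 and read >= ionly:   # old guard was just: read >= ionly
                break
            yield item
            read += 1

    print(list(take("abc")))           # ['a', 'b', 'c'] -- the old guard yielded []
    print(list(take("abc", only=2)))   # ['a', 'b']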

View File

@@ -6,7 +6,7 @@ Created on 30 mars 2016
 
 @author: coissac
 '''
 
-from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 
 def fastqIterator(lineiterator,
@@ -74,12 +74,11 @@ def fastqWithQualityIterator(lineiterator,
     else:
         hline = firstline
 
     for line in i:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
@@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      quality,qualityoffset,
-                      tags)
+                      definition=definition,
+                      quality=quality,
+                      offset=qualityoffset,
+                      tags=tags)
 
         yield seq
@@ -149,22 +149,23 @@ def fastqWithoutQualityIterator(lineiterator,
         hline = next(i)
     else:
         hline = firstline
 
     for line in i:
-        if read >= ionly:
+        if ionly >= 0 and read >= ionly:
             break
 
         ident,tags,definition = parseHeader(hline)
         sequence = str2bytes(line[0:-1])
         next(i)
         next(i)
 
         seq = Nuc_Seq(ident,
                       sequence,
-                      definition,
-                      None,-1,
-                      tags)
+                      definition=definition,
+                      quality=None,
+                      offset=-1,
+                      tags=tags)
 
         yield seq

View File

@@ -41,11 +41,11 @@ def entryIteratorFactory(lineiterator,
     if isinstance(lineiterator, LineBuffer):
         lb=lineiterator
     else:
-        lb=LineBuffer(lineiterator,buffersize)
+        lb=LineBuffer(lineiterator, buffersize)
 
     i = iter(lb)
 
     first=next(i)
 
     format=b"tabular"
@@ -61,26 +61,29 @@ def entryIteratorFactory(lineiterator,
         format=b"ecopcrfile"
     elif is_ngsfilter_line(first):
         format=b"ngsfilter"
 
+    # TODO Temporary fix
+    first=None
+    lineiterator.seek(0)
+
     if format==b'fasta':
         if seqtype == b'nuc':
             return (fastaNucIterator(lineiterator,
-                                     skip,only,
-                                     first),
+                                     skip=skip,only=only,
+                                     firstline=first,
+                                     buffersize=buffersize),
                     Nuc_Seq)
         else:
             raise NotImplementedError()
     elif format==b'fastq':
         return (fastqIterator(lineiterator,
-                              skip,only,
-                              qualityoffset,
-                              first),
+                              skip=skip,only=only,
+                              qualityoffset=qualityoffset,
+                              noquality=noquality,
+                              firstline=first,
+                              buffersize=buffersize),
                 Nuc_Seq)
 
     raise NotImplementedError('File format not yet implemented')
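
The "temporary fix" consumes the first line to sniff the format, then rewinds the stream and passes firstline=None so the parser re-reads from the start. A sketch of the pattern with a hypothetical sniffing rule; the caveat, and presumably why it is marked temporary, is that seek(0) only works on seekable inputs, not on pipes or generators:

    import io

    def entry_iterator(f):
        first = f.readline()                     # peek at the first line
        if first.startswith(">"):
            fmt = "fasta"
        elif first.startswith("@"):
            fmt = "fastq"
        else:
            fmt = "unknown"
        f.seek(0)                                # rewind: the parser re-reads line 1
        return fmt, iter(f)

    fmt, it = entry_iterator(io.StringIO(">seq1\nAACC\n"))
    print(fmt, next(it).strip())                 # fasta >seq1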

View File

@@ -364,19 +364,22 @@ def open_uri(uri,
             if qualifiers[b"seqtype"]==b"nuc":
                 objclass = Nuc_Seq
                 if format==b"fasta":
-                    iseq = fastaNucIterator(file,skip,only)
+                    iseq = fastaNucIterator(file,
+                                            skip=skip,
+                                            only=only)
                 elif format==b"fastq":
                     iseq = fastqIterator(file,
-                                         skip,only,
-                                         offset,
-                                         noquality)
+                                         skip=skip,
+                                         only=only,
+                                         offset=offset,
+                                         noquality=noquality)
                 else:
                     raise NotImplementedError('Sequence file format not implemented')
             elif qualifiers[b"seqtype"]==b"prot":
                 raise NotImplementedError()
         else:
             iseq,objclass = entryIteratorFactory(file,
-                                                 skip,only,
+                                                 skip, only,
                                                  seqtype,
                                                  offset,
                                                  noquality,
@@ -388,13 +391,12 @@ def open_uri(uri,
                                                  stripwhite,
                                                  blanklineskip,
                                                  commentchar)
 
-    tmpdms = get_temp_dms()
+    #tmpdms = get_temp_dms()
 
-    return (file,iseq,objclass,urib)
+    return (file, iseq, objclass, urib)

View File

@@ -136,7 +136,7 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
 /** ***************************************************************************
  * Check if the given element is in the bloom filter. Remember this may
- * return false positive if a collision occured.
+ * return false positive if a collision occurred.
  *
  * Parameters:
  * -----------
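
For context on why a collision only costs time, not correctness: a Bloom filter never returns a false negative, so a "not present" answer lets the caller skip the AVL lookup entirely, while a "maybe present" answer still has to be verified against the tree. A toy Python filter illustrating the contract (hypothetical hashing scheme):

    class BloomSketch:
        def __init__(self, m=1024, k=3):
            self.m, self.k, self.bits = m, k, 0

        def _positions(self, value):
            return [hash((i, value)) % self.m for i in range(self.k)]

        def add(self, value):
            for h in self._positions(value):
                self.bits |= 1 << h

        def __contains__(self, value):
            # True may be a collision (false positive); False is definitive
            return all(self.bits >> h & 1 for h in self._positions(value))

    bf = BloomSketch()
    bf.add(b"ACGT")
    print(b"ACGT" in bf)   # True -> still verify in the AVL
    print(b"TTTT" in bf)   # almost certainly False -> skip the AVL lookup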

View File

@@ -2463,7 +2463,7 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Check if the AVL group is writable
 	if (!(avl_group->writable))
 	{
-		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);
+		obi_set_errno(OBI_READ_ONLY_INDEXER_ERROR);	// Note: this error is read by the calling functions to clone the AVL group if needed
 		return -1;
 	}
@@ -2476,6 +2476,9 @@ index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_blob_p value)
 	// Add in the current AVL
 	index_in_avl = (int32_t) obi_avl_add((avl_group->sub_avls)[avl_group->last_avl_idx], value);
+	if (index_in_avl < 0)
+		return -1;
+
 	bloom_add(&((((avl_group->sub_avls)[avl_group->last_avl_idx])->header)->bloom_filter), value, obi_blob_sizeof(value));
 
 	// Build the index containing the AVL index

View File

@@ -32,7 +32,7 @@ Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_
 	Obi_blob_p blob;
 
 	// Allocate the memory for the blob structure
-	blob = (Obi_blob_p) malloc(sizeof(Obi_blob_t) + length_encoded_value);
+	blob = (Obi_blob_p) calloc(sizeof(Obi_blob_t) + length_encoded_value, sizeof(byte_t));
 	if (blob == NULL)
 	{
 		obi_set_errno(OBI_MALLOC_ERROR);

View File

@@ -240,7 +240,6 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	char* directory_name;
 	DIR*  dms_dir;
 	int   dms_file_descriptor;
-	size_t i, j;
 
 	// Build and check the directory name
 	directory_name = build_directory_name(dms_path);
@@ -318,7 +317,7 @@ OBIDMS_p obi_create_dms(const char* dms_path)
 	*/
 
 	// Create the informations file
-	if (create_dms_infos_file(dms_file_descriptor, basename(dms_path)) < 0)
+	if (create_dms_infos_file(dms_file_descriptor, basename((char*)dms_path)) < 0)
 		return NULL;
 
 	return obi_open_dms(dms_path);
@@ -333,7 +332,6 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 	int   infos_file_descriptor;
 	bool  little_endian_dms;
 	bool  little_endian_platform;
-	size_t i, j;
 
 	dms = NULL;
@@ -356,7 +354,7 @@ OBIDMS_p obi_open_dms(const char* dms_path)
 		i++;
 	}
 	*/
-	strcpy(dms->dms_name, basename(dms_path));
+	strcpy(dms->dms_name, basename((char*)dms_path));
 
 	// Build and check the directory name including the relative path
 	complete_dms_path = build_directory_name(dms_path);

View File

@@ -703,7 +703,7 @@ static int get_formatted_elt_names_length(const char* elements_names)
 static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line)
 {
-	return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line);
+	return getpagesize() / obi_sizeof(data_type) / nb_elements_per_line;
 }
@@ -919,6 +919,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	// The initial line count should be between the minimum (corresponding to the page size) and the maximum allowed
 	minimum_line_count = get_line_count_per_page(stored_data_type, nb_elements_per_line);
+	if (minimum_line_count == 0)	// Happens if high number of elements per line
+		minimum_line_count = 1;
 	if (nb_lines > MAXIMUM_LINE_COUNT)
 	{
 		obidebug(1, "\nCan't create column because of line count greater than the maximum allowed (%d)", MAXIMUM_LINE_COUNT);
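
Why the clamp is needed: get_line_count_per_page floors to 0 as soon as one line is larger than a page, and a zero minimum would size the column mapping to nothing. The reordered division in the earlier hunk computes the same floor for positive integers but avoids forming the intermediate product obi_sizeof(data_type) * nb_elements_per_line (a plausible overflow concern, though the commit does not say so). Sketch with hypothetical sizes:

    PAGE_SIZE = 4096   # stand-in for getpagesize()

    def line_count_per_page(elt_size, nb_elements_per_line):
        count = PAGE_SIZE // elt_size // nb_elements_per_line
        return max(count, 1)       # clamp: one line may span several pages

    print(line_count_per_page(4, 8))       # 128 lines fit in a page
    print(line_count_per_page(4, 10000))   # floors to 0 -> clamped to 1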
@@ -1023,7 +1025,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
 	if (new_column->data == MAP_FAILED)
 	{
 		obi_set_errno(OBICOL_UNKNOWN_ERROR);
-		obidebug(1, "\nError mmapping the data of a column");
+		obidebug(1, "\nError mmapping the data of a column.\nArguments: data_size=%lu, column_file_descriptor=%d, header_size=%lu",
+				 data_size, column_file_descriptor, header_size);
 		munmap(new_column->header, header_size);
 		close(column_file_descriptor);
 		free(new_column);