#cython: language_level=3

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View
from obitools3.dms.obiseq cimport Nuc_Seq_Stored
from obitools3.dms.view import RollbackException
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.taxo.taxo cimport Taxonomy  # used for the typed taxonomy arguments below
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN, \
                                        TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX, \
                                        REVERSE_QUALITY_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
from obitools3.apps.optiongroups import addMinimalInputOption, \
                                        addMinimalOutputOption, \
                                        addTaxonomyOption, \
                                        addEltLimitOption, \
                                        addNoProgressBarOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes, tostr, str2bytes

import sys
from cpython.exc cimport PyErr_CheckSignals
from io import BufferedWriter
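
# obi uniq dereplicates a NUC_SEQS view: records with an identical sequence
# (and identical values for every --category-attribute, if any) are grouped
# into a single record whose COUNT is the sum of the input counts, while
# per-tag information is accumulated into MERGED_* columns.
# Illustrative invocation (DMS and view names are hypothetical):
#
#     obi uniq -m sample my_dms/reads my_dms/uniq_reads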

__title__ = "Group sequence records together"


def addOptions(parser):

    addMinimalInputOption(parser)
    addTaxonomyOption(parser)
    addMinimalOutputOption(parser)
    addEltLimitOption(parser)
    addNoProgressBarOption(parser)

    group = parser.add_argument_group('obi uniq specific options')

    group.add_argument('--merge', '-m',
                       action="append", dest="uniq:merge",
                       metavar="<TAG NAME>",
                       default=[],
                       type=str,
                       help="Attributes to merge.")  # note: must be a column with one element per line, but columns that already contain merged information (named MERGED_*) are automatically re-merged

    group.add_argument('--merge-ids', '-e',
                       action="store_true", dest="uniq:mergeids",
                       default=False,
                       help="Add the merged key with all ids of merged sequences.")

    group.add_argument('--category-attribute', '-c',
                       action="append", dest="uniq:categories",
                       metavar="<Attribute Name>",
                       default=[],
                       help="Add one attribute to the list of attributes "
                            "used to group sequences before dereplication "
                            "(option can be used several times).")
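

# merge_taxonomy_classification annotates each record of the dereplicated
# view: TAXID_COLUMN is set to the last common ancestor of the taxids
# accumulated in MERGED_TAXID_COLUMN, and the species, genus and family ids,
# their scientific names, and the rank and scientific name of that ancestor
# are stored in dedicated columns.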
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config):

    cdef int taxid
    cdef Nuc_Seq_Stored seq
    cdef list m_taxids
    cdef bytes k
    cdef object tsp
    cdef object tgn
    cdef object tfa
    cdef object sp_sn
    cdef object gn_sn
    cdef object fa_sn

    # Create columns and save them for efficiency
    if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT:
        o_view.delete_column(b"species")
    if b"species" not in o_view:
        Column.new_column(o_view, b"species", OBI_INT)
    species_column = o_view[b"species"]

    if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT:
        o_view.delete_column(b"genus")
    if b"genus" not in o_view:
        Column.new_column(o_view, b"genus", OBI_INT)
    genus_column = o_view[b"genus"]

    if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT:
        o_view.delete_column(b"family")
    if b"family" not in o_view:
        Column.new_column(o_view, b"family", OBI_INT)
    family_column = o_view[b"family"]

    if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR:
        o_view.delete_column(b"species_name")
    if b"species_name" not in o_view:
        Column.new_column(o_view, b"species_name", OBI_STR)
    species_name_column = o_view[b"species_name"]

    if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR:
        o_view.delete_column(b"genus_name")
    if b"genus_name" not in o_view:
        Column.new_column(o_view, b"genus_name", OBI_STR)
    genus_name_column = o_view[b"genus_name"]

    if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR:
        o_view.delete_column(b"family_name")
    if b"family_name" not in o_view:
        Column.new_column(o_view, b"family_name", OBI_STR)
    family_name_column = o_view[b"family_name"]

    if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR:
        o_view.delete_column(b"rank")
    if b"rank" not in o_view:
        Column.new_column(o_view, b"rank", OBI_STR)
    rank_column = o_view[b"rank"]

    if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR:
        o_view.delete_column(b"scientific_name")
    if b"scientific_name" not in o_view:
        Column.new_column(o_view, b"scientific_name", OBI_STR)
    scientific_name_column = o_view[b"scientific_name"]

    # Initialize the progress bar
    if config['obi']['noprogressbar'] == False:
        pb = ProgressBar(len(o_view), config)
    else:
        pb = None

    i = 0
    for seq in o_view:
        PyErr_CheckSignals()
        if pb is not None:
            pb(i)
        if MERGED_TAXID_COLUMN in seq:
            m_taxids = []
            m_taxids_dict = seq[MERGED_TAXID_COLUMN]
            for k in m_taxids_dict.keys():
                if m_taxids_dict[k] is not None:
                    m_taxids.append(int(k))
            taxid = taxonomy.last_common_taxon(*m_taxids)
            seq[TAXID_COLUMN] = taxid
            tsp = taxonomy.get_species(taxid)
            tgn = taxonomy.get_genus(taxid)
            tfa = taxonomy.get_family(taxid)

            if tsp is not None:
                sp_sn = taxonomy.get_scientific_name(tsp)
            else:
                sp_sn = None  # TODO was '###', discuss
                tsp = None    # TODO was '-1', discuss

            if tgn is not None:
                gn_sn = taxonomy.get_scientific_name(tgn)
            else:
                gn_sn = None
                tgn = None

            if tfa is not None:
                fa_sn = taxonomy.get_scientific_name(tfa)
            else:
                fa_sn = None
                tfa = None

            species_column[i] = tsp
            genus_column[i] = tgn
            family_column[i] = tfa

            species_name_column[i] = sp_sn
            genus_name_column[i] = gn_sn
            family_name_column[i] = fa_sn

            rank_column[i] = taxonomy.get_rank(taxid)
            scientific_name_column[i] = taxonomy.get_scientific_name(taxid)
        i += 1

    if pb is not None:
        pb(len(o_view), force=True)
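

# uniq_sequences does the dereplication proper, in two passes over the input
# view. The first pass groups line indices by (category values..., sequence
# index) and gathers the element names needed to size the MERGED_* columns;
# the second pass writes one output record per group, summing COUNT values
# and accumulating per-key {value: count} dictionaries. Any MERGED_* column
# that would need more than max_elts elements per line is written instead as
# an evaluable string (OBI_STR column with to_eval=True).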
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config,
                    list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False,
                    list categories=None, int max_elts=1000000):

    cdef int i
    cdef int k
    cdef int k_count
    cdef int o_idx
    cdef int u_idx
    cdef int i_idx
    cdef int i_count
    cdef int o_count
    cdef str key_str
    cdef bytes key
    cdef bytes mkey
    cdef bytes merged_col_name
    cdef bytes o_id
    cdef bytes i_id
    cdef set mergedKeys_set
    cdef tuple unique_id
    cdef list catl
    cdef list mergedKeys
    cdef list mergedKeys_list_b
    cdef list mergedKeys_m
    cdef list str_merged_cols
    cdef list merged_sequences
    cdef dict uniques
    cdef dict merged_infos
    cdef dict mkey_infos
    cdef dict merged_dict
    cdef dict mkey_cols
    cdef Nuc_Seq_Stored i_seq
    cdef Nuc_Seq_Stored o_seq
    cdef Nuc_Seq_Stored u_seq
    cdef Column i_seq_col
    cdef Column i_id_col
    cdef Column i_taxid_col
    cdef Column i_taxid_dist_col
    cdef Column o_id_col
    cdef Column o_taxid_dist_col
    cdef Column o_merged_col
    cdef Column o_count_col
    cdef Column i_count_col
    cdef Column_line i_mcol
    cdef object taxid_dist_dict
    cdef object iter_view
    cdef object mcol
    cdef object to_merge
    cdef list merged_list

    uniques = {}

    for column_name in view.keys():
        if column_name[:7] == b"MERGED_":
            info_to_merge = column_name[7:]
            if mergedKeys_list is not None:
                mergedKeys_list.append(tostr(info_to_merge))
            else:
                mergedKeys_list = [tostr(info_to_merge)]

    mergedKeys_list_b = []
    if mergedKeys_list is not None:
        for key_str in mergedKeys_list:
            mergedKeys_list_b.append(tobytes(key_str))
        mergedKeys_set = set(mergedKeys_list_b)
    else:
        mergedKeys_set = set()

    if taxonomy is not None:
        mergedKeys_set.add(TAXID_COLUMN)

    mergedKeys = list(mergedKeys_set)

    k_count = len(mergedKeys)

    mergedKeys_m = []
    for k in range(k_count):
        mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])

    # Check that we are not trying to re-merge without total count information
    for key in mergedKeys_m:
        if key in view and COUNT_COLUMN not in view:
            raise Exception("\n>>>>\nError: trying to re-merge tags without a total count tag. Run obi annotate to add the count tag from the relevant merged tag, e.g.:\nobi annotate --set-tag COUNT:'sum([value for key,value in sequence['MERGED_sample'].items()])' dms/input dms/output\n")

    if categories is None:
        categories = []
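
    # First pass: group input line indices by (category values..., sequence).
    # Sketch with illustrative values: with categories=[b'sample'], two
    # records sharing a sequence (index 42 in the sequence indexer) but
    # coming from samples b'A' and b'B' get the keys (b'A', 42) and
    # (b'B', 42), and are therefore kept in separate groups.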

    # Keep columns that are going to be used a lot in variables
    i_seq_col = view[NUC_SEQUENCE_COLUMN]
    i_id_col = view[ID_COLUMN]
    if TAXID_COLUMN in view:
        i_taxid_col = view[TAXID_COLUMN]
    if TAXID_DIST_COLUMN in view:
        i_taxid_dist_col = view[TAXID_DIST_COLUMN]

    # First browsing
    i = 0
    o_idx = 0

    logger("info", "First browsing through the input")
    merged_infos = {}
    iter_view = iter(view)
    for i_seq in iter_view:
        PyErr_CheckSignals()
        if pb is not None:
            pb(i)

        # This can't be done on the same line as the unique_id tuple creation because it triggers a bug
        # where Cython (version 0.25.2) does not detect the reference to the categs_list variable and
        # deallocates it at the beginning of the function.
        # (Only happens if categs_list is an optional parameter, which it is.)
        catl = []
        for x in categories:
            catl.append(i_seq[x])

        #unique_id = tuple(catl) + (i_seq_col[i],)
        unique_id = tuple(catl) + (i_seq_col.get_line_idx(i),)
        #unique_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)  # The line that Cython can't read properly

        if unique_id in uniques:
            uniques[unique_id].append(i)
        else:
            uniques[unique_id] = [i]

        for k in range(k_count):
            key = mergedKeys[k]
            mkey = mergedKeys_m[k]
            if mkey in i_seq:
                if mkey not in merged_infos:
                    merged_infos[mkey] = {}
                    mkey_infos = merged_infos[mkey]
                    mkey_infos['nb_elts'] = view[mkey].nb_elements_per_line
                    mkey_infos['elt_names'] = view[mkey].elements_names
            if key in i_seq:
                if mkey not in merged_infos:
                    merged_infos[mkey] = {}
                    mkey_infos = merged_infos[mkey]
                    mkey_infos['nb_elts'] = 1
                    mkey_infos['elt_names'] = [i_seq[key]]
                else:
                    mkey_infos = merged_infos[mkey]
                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
                        mkey_infos['elt_names'].append(i_seq[key])
                        mkey_infos['nb_elts'] += 1
        i += 1

    # Create merged columns
    str_merged_cols = []
    mkey_cols = {}
    for k in range(k_count):
        key = mergedKeys[k]
        merged_col_name = mergedKeys_m[k]

        if merged_col_name in view:
            i_col = view[merged_col_name]
        else:
            i_col = view[key]

        if merged_infos[merged_col_name]['nb_elts'] > max_elts:
            str_merged_cols.append(merged_col_name)
            Column.new_column(o_view,
                              merged_col_name,
                              OBI_STR,
                              to_eval=True,
                              comments=i_col.comments,
                              alias=merged_col_name
                              )
        else:
            Column.new_column(o_view,
                              merged_col_name,
                              OBI_INT,
                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
                              elements_names=list(merged_infos[merged_col_name]['elt_names']),
                              comments=i_col.comments,
                              alias=merged_col_name
                              )

        mkey_cols[merged_col_name] = o_view[merged_col_name]
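
    # Resulting structure (values illustrative): MERGED_sample holds one
    # element per sample name, e.g. {b'sampleA': 12, b'sampleB': 3}; above
    # max_elts distinct names, the str() of that dict is stored instead in
    # an OBI_STR column flagged to_eval=True.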

    # taxid_dist column
    if mergeIds and TAXID_COLUMN in mergedKeys:
        if len(view) > max_elts:  # the number of different ids corresponds to the number of sequences in the view
            str_merged_cols.append(TAXID_DIST_COLUMN)
            Column.new_column(o_view,
                              TAXID_DIST_COLUMN,
                              OBI_STR,
                              to_eval=True,
                              alias=TAXID_DIST_COLUMN
                              )
        else:
            Column.new_column(o_view,
                              TAXID_DIST_COLUMN,
                              OBI_INT,
                              nb_elements_per_line=len(view),
                              elements_names=[id for id in i_id_col],
                              alias=TAXID_DIST_COLUMN
                              )

    del(merged_infos)
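
    # The TAXID_DIST column maps each input sequence id to its taxid, keeping
    # the taxonomic origin of every record merged into an output record.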

    # Merged ids column
    if mergeIds:
        Column.new_column(o_view,
                          MERGED_COLUMN,
                          OBI_STR,
                          tuples=True,
                          alias=MERGED_COLUMN
                          )

    # Keep columns in variables for efficiency
    o_id_col = o_view[ID_COLUMN]
    if TAXID_DIST_COLUMN in o_view:
        o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
    if MERGED_COLUMN in o_view:
        o_merged_col = o_view[MERGED_COLUMN]
    if COUNT_COLUMN not in o_view:
        Column.new_column(o_view,
                          COUNT_COLUMN,
                          OBI_INT)
    o_count_col = o_view[COUNT_COLUMN]
    if COUNT_COLUMN in view:
        i_count_col = view[COUNT_COLUMN]

    if pb is not None:
        pb(len(view), force=True)
        print("")
logger("info", "Second browsing through the input")
|
|
|
|
# Initialize the progress bar
|
|
if pb is not None:
|
|
pb = ProgressBar(len(view))
|
|
|
|
o_idx = 0
|
|
total_treated = 0
|
|
|
|
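
    # Second pass: for each group, merged_dict maps every MERGED_* key to a
    # {value: count} dict that is accumulated over the group's input records
    # and written to the corresponding output column below.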

    for unique_id in uniques:
        PyErr_CheckSignals()

        merged_sequences = uniques[unique_id]

        u_idx = uniques[unique_id][0]
        u_seq = view[u_idx]
        o_view[o_idx] = u_seq
        o_seq = o_view[o_idx]
        o_id = o_seq.id

        if mergeIds:
            merged_list = [view[idx].id for idx in merged_sequences]
            if MERGED_COLUMN in view:  # merge all ids if there is already some merged id info
                merged_list.extend(view[MERGED_COLUMN][idx] for idx in merged_sequences)
            merged_list = list(set(merged_list))  # deduplicate the list
            o_merged_col[o_idx] = merged_list

        o_count = 0

        if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
            taxid_dist_dict = i_taxid_dist_col[u_idx]
        else:
            taxid_dist_dict = {}

        merged_dict = {}
        for mkey in mergedKeys_m:
            merged_dict[mkey] = {}

        for i_idx in merged_sequences:
            PyErr_CheckSignals()

            if pb is not None:
                pb(total_treated)

            i_id = i_id_col[i_idx]
            i_seq = view[i_idx]

            if COUNT_COLUMN not in i_seq or i_count_col[i_idx] is None:
                i_count = 1
            else:
                i_count = i_count_col[i_idx]

            o_count += i_count

            for k in range(k_count):

                key = mergedKeys[k]
                mkey = mergedKeys_m[k]

                if key == TAXID_COLUMN and mergeIds:
                    if TAXID_DIST_COLUMN in i_seq:
                        taxid_dist_dict.update(i_taxid_dist_col[i_idx])
                    if TAXID_COLUMN in i_seq:
                        taxid_dist_dict[i_id] = i_taxid_col[i_idx]

                # merge relevant keys
                if key in i_seq:
                    to_merge = i_seq[key]
                    if to_merge is not None:
                        if type(to_merge) != bytes:
                            to_merge = tobytes(str(to_merge))
                        mcol = merged_dict[mkey]
                        if to_merge not in mcol or mcol[to_merge] is None:
                            mcol[to_merge] = i_count
                        else:
                            mcol[to_merge] = mcol[to_merge] + i_count
                        o_seq[key] = None

                # merged info already in the sequence: merge the merged info
                if mkey in i_seq:
                    mcol = merged_dict[mkey]   # dict
                    i_mcol = i_seq[mkey]       # column line
                    if i_mcol.is_NA() == False:
                        for key2 in i_mcol:
                            if key2 not in mcol:
                                mcol[key2] = i_mcol[key2]
                            else:
                                mcol[key2] = mcol[key2] + i_mcol[key2]

            for key in i_seq.keys():
                # Delete information that differs between the merged sequences
                # TODO make a special-columns list? // could be more efficient
                if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
                        and key not in merged_dict:
                    o_seq[key] = None

            total_treated += 1

        # Write merged dicts
        for mkey in merged_dict:
            if mkey in str_merged_cols:
                mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
            else:
                mkey_cols[mkey][o_idx] = merged_dict[mkey]
            # Set NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
            #for key in mkey_cols[mkey][o_idx]:
            #    if mkey_cols[mkey][o_idx][key] is None:
            #        mkey_cols[mkey][o_idx][key] = 0

        # Write taxid_dist
        if mergeIds and TAXID_COLUMN in mergedKeys:
            if TAXID_DIST_COLUMN in str_merged_cols:
                o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
            else:
                o_taxid_dist_col[o_idx] = taxid_dist_dict

        o_count_col[o_idx] = o_count
        o_idx += 1

    if pb is not None:
        pb(len(view), force=True)

    # Delete quality columns if present, because the matching between sequence
    # and quality would be broken (quality set to NA when the sequence is not)
    if QUALITY_COLUMN in view:
        o_view.delete_column(QUALITY_COLUMN)
    if REVERSE_QUALITY_COLUMN in view:
        o_view.delete_column(REVERSE_QUALITY_COLUMN)

    # Delete old columns that are now merged
    for k in range(k_count):
        if mergedKeys[k] in o_view:
            o_view.delete_column(mergedKeys[k])

    if taxonomy is not None:
        print("")  # TODO because in the middle of the progress bar. Better solution?
        logger("info", "Merging taxonomy classification")
        merge_taxonomy_classification(o_view, taxonomy, config)
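

# run() is the command entry point: it opens the input and output views,
# routes stdout output through a temporary view in the input DMS, calls
# uniq_sequences with the optional taxonomy, then records the command line
# in the view and DMS comments.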
def run(config):

    cdef tuple input
    cdef tuple output
    cdef tuple taxo_uri
    cdef Taxonomy taxo
    cdef View_NUC_SEQS entries
    cdef View_NUC_SEQS o_view
    cdef ProgressBar pb

    DMS.obi_atexit()

    logger("info", "obi uniq")

    # Open the input
    input = open_uri(config['obi']['inputURI'])
    if input is None:
        raise Exception("Could not read input view")
    if input[2] != View_NUC_SEQS:
        raise NotImplementedError('obi uniq only works on NUC_SEQS views')

    # Open the output
    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      newviewtype=View_NUC_SEQS)
    if output is None:
        raise Exception("Could not create output view")

    i_dms = input[0]
    entries = input[1]
    o_dms = output[0]
    output_0 = output[0]

    # If stdout output, create a temporary view that will be exported then deleted.
    if type(output_0) == BufferedWriter:
        temporary_view_name = b"temp"
        i = 0
        while temporary_view_name in i_dms:  # make sure the view name is unique in the input DMS
            temporary_view_name = temporary_view_name + b"_" + str2bytes(str(i))
            i += 1
        o_view_name = temporary_view_name
        o_dms = i_dms
        o_view = View_NUC_SEQS.new(i_dms, o_view_name)
    else:
        o_view = output[1]

    if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
        taxo_uri = open_uri(config['obi']['taxoURI'])
        if taxo_uri is None or taxo_uri[2] == bytes:
            raise RollbackException("Couldn't open taxonomy, rolling back view", o_view)
        taxo = taxo_uri[1]
    else:
        taxo = None

    # Initialize the progress bar
    if config['obi']['noprogressbar'] == False:
        pb = ProgressBar(len(entries), config)
    else:
        pb = None

    if len(entries) > 0:
        try:
            uniq_sequences(entries, o_view, pb, config,
                           mergedKeys_list=config['uniq']['merge'],
                           taxonomy=taxo,
                           mergeIds=config['uniq']['mergeids'],
                           categories=config['uniq']['categories'],
                           max_elts=config['obi']['maxelts'])
        except Exception as e:
            raise RollbackException("obi uniq error, rolling back view: " + str(e), o_view)

    if pb is not None:
        print("", file=sys.stderr)

    # Save command config in View and DMS comments
    command_line = " ".join(sys.argv[1:])
    input_dms_name = [input[0].name]
    input_view_name = [input[1].name]
    if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
        input_dms_name.append(config['obi']['taxoURI'].split("/")[-3])
        input_view_name.append("taxonomy/" + config['obi']['taxoURI'].split("/")[-1])
    o_view.write_config(config, "uniq", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
    o_dms.record_command_line(command_line)

    # stdout output: write to buffer
    if type(output_0) == BufferedWriter:
        logger("info", "Printing to output...")
        o_view.print_to_output(output_0, noprogressbar=config['obi']['noprogressbar'])
        o_view.close()

    #print("\n\nOutput view:\n````````````", file=sys.stderr)
    #print(repr(o_view), file=sys.stderr)

    # If stdout output, delete the temporary result view in the input DMS
    if type(output_0) == BufferedWriter:
        View.delete_view(i_dms, o_view_name)

    i_dms.close(force=True)
    o_dms.close(force=True)

    logger("info", "Done.")