New command: obi taxonomy to add local taxa (closes #64)

This commit is contained in:
Celine Mercier
2023-02-13 10:59:20 +13:00
parent eeb93afa7d
commit 2130a949c7
6 changed files with 353 additions and 91 deletions

View File

@ -0,0 +1,230 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View, Line_selection
from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption, addMinimalOutputOption, addNoProgressBarOption
from obitools3.dms.view import RollbackException
from obitools3.dms.column.column cimport Column
from functools import reduce
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes, str2bytes, tostr
from io import BufferedWriter
from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \
DEFINITION_COLUMN, \
QUALITY_COLUMN, \
COUNT_COLUMN, \
TAXID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT
from obitools3.dms.capi.obitaxonomy cimport MIN_LOCAL_TAXID
import time
import math
import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Add taxa with a new generated taxid to an NCBI taxonomy database"
def addOptions(parser):
addMinimalInputOption(parser)
addTaxonomyOption(parser)
addMinimalOutputOption(parser)
addNoProgressBarOption(parser)
group=parser.add_argument_group('obi taxonomy specific options')
group.add_argument('-n', '--taxon-name-tag',
action="store",
dest="taxonomy:taxon_name_tag",
metavar="<SCIENTIFIC_NAME_TAG>",
default=b"SCIENTIFIC_NAME",
help="Name of the tag giving the scientific name of the taxon "
"(default: 'SCIENTIFIC_NAME').")
# group.add_argument('-g', '--try-genus-match',
# action="store_true", dest="taxonomy:try_genus_match",
# default=False,
# help="Try matching the first word of <SCIENTIFIC_NAME_TAG> when can't find corresponding taxid for a taxon. "
# "If there is a match it is added in the 'parent_taxid' tag. (Can be used by 'obi taxonomy' to add the taxon under that taxid).")
group.add_argument('-a', '--restricting-ancestor',
action="store",
dest="taxonomy:restricting_ancestor",
metavar="<RESTRICTING_ANCESTOR>",
default=None,
help="Enables to restrict the addition of taxids under an ancestor specified by its taxid.")
group.add_argument('-t', '--taxid-tag',
action="store",
dest="taxonomy:taxid_tag",
metavar="<TAXID_TAG>",
default=b"TAXID",
help="Name of the tag to store the new taxid "
"(default: 'TAXID').")
group.add_argument('-l', '--log-file',
action="store",
dest="taxonomy:log_file",
metavar="<LOG_FILE>",
default='',
help="Path to a log file to write informations about added taxids.")
def run(config):
DMS.obi_atexit()
logger("info", "obi taxonomy")
# Open the input
input = open_uri(config['obi']['inputURI'])
if input is None:
raise Exception("Could not read input view")
i_dms = input[0]
i_view = input[1]
i_view_name = input[1].name
# Open the output: only the DMS, as the output view is going to be created by cloning the input view
# (could eventually be done via an open_uri() argument)
output = open_uri(config['obi']['outputURI'],
input=False,
dms_only=True)
if output is None:
raise Exception("Could not create output view")
o_dms = output[0]
output_0 = output[0]
o_view_name = output[1]
# stdout output: create temporary view
if type(output_0)==BufferedWriter:
o_dms = i_dms
i=0
o_view_name = b"temp"
while o_view_name in i_dms: # Making sure view name is unique in output DMS
o_view_name = o_view_name+b"_"+str2bytes(str(i))
i+=1
imported_view_name = o_view_name
# If the input and output DMS are not the same, import the input view in the output DMS before cloning it to modify it
# (could be the other way around: clone and modify in the input DMS then import the new view in the output DMS)
if i_dms != o_dms:
imported_view_name = i_view_name
i=0
while imported_view_name in o_dms: # Making sure view name is unique in output DMS
imported_view_name = i_view_name+b"_"+str2bytes(str(i))
i+=1
View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], i_view_name, imported_view_name)
i_view = o_dms[imported_view_name]
# Clone output view from input view
o_view = i_view.clone(o_view_name)
if o_view is None:
raise Exception("Couldn't create output view")
i_view.close()
# Open taxonomy
taxo_uri = open_uri(config['obi']['taxoURI'])
if taxo_uri is None or taxo_uri[2] == bytes:
raise Exception("Couldn't open taxonomy")
taxo = taxo_uri[1]
# Initialize the progress bar
if config['obi']['noprogressbar'] == False:
pb = ProgressBar(len(o_view), config)
else:
pb = None
try:
if config['taxonomy']['log_file']:
logfile = open(config['taxonomy']['log_file'], 'w')
else:
logfile = sys.stdout
if 'restricting_ancestor' in config['taxonomy']:
res_anc = int(config['taxonomy']['restricting_ancestor'])
else:
res_anc = None
taxid_column_name = config['taxonomy']['taxid_tag']
parent_taxid_column_name = "PARENT_TAXID" # TODO macro
taxon_name_column_name = config['taxonomy']['taxon_name_tag']
taxid_column = Column.new_column(o_view, taxid_column_name, OBI_INT)
if parent_taxid_column_name in o_view:
parent_taxid_column = o_view[parent_taxid_column_name]
else:
parent_taxid_column = None
#parent_taxid_column = Column.new_column(o_view, parent_taxid_column_name, OBI_INT)
taxon_name_column = o_view[taxon_name_column_name]
for i in range(len(o_view)):
PyErr_CheckSignals()
#if pb is not None:
# pb(i)
taxon_name = taxon_name_column[i]
taxon = taxo.get_taxon_by_name(taxon_name, res_anc)
if taxon is not None:
taxid_column[i] = taxon.taxid
if logfile:
print(f"Found taxon '{tostr(taxon_name)}' already existing with taxid {taxid_column[i]}", file=logfile)
else: # try finding genus or other parent taxon from the first word
#print(i, o_view[i].id)
if parent_taxid_column is not None and parent_taxid_column[i] is not None:
taxid_column[i] = taxo.add_taxon(taxon_name, 'species', parent_taxid_column[i])
if logfile:
print(f"Adding taxon '{tostr(taxon_name)}' under provided parent {parent_taxid_column[i]} with taxid {taxid_column[i]}", file=logfile)
else:
taxon_name_sp = taxon_name.split(b" ")
taxon = taxo.get_taxon_by_name(taxon_name_sp[0], res_anc)
if taxon is not None:
parent_taxid_column[i] = taxon.taxid
taxid_column[i] = taxo.add_taxon(taxon_name, 'species', taxon.taxid)
if logfile:
print(f"Adding taxon '{tostr(taxon_name)}' under '{tostr(taxon.name)}' ({taxon.taxid}) with taxid {taxid_column[i]}", file=logfile)
else:
taxid_column[i] = taxo.add_taxon(taxon_name, 'species', res_anc)
if logfile:
print(f"Adding taxon '{tostr(taxon_name)}' under provided restricting ancestor {res_anc} with taxid {taxid_column[i]}", file=logfile)
taxo.write(taxo.name, update=True)
except Exception, e:
raise RollbackException("obi taxonomy error, rollbacking view: "+str(e), o_view)
#if pb is not None:
# pb(i, force=True)
# print("", file=sys.stderr)
#logger("info", "\nTaxa already in the taxonomy: "+str(found_count)+"/"+str(len(o_view))+" ("+str(round(found_count*100.0/len(o_view), 2))+"%)")
#logger("info", "\nParent taxids found: "+str(parent_found_count)+"/"+str(len(o_view))+" ("+str(round(parent_found_count*100.0/len(o_view), 2))+"%)")
#logger("info", "\nTaxids not found: "+str(not_found_count)+"/"+str(len(o_view))+" ("+str(round(not_found_count*100.0/len(o_view), 2))+"%)")
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])
input_dms_name=[input[0].name]
input_view_name=[i_view_name]
if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
input_dms_name.append(config['obi']['taxoURI'].split("/")[-3])
input_view_name.append("taxonomy/"+config['obi']['taxoURI'].split("/")[-1])
o_view.write_config(config, "taxonomy", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
o_dms.record_command_line(command_line)
#print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_view), file=sys.stderr)
# stdout output: write to buffer
if type(output_0)==BufferedWriter:
logger("info", "Printing to output...")
o_view.print_to_output(output_0, noprogressbar=config['obi']['noprogressbar'])
o_view.close()
# If the input and the output DMS are different or if stdout output, delete the temporary imported view used to create the final view
if i_dms != o_dms or type(output_0)==BufferedWriter:
View.delete_view(o_dms, imported_view_name)
o_dms.close(force=True)
i_dms.close(force=True)
logger("info", "Done.")

View File

@ -58,7 +58,7 @@ cdef extern from "obidms_taxonomy.h" nogil:
OBIDMS_taxonomy_p obi_read_taxdump(const_char_p taxdump)
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p tax_name)
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p tax_name, bint update)
int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)

View File

@ -19,8 +19,8 @@ cdef class Taxonomy(OBIWrapper) :
cpdef Taxon get_taxon_by_idx(self, int idx)
cpdef Taxon get_taxon_by_taxid(self, int taxid)
cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=*)
cpdef write(self, object prefix)
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*)
cpdef write(self, object prefix, bint update=*)
cpdef int add_taxon(self, object name, object rank_name, int parent_taxid, int min_taxid=*)
cpdef object get_species(self, int taxid)
cpdef object get_genus(self, int taxid)
cpdef object get_family(self, int taxid)

View File

@ -174,6 +174,7 @@ cdef class Taxonomy(OBIWrapper) :
cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=None):
#print(taxon_name)
taxon = self._name_dict.get(tobytes(taxon_name), None)
if not taxon:
return None
@ -282,12 +283,12 @@ cdef class Taxonomy(OBIWrapper) :
yield Taxon(taxon_capsule, self)
cpdef write(self, object prefix) :
if obi_write_taxonomy(self._dms.pointer(), self.pointer(), tobytes(prefix)) < 0 :
cpdef write(self, object prefix, bint update=False) :
if obi_write_taxonomy(self._dms.pointer(), self.pointer(), tobytes(prefix), update) < 0 :
raise Exception("Error writing the taxonomy to binary files")
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=10000000) :
cpdef int add_taxon(self, object name, object rank_name, int parent_taxid, int min_taxid=10000000) :
cdef int taxid
taxid = obi_taxo_add_local_taxon(self.pointer(), tobytes(name), tobytes(rank_name), parent_taxid, min_taxid)
if taxid < 0 :
@ -329,6 +330,7 @@ cdef class Taxonomy(OBIWrapper) :
if taxon is not None:
while taxon.taxid != 1:
yield taxon
#print(taxon.taxid)
taxon = taxon.parent
yield taxon
else:

View File

@ -1092,7 +1092,7 @@ static int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxo
free(taxonomy_path);
// Create file
file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777);
if (file_descriptor < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
@ -1196,7 +1196,7 @@ static int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* t
free(taxonomy_path);
// Create file
file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777);
if (file_descriptor < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
@ -1472,7 +1472,7 @@ static int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxo
free(taxonomy_path);
// Create file
file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777);
if (file_descriptor < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
@ -1760,7 +1760,7 @@ static int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax
free(taxonomy_path);
// Create file
file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777);
if (file_descriptor < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
@ -3250,15 +3250,15 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
}
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name)
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name, bool update)
{
char* taxonomy_path;
if (!update) {
// Build the taxonomy directory path
taxonomy_path = get_taxonomy_path(dms, tax_name);
if (taxonomy_path == NULL)
return -1;
// Try to create the directory
if (mkdir(taxonomy_path, 00777) < 0)
{
@ -3268,8 +3268,8 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name
free(taxonomy_path);
return -1;
}
free(taxonomy_path);
}
if (write_ranks_idx(dms, tax, tax_name) < 0)
return -1;
@ -3279,18 +3279,19 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name
return -1;
if (write_merged_idx(dms, tax, tax_name) < 0)
return -1;
// Check if there are local taxa (if so last taxon is local)
if ((tax->taxa)->local_count > 0)
{
if (write_local_taxonomy_idx(dms, tax, tax_name) < 0)
return -1;
}
// Write preferred names if there are some
if (tax->preferred_names != NULL)
{
if (write_preferred_names_idx(dms, tax, tax_name) < 0)
return -1;
}
// Write local taxa if there are some
if ((tax->taxa)->local_count > 0)
{
if (write_local_taxonomy_idx(dms, tax, tax_name) < 0)
return -1;
}
return 0;
}
@ -3302,16 +3303,17 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
if (taxonomy)
{
// Update local informations (local taxa and preferred names) if there are any
if ((taxonomy->taxa)->local_count > 0)
{
if (taxonomy->dms == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss
}
if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0)
return -1;
}
// Done with write_taxo, edits all needed files. Only ldx file was edited in OBI1 but it led to issues. Discussable
// if ((taxonomy->taxa)->local_count > 0)
// {
// if (taxonomy->dms == NULL)
// {
// obi_set_errno(OBI_TAXONOMY_ERROR);
// obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss
// }
// if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0)
// return -1;
// }
// Write preferred names if there are some
if (taxonomy->preferred_names)
@ -3377,9 +3379,10 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid)
{
int32_t taxid;
int32_t count;
ecotx_t* taxon;
int i;
// econame_t* name_struct;
econame_t* name_struct;
// Enlarge the structure memory for a new taxon
tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1));
@ -3441,42 +3444,65 @@ int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char
((tax->taxa)->local_count)++;
(tax->taxa)->buffer_size = (tax->taxa)->count;
// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1
// // Allocate memory for new name
// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1));
// if (tax->names == NULL)
// {
// obi_set_errno(OBI_MALLOC_ERROR);
// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon");
// return -1;
// }
//
// // Add new name
// name_struct = (tax->names)->names + ((tax->names)->count);
// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char));
// if (name_struct->name == NULL)
// {
// obi_set_errno(OBI_MALLOC_ERROR);
// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon");
// return -1;
// }
// strcpy(name_struct->name, name);
// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char));
// if (name_struct->class_name == NULL)
// {
// obi_set_errno(OBI_MALLOC_ERROR);
// obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon");
// return -1;
// }
// strcpy(name_struct->class_name, "scientific name");
// name_struct->is_scientific_name = true;
// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1;
//
// // Sort names in alphabetical order
// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names);
//
// // Update name count
// ((tax->names)->count)++;
// Add new name in names structure // On the OBI1, the new name was not added in the .ndx file but it could create issues
// Allocate memory for new name
tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1));
if (tax->names == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon");
return -1;
}
// Add new name
name_struct = (tax->names)->names + ((tax->names)->count);
name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char));
if (name_struct->name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for a taxon name to add a new taxon");
return -1;
}
strcpy(name_struct->name, name);
name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char));
if (name_struct->class_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon");
return -1;
}
strcpy(name_struct->class_name, "scientific name");
name_struct->is_scientific_name = true;
name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1;
// Update name count
((tax->names)->count)++;
// Sort names in alphabetical order
qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names);
// Add to merged index
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * ((tax->merged_idx)->count + 1));
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a taxonomy structure");
return -1;
}
count = (tax->merged_idx)->count;
(tax->merged_idx)->count = count + 1;
(tax->merged_idx)->merged[count].taxid = taxid;
(tax->merged_idx)->merged[count].idx = taxon->idx;
//fprintf(stderr, "\nEntered in merged taxon.idx=%d", (tax->merged_idx)->merged[(tax->merged_idx)->count -1].idx);
//fprintf(stderr, "\nEntered in merged taxon.taxid=%d", (tax->merged_idx)->merged[(tax->merged_idx)->count -1].taxid);
//fprintf(stderr, "\nEntered in merged at %d", (tax->merged_idx)->count -1);
//taxon = obi_taxo_get_taxon_with_taxid(tax, taxid);
//fprintf(stderr, "\ntaxon=%x", taxon);
//fprintf(stderr, "\ntaxon.taxid=%d", taxon->taxid);
//fprintf(stderr, "\ntaxon.name=%s", taxon->name);
//fprintf(stderr, "\ntaxon.idx=%d\n\n", ((tax->merged_idx)->count));
return taxid;
}
@ -3547,11 +3573,12 @@ int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon
name_struct->is_scientific_name = false;
name_struct->taxon = taxon;
// Update preferred name count
((tax->preferred_names)->count)++;
// Sort preferred names in alphabetical order
qsort((tax->preferred_names)->names, (tax->preferred_names)->count, sizeof(econame_t), cmp_names);
// Update preferred name count
((tax->preferred_names)->count)++;
return 0;
}
@ -3669,8 +3696,10 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
else if (indexed_taxon->idx == -1)
current_taxon = NULL; // TODO discuss what to do when old deleted taxon
else
{
current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx);
//fprintf(stderr, "\n>>>idx %d, taxid %d<<<\n", indexed_taxon->idx, indexed_taxon->taxid);
}
return current_taxon;
}

View File

@ -239,6 +239,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
* @param dms A pointer on the DMS to which the taxonomy belongs.
* @param tax A pointer on the taxonomy structure.
* @param tax_name The name (prefix) of the taxonomy.
* @param update Whether files should be rewritten or if it's a new taxonomy (set to true e.g. after adding local taxa).
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
@ -247,7 +248,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
* @since 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name);
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name, bool update);
/**