Files
ecoprimers/src/libecoprimer/taxstats.c
2010-11-29 22:57:56 +00:00

264 lines
6.1 KiB
C

/*
* taxstats.c
*
* Created on: 12 mars 2009
* Author: coissac
*/
#include <search.h>
#include "ecoprimer.h"
static int cmptaxon(const void *t1, const void* t2);
static int cmptaxon(const void *t1, const void* t2)
{
const size_t taxid1=(size_t)t1;
const size_t taxid2=(size_t)t2;
// fprintf(stderr,"==> counted taxid1 : %d\n",taxid1);
// fprintf(stderr,"==> counted taxid2 : %d\n",taxid2);
if (taxid1 < taxid2)
return -1;
if (taxid1 > taxid2)
return +1;
return 0;
}
int32_t counttaxon(int32_t taxid)
{
static void* taxontree=NULL;
static int32_t taxoncount=0;
// fprintf(stderr,"counted taxid : %d taxontree %p\n",taxid,taxontree);
if (taxid==-1)
{
if (taxontree)
ECOFREE(taxontree,"Free taxon tree");
taxontree=NULL;
taxoncount=0;
return 0;
}
if ((taxid > 0) && ((!taxontree) || (!tfind((void*)((size_t)taxid),&taxontree,cmptaxon))))
{
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
taxoncount++;
}
return taxoncount;
}
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
poptions_t options)
{
uint32_t i;
ecotx_t *taxon;
ecotx_t *tmptaxon;
counttaxon(-1);
options->intaxa = 0;
for (i=0;i<seqdbsize;i++)
{
taxon = &(taxonomy->taxons->taxon[seqdb[i]->taxid]);
seqdb[i]->isexample=isExampleTaxon(taxonomy,seqdb[i]->taxid,options);
tmptaxon = eco_findtaxonatrank(taxon,
options->taxonrankidx);
// fprintf(stderr,"Taxid : %d %p\n",taxon->taxid,tmptaxon);
if (tmptaxon)
{
// fprintf(stderr,"orig : %d trans : %d\n",taxon->taxid,
// tmptaxon->taxid);
seqdb[i]->ranktaxonid=tmptaxon->taxid;
if (seqdb[i]->isexample)
options->intaxa = counttaxon(tmptaxon->taxid);
}
else
seqdb[i]->ranktaxonid=-1;
}
counttaxon(-1);
options->outtaxa = 0;
for (i=0;i<seqdbsize;i++)
{
if (seqdb[i]->ranktaxonid>=0 && !seqdb[i]->isexample)
options->outtaxa = counttaxon(seqdb[i]->ranktaxonid);
}
return options->outtaxa + options->intaxa;
}
float taxonomycoverage(ppair_t pair, poptions_t options)
{
int32_t seqcount;
int32_t i;
int32_t incount=0;
int32_t outcount=0;
seqcount=pair->pcr.ampcount;
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (!pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid)
outcount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
pair->intaxa=incount;
pair->outtaxa=outcount;
pair->bc=(float)incount/options->intaxa;
return pair->bc;
}
static int cmpamp(const void *ampf1, const void* ampf2)
{
int i;
int j = 0;
int incr = 1;
char cd1;
char cd2;
int chd = 0;
int len = 0;
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
if (pampf1->strand != pampf2->strand)
{
incr = -1;
j = pampf1->length - 1;
if (pampf2->strand)
{
pampf1 = (pamptotaxon_t) ampf2;
pampf2 = (pamptotaxon_t) ampf1;
chd = 1;
}
}
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
for (i = 0; i < len; i++, j += incr)
{
cd1 = pampf1->amplifia[i];
if (incr == -1)
cd2 = ecoComplementChar(pampf2->amplifia[j]);
else
cd2 = pampf2->amplifia[j];
if (cd1 < cd2) return chd ? 1: -1;
if (cd2 < cd1) return chd ? -1: 1;
}
if (pampf1->length > pampf2->length) return chd ? -1: 1;
if (pampf2->length > pampf1->length) return chd ? 1: -1;
return 0;
}
void twalkaction (const void *node, VISIT order, int level)
{
int32_t *taxid = (int32_t*)node;
//const size_t taxid=(size_t)node;
//printf ("\t%d:%p, ", *taxid, node);
counttaxon(*taxid);
}
int32_t gtxid;
void twalkaction2 (const void *node, VISIT order, int level)
{
int32_t *pt = (int32_t *) node;
gtxid = *pt;
}
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
{
uint32_t i, j;
uint32_t ampfindex = 0;
int32_t taxid;
void *ampftree = NULL;
pamptotaxon_t pcurrentampf;
pamptotaxon_t *ptmp;
pamptotaxon_t ampfwithtaxtree = ECOMALLOC(sizeof(amptotaxon_t) * pair->pcr.ampcount,"Cannot allocate amplifia tree");
for (i = 0; i < pair->pcr.ampcount; i++)
{
/*populate taxon ids tree against each unique amplifia
i.e set of taxon ids for each amplifia*/
if (pair->pcr.amplifias[i].sequence->isexample)
{
ampfwithtaxtree[ampfindex].amplifia = pair->pcr.amplifias[i].amplifia;
ampfwithtaxtree[ampfindex].strand = pair->pcr.amplifias[i].strand;
ampfwithtaxtree[ampfindex].length = pair->pcr.amplifias[i].length;
pcurrentampf = &ampfwithtaxtree[ampfindex];
taxid = pair->pcr.amplifias[i].sequence->ranktaxonid;
ptmp = tfind((const void*)pcurrentampf, &ampftree, cmpamp);
if (ptmp == NULL)
{
pcurrentampf = &ampfwithtaxtree[ampfindex];
tsearch((void*)pcurrentampf,&ampftree,cmpamp);
ampfindex++;
}
else
pcurrentampf = *ptmp;
if (tfind((void*)((size_t)taxid), &(pcurrentampf->taxontree), cmptaxon) == NULL)
{
pcurrentampf->taxoncount++;
tsearch((void*)((size_t)taxid),&(pcurrentampf->taxontree),cmptaxon);
}
}
}
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
counttaxon(-1);
for (i = 0; i < ampfindex; i++)
{
if (ampfwithtaxtree[i].taxoncount > 1)
twalk(ampfwithtaxtree[i].taxontree, twalkaction);
//TR 5/9/10 - added code for well identified seqs
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
{
gtxid = -1;
twalk(ampfwithtaxtree[i].taxontree, twalkaction2);
if (gtxid != -1)
{
for (j = 0; j < seqdbsize; j++)
if (seqdb[j]->ranktaxonid == gtxid
&&(pair->p1->directCount[j] > 0
|| pair->p1->reverseCount[j] > 0)
&& (pair->p2->directCount[j] > 0
|| pair->p2->reverseCount[j] > 0))
{
pair->wellIdentifiedSeqs[j] = 1;
}
}
}
}
pair->notwellidentifiedtaxa = counttaxon(-2);
pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
ECOFREE (ampfwithtaxtree, "Free amplifia table");
}