Merge of eric-test branche to the trunk

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@200 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2009-04-20 08:38:41 +00:00
parent b8af5dd65f
commit e3d922e103
17 changed files with 1308 additions and 264 deletions

View File

@ -7,34 +7,40 @@
#include "ecoprimer.h"
#include <string.h>
#include <stdlib.h>
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options);
static void buildPrimerPairsForOneSeq(uint32_t seqid,
pecodnadb_t seqdb,
pprimercount_t primers,
ppairtree_t pairs,
poptions_t options);
int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size)
{
uint32_t i;
for (i = 0; i < size; i++)
{
if (w1 == pairlist[i].w1 && w2 == pairlist[i].w2) return i;
if (w1 == pairlist[i].w2 && w2 == pairlist[i].w1) return i;
}
return -1;
}
char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
/*************************************
*
* pair collection management
*
*************************************/
#ifdef MASKEDCODE
char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid)
{
uint32_t i;
uint32_t j;
char *ampused = NULL;
if(pair->ampsetcount == 0)
{
pair->ampsetcount = 500;
pair->ampsetindex = 0;
pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set");
}
for (i = 0; i < pair->ampsetindex; i++)
{
if (strcmp (pair->ampset[i].amplifia, amplifia) == 0)
@ -43,43 +49,43 @@ char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
break;
}
}
if (i == 0)
{
pair->ampset[i].seqidcount = 100;
pair->ampset[i].seqidindex = 0;
pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table");
}
if (pair->ampsetindex == pair->ampsetcount)
{
pair->ampsetcount += 500;
pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set");
}
if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount)
{
pair->ampset[i].seqidcount += 100;
pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table");
}
if (pair->ampset[i].amplifia == NULL)
{
pair->ampset[i].amplifia = amplifia;
pair->ampsetindex++;
}
for (j = 0; j < pair->ampset[i].seqidindex; j++)
{
if (pair->ampset[i].taxonids[j] == taxid) break;
}
if (j == pair->ampset[i].seqidindex)
pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid;
return ampused;
}
void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia)
{
uint32_t i;
uint32_t j;
@ -90,42 +96,42 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
pair->taxsetindex = 0;
pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set");
}
for (i = 0; i < pair->taxsetindex; i++)
{
if (pair->taxset[i].taxonid == taxid) break;
}
if (i == 0)
{
pair->taxset[i].amplifiacount = 100;
pair->taxset[i].amplifiaindex = 0;
pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table");
}
if (pair->taxsetindex == pair->taxsetcount)
{
pair->taxsetcount += 500;
pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set");
}
if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount)
{
pair->taxset[i].amplifiacount += 100;
pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table");
}
if (pair->taxset[i].taxonid == 0)
{
pair->taxset[i].taxonid = taxid;
pair->taxsetindex++;
}
for (j = 0; j < pair->taxset[i].amplifiaindex; j++)
{
if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break;
}
if (j == pair->taxset[i].amplifiaindex)
{
pair->taxset[i].amplifia[j] = amplifia;
@ -135,140 +141,62 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
{
fprintf(stderr,"start : %d length : %d\n",start,len);
char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
char *seqc = &seq->SQ[start];
strncpy(amplifia, seqc, len);
return amplifia;
}
#endif
/*TR: Added*/
pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
{
uint32_t i;
uint32_t j;
uint32_t k;
uint32_t d;
uint32_t strt;
uint32_t end;
uint32_t paircount = 0;
uint32_t pairslots = 500;
int32_t foundindex;
ppairs_t pairs;
pairscount_t primerpairs;
primermatchcount_t seqmatchcount;
word_t w1;
word_t w2;
char *amplifia;
char *oldamp;
ppairtree_t primerpairs;
pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table");
primerpairs = initpairtree(NULL);
for (i=0; i < seqdbsize; i++)
{
seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options);
if (seqmatchcount.matchcount == 0) continue;
for (j=0; j < seqmatchcount.matchcount; j++)
{
strt = 0;
w1 = seqmatchcount.matches[j].word;
/*first word should b on direct strand*/
if (!seqmatchcount.matches[j].strand)
w1 = ecoComplementWord(w1, options->primer_length);
else
strt = options->primer_length;
for (k=j+1; k < seqmatchcount.matchcount; k++)
{
end = 0;
w2 = seqmatchcount.matches[k].word;
/*second word should be on reverse strand*/
if (seqmatchcount.matches[k].strand)
w2 = ecoComplementWord(w2, options->primer_length);
else
end = options->primer_length;
if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue;
if (w1 == w2) continue;
d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position;
if (d >= options->lmin && d <= options->lmax)
{
/*get amplified string*/
amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end);
foundindex = pairinlist(pairs, w1, w2, paircount);
if (foundindex != -1) /*pair is found*/
{
if (seqdb[i]->isexample)
pairs[foundindex].inexample++;
else
pairs[foundindex].outexample++;
if (pairs[foundindex].mind > d) pairs[foundindex].mind = d;
else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d;
oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid);
/*if exact same string is already in amplifia set then use that for taxon set, it will help for
* calculating the fully identified taxons i.e specificity, we will compare pointrs instead of strings
* because same string means same pointer*/
if (oldamp)
{
ECOFREE (amplifia, "free amplifia");
amplifia = oldamp;
}
addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia);
continue;
}
if (paircount == pairslots)
{
pairslots += 500;
pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table");
}
pairs[paircount].w1 = w1;
pairs[paircount].w2 = w2;
if (seqdb[i]->isexample) pairs[paircount].inexample = 1;
else pairs[paircount].outexample = 1;
pairs[paircount].mind = d;
pairs[paircount].maxd = d;
oldamp = addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid);
addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia);
paircount++;
}
else if (d > options->lmax)
break; /*once if the distance is greater than lmax then it will keep on increasing*/
}
}
ECOFREE(seqmatchcount.matches, "Cannot free matches table");
buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options);
}
primerpairs.pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table");
primerpairs.paircount = paircount;
return primerpairs;
}
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options)
{
uint32_t i,j,k;
uint32_t matchcount=0;
pprimermatch_t matches = NULL;
primermatchcount_t seqmatchcount;
#define DMAX (2000000000)
static void buildPrimerPairsForOneSeq(uint32_t seqid,
pecodnadb_t seqdb,
pprimercount_t primers,
ppairtree_t pairs,
poptions_t options)
{
static uint32_t paircount=0;
uint32_t i,j,k;
uint32_t matchcount=0;
pprimermatch_t matches = NULL;
primermatchcount_t seqmatchcount;
ppair_t pcurrent;
pair_t current;
pprimer_t wswp;
bool_t bswp;
size_t distance;
bool_t strand;
seqmatchcount.matchcount = 0;
seqmatchcount.matches = NULL;
for (i=0;i < primers->size; i++)
{
matchcount+=primers->primers[i].directCount[seqid];
matchcount+=primers->primers[i].reverseCount[seqid];
}
if (matchcount <= 0) return seqmatchcount;
if (matchcount <= 0)
return;
matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
for (i=0,j=0;i < primers->size; i++)
@ -277,17 +205,15 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
{
if (primers->primers[i].directCount[seqid]==1)
{
matches[j].word = primers->primers[i].word;
matches[j].primer = primers->primers+i;
matches[j].strand=TRUE;
matches[j].good=primers->primers[i].good;/*TR: Added*/
matches[j].position=primers->primers[i].directPos[seqid].value;
j++;
}
else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
{
matches[j].word = primers->primers[i].word;
matches[j].primer = primers->primers+i;
matches[j].strand=TRUE;
matches[j].good=primers->primers[i].good;/*TR: Added*/
matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
}
}
@ -296,26 +222,144 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
{
if (primers->primers[i].reverseCount[seqid]==1)
{
matches[j].word = primers->primers[i].word;
matches[j].primer = primers->primers+i;
matches[j].strand=FALSE;
matches[j].good=primers->primers[i].good;/*TR: Added*/
matches[j].position=primers->primers[i].reversePos[seqid].value;
j++;
}
else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
{
matches[j].word = primers->primers[i].word;
matches[j].primer = primers->primers+i;
matches[j].strand=FALSE;
matches[j].good=primers->primers[i].good;/*TR: Added*/
matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
}
}
}
sortmatch(matches,matchcount); // sort in asscending order by position
/*TR: Added*/
seqmatchcount.matches = matches;
seqmatchcount.matchcount = matchcount;
return seqmatchcount;
if (matchcount>1)
{
// fprintf(stderr,"\n====================================\n");
sortmatch(matches,matchcount); // sort in ascending order by position
for (i=0; i < matchcount;i++)
{
// For all primers matching the sequence
for(j=i+1;
(j<matchcount)
&& ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
j++
)
// For all not too far primers
if ( (matches[i].primer->good || matches[j].primer->good)
&& (distance > options->lmin)
)
{
// If possible primer pair
current.p1 = matches[i].primer;
current.asdirect1=matches[i].strand;
current.p2 = matches[j].primer;
current.asdirect2= !matches[j].strand;
current.maxd=DMAX;
current.mind=DMAX;
current.sumd=0;
current.inexample=0;
current.outexample=0;
// Standardize the pair
strand = current.p2->word > current.p1->word;
if (!strand)
{
wswp = current.p1;
current.p1=current.p2;
current.p2=wswp;
bswp = current.asdirect1;
current.asdirect1=current.asdirect2;
current.asdirect2=bswp;
}
// Look for the new pair in already seen pairs
pcurrent = insertpair(current,pairs);
if (seqdb[seqid]->isexample)
{
pcurrent->inexample++;
pcurrent->sumd+=distance;
if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd))
pcurrent->maxd = distance;
if (distance < pcurrent->mind)
pcurrent->mind = distance;
}
else
pcurrent->outexample++;
if ((pcurrent->outexample+pcurrent->inexample)==1)
{
paircount++;
pcurrent->pcr.ampslot=200;
pcurrent->pcr.ampcount=0;
pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot,
"Cannot allocate amplifia table");
}
else
{
if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount)
{
pcurrent->pcr.ampslot+=200;
pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias,
sizeof(amplifia_t)*pcurrent->pcr.ampslot,
"Cannot allocate amplifia table");
}
}
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance;
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid];
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand;
if (strand)
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[i].position + options->primer_length;
else
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[j].position - 1 ;
pcurrent->pcr.ampcount++;
// fprintf(stderr,"%c%c W1 : %s direct : %c",
// "bG"[(int)pcurrent->p1->good],
// "bG"[(int)pcurrent->p2->good],
// ecoUnhashWord(pcurrent->p1->word, options->primer_length),
// "><"[(int)pcurrent->asdirect1]
// );
//
// fprintf(stderr," W2 : %s direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n",
// ecoUnhashWord(pcurrent->p2->word, options->primer_length),
// "><"[(int)pcurrent->asdirect2],
// distance,
// pcurrent->mind,pcurrent->maxd,
// (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0,
// pcurrent->inexample,pcurrent->outexample,
// " N"[(pcurrent->outexample+pcurrent->inexample)==1],
// paircount
//
// );
//
}
}
}
pairs->count=paircount;
}