322 lines
9.0 KiB
C
322 lines
9.0 KiB
C
|
/*
|
|||
|
* pairs.c
|
|||
|
*
|
|||
|
 *  Created on: 15 Dec. 2008
|
|||
|
* Author: coissac
|
|||
|
*/
|
|||
|
|
|||
|
#include "ecoprimer.h"
|
|||
|
#include <string.h>
|
|||
|
|
|||
|
/* Forward declaration: the function is defined at the end of this file but is
 * called earlier, from buildPrimerPairs(). */
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options);
|
|||
|
|
|||
|
/*
 * Search the first `size` entries of `pairlist` for the primer pair (w1, w2).
 *
 * The order of the two words is irrelevant: an entry storing (w2, w1) is
 * reported as a hit as well.
 *
 * Returns the index of the first matching entry, or -1 when none matches.
 */
int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size)
{
	uint32_t idx = 0;

	while (idx < size)
	{
		int sameorder    = (pairlist[idx].w1 == w1) && (pairlist[idx].w2 == w2);
		int swappedorder = (pairlist[idx].w2 == w1) && (pairlist[idx].w1 == w2);

		if (sameorder || swappedorder)
			return idx;

		idx++;
	}

	return -1;
}
|
|||
|
|
|||
|
/*
 * Record that the amplified string `amplifia` was produced by `pair` for
 * taxon `taxid`.
 *
 * pair->ampset holds one entry per distinct amplified string (compared with
 * strcmp); each entry keeps the list of distinct taxon ids seen for it.
 * The set and the per-entry taxon tables are created lazily and grown in
 * steps of 500 and 100 elements respectively.
 *
 * Returns:
 *   - the pointer already stored in the set when an equal string is present,
 *     so that the caller can release its own copy and share the stored one;
 *   - NULL when the string was not present yet.  In that case the set keeps
 *     the `amplifia` pointer itself (no copy is made).
 */
char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
{
	ampseqset_t *set;
	uint32_t i;
	uint32_t j;

	/* First use of this pair: create the amplifia set. */
	if (pair->ampsetcount == 0)
	{
		pair->ampsetcount = 500;
		pair->ampsetindex = 0;
		pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set");
	}

	/* Look for an entry holding the same string. */
	for (i = 0; i < pair->ampsetindex; i++)
	{
		if (strcmp (pair->ampset[i].amplifia, amplifia) == 0)
			break;
	}

	if (i == pair->ampsetindex)
	{
		/* Not found: append a new entry.  Grow the set BEFORE touching the
		 * new slot so that it is never accessed out of bounds. */
		if (pair->ampsetindex == pair->ampsetcount)
		{
			pair->ampsetcount += 500;
			pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set");
		}

		set = &pair->ampset[i];

		/* Memory obtained through a reallocation is not guaranteed to be
		 * zeroed, so the slot is fully initialised here. */
		memset(set, 0, sizeof(ampseqset_t));
		set->amplifia   = amplifia;
		set->seqidcount = 100;
		set->seqidindex = 0;
		set->taxonids   = ECOMALLOC(set->seqidcount * sizeof(*set->taxonids),"Cannot allocate amplifia sequence table");

		set->taxonids[set->seqidindex++] = taxid;
		pair->ampsetindex++;

		return NULL;
	}

	/* Found: only the taxon list of the existing entry may need an update. */
	set = &pair->ampset[i];

	for (j = 0; j < set->seqidindex; j++)
	{
		if (set->taxonids[j] == taxid)
			return set->amplifia;          /* taxon already recorded */
	}

	if (set->seqidindex == set->seqidcount)
	{
		set->seqidcount += 100;
		set->taxonids = ECOREALLOC(set->taxonids, set->seqidcount * sizeof(*set->taxonids), "Cannot allocate amplifia sequence table");
	}

	set->taxonids[set->seqidindex++] = taxid;

	return set->amplifia;
}
|
|||
|
|
|||
|
/*
 * Record that taxon `taxid` yields the amplified string `amplifia` for `pair`.
 *
 * pair->taxset holds one entry per distinct taxon id; each entry keeps the
 * list of distinct amplified strings (compared with strcmp) seen for that
 * taxon.  The set and the per-taxon tables are created lazily and grown in
 * steps of 500 and 100 elements respectively.
 *
 * The `amplifia` pointer itself is stored (no copy is made), so it must stay
 * valid for as long as the pair is used.
 */
void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
{
	taxampset_t *tax;
	uint32_t i;
	uint32_t j;

	/* First use of this pair: create the taxon set. */
	if (pair->taxsetcount == 0)
	{
		pair->taxsetcount = 500;
		pair->taxsetindex = 0;
		pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set");
	}

	/* Look for an entry describing the same taxon. */
	for (i = 0; i < pair->taxsetindex; i++)
	{
		if (pair->taxset[i].taxonid == taxid) break;
	}

	if (i == pair->taxsetindex)
	{
		/* Not found: append a new entry.  Grow the set BEFORE touching the
		 * new slot so that it is never accessed out of bounds. */
		if (pair->taxsetindex == pair->taxsetcount)
		{
			pair->taxsetcount += 500;
			pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set");
		}

		tax = &pair->taxset[i];

		/* Memory obtained through a reallocation is not guaranteed to be
		 * zeroed, so the slot is fully initialised here instead of relying
		 * on a zero taxon id to mark it as unused. */
		memset(tax, 0, sizeof(taxampset_t));
		tax->taxonid       = taxid;
		tax->amplifiacount = 100;
		tax->amplifiaindex = 0;
		tax->amplifia      = ECOMALLOC(tax->amplifiacount * sizeof(char *),"Cannot allocate amplifia table");

		pair->taxsetindex++;
	}

	tax = &pair->taxset[i];

	/* Nothing to do when this taxon already lists an equal string. */
	for (j = 0; j < tax->amplifiaindex; j++)
	{
		if (strcmp(tax->amplifia[j], amplifia) == 0) return;
	}

	if (tax->amplifiaindex == tax->amplifiacount)
	{
		tax->amplifiacount += 100;
		tax->amplifia = ECOREALLOC(tax->amplifia, tax->amplifiacount * sizeof(char *), "Cannot allocate amplifia table");
	}

	tax->amplifia[tax->amplifiaindex++] = amplifia;
}
|
|||
|
|
|||
|
/*
 * Return a newly allocated, NUL-terminated copy of at most `len` characters
 * of seq->SQ starting at offset `start`.
 *
 * The copy stops early if a NUL byte is met in the source (strncpy
 * semantics).  The caller owns the returned buffer.
 *
 * NOTE(review): `start` is not checked against the sequence length here;
 * the caller is presumed to pass an offset inside the sequence.
 */
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
{
	/* len + 1 is computed as size_t so that it cannot wrap to 0. */
	char *amplifia = ECOMALLOC(((size_t)len + 1) * sizeof(char),"Cannot allocate amplifia");
	char *seqc = &seq->SQ[start];

	strncpy(amplifia, seqc, len);

	/* strncpy does not write a terminator when the source holds at least
	 * `len` characters, so terminate explicitly. */
	amplifia[len] = '\0';

	return amplifia;
}
|
|||
|
|
|||
|
|
|||
|
/*TR: Added*/
|
|||
|
/*
 * Build the table of primer pairs found over the whole sequence database.
 *
 * For every sequence, the position-sorted list of primer matches is obtained
 * from buildPrimerPairsForOneSeq().  Every couple of matches (j, k), k > j,
 * is kept as a candidate pair when:
 *   - at least one of the two matches is flagged `good`;
 *   - the two (strand-normalised) words differ;
 *   - the distance d between the two match positions lies in
 *     [options->lmin, options->lmax].
 *
 * For each kept couple the amplified string is extracted and the pair entry
 * is created or updated: example / counter-example occurrence counters,
 * minimum and maximum distance, amplifia set and taxon set.
 *
 * Returns the pairs table together with the number of pairs it contains.
 */
pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
{
	uint32_t           i;
	uint32_t           j;
	uint32_t           k;
	uint32_t           d;
	uint32_t           strt;
	uint32_t           end;
	uint32_t           paircount = 0;
	uint32_t           pairslots = 500;
	int32_t            foundindex;
	ppairs_t           pairs;
	pairscount_t       primerpairs;
	primermatchcount_t seqmatchcount;
	word_t             w1;
	word_t             w2;
	char               *amplifia;
	char               *oldamp;

	pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table");

	for (i = 0; i < seqdbsize; i++)
	{
		seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options);
		if (seqmatchcount.matchcount == 0) continue;

		for (j = 0; j < seqmatchcount.matchcount; j++)
		{
			strt = 0;
			w1 = seqmatchcount.matches[j].word;

			/* The first word should be on the direct strand. */
			if (!seqmatchcount.matches[j].strand)
				w1 = ecoComplementWord(w1, options->primer_length);
			else
				strt = options->primer_length;

			for (k = j + 1; k < seqmatchcount.matchcount; k++)
			{
				end = 0;
				w2 = seqmatchcount.matches[k].word;

				/* The second word should be on the reverse strand. */
				if (seqmatchcount.matches[k].strand)
					w2 = ecoComplementWord(w2, options->primer_length);
				else
					end = options->primer_length;

				if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue;
				if (w1 == w2) continue;

				d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position;

				if (d >= options->lmin && d <= options->lmax)
				{
					/* The amplified length d - strt - end is unsigned: skip
					 * couples for which it would wrap around to a huge value. */
					if (d < strt + end) continue;

					/* Get the amplified string. */
					amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end);

					foundindex = pairinlist(pairs, w1, w2, paircount);

					if (foundindex != -1)   /* the pair is already known */
					{
						if (seqdb[i]->isexample)
							pairs[foundindex].inexample++;
						else
							pairs[foundindex].outexample++;

						if (pairs[foundindex].mind > d) pairs[foundindex].mind = d;
						else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d;

						oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid);

						/* If the very same string is already in the amplifia
						 * set, use the stored pointer for the taxon set too and
						 * release the new copy: equal strings then share one
						 * pointer, which the original author intended to exploit
						 * by comparing pointers instead of strings when
						 * computing the fully identified taxa (specificity). */
						if (oldamp)
						{
							ECOFREE (amplifia, "free amplifia");
							amplifia = oldamp;
						}

						addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia);

						continue;
					}

					/* New pair: make room if the table is full. */
					if (paircount == pairslots)
					{
						pairslots += 500;
						pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table");
					}

					/* Reallocated memory is not guaranteed to be zeroed: clear
					 * the whole slot so that both occurrence counters and the
					 * amplifia / taxon set bookkeeping start from zero. */
					memset(&pairs[paircount], 0, sizeof(pairs_t));

					pairs[paircount].w1 = w1;
					pairs[paircount].w2 = w2;

					if (seqdb[i]->isexample) pairs[paircount].inexample = 1;
					else pairs[paircount].outexample = 1;

					pairs[paircount].mind = d;
					pairs[paircount].maxd = d;

					/* The sets of a new pair are empty, so the string is always
					 * stored as-is and no previous copy can be returned. */
					(void) addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid);
					addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia);

					paircount++;
				}
				else if (d > options->lmax)
					break; /* matches are sorted by position: the distance can only keep increasing */
			}
		}

		ECOFREE(seqmatchcount.matches, "Cannot free matches table");
	}

	/* Shrink the table to its used size.  A zero-byte reallocation is
	 * avoided because its result is implementation-defined. */
	if (paircount > 0)
		pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table");

	primerpairs.pairs = pairs;
	primerpairs.paircount = paircount;

	return primerpairs;
}
|
|||
|
|
|||
|
/*
 * Gather every primer match recorded for sequence `seqid` into one table.
 *
 * For each primer, its direct-strand matches (strand = TRUE) are emitted
 * first, then its reverse-strand matches (strand = FALSE).  When a primer has
 * exactly one match on a strand its position is read from the `.value`
 * member of the position record; otherwise positions are read from the
 * `.pointer[]` array.  The table is finally handed to sortmatch() (ascending
 * order by position, according to the original comment).
 *
 * Returns the match table and its size; both are NULL / 0 when the sequence
 * has no match at all.  `options` is not used by this function.
 */
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options)
{
	primermatchcount_t result;
	pprimermatch_t     table = NULL;
	pprimermatch_t     slot;
	uint32_t           total = 0;
	uint32_t           p;
	uint32_t           hit;

	result.matchcount = 0;
	result.matches    = NULL;

	/* First pass: count the matches to size the table. */
	p = 0;
	while (p < primers->size)
	{
		total += primers->primers[p].directCount[seqid];
		total += primers->primers[p].reverseCount[seqid];
		p++;
	}

	if (total == 0)
		return result;

	table = ECOMALLOC(total * sizeof(primermatch_t),"Cannot allocate primers match table");
	slot  = table;

	/* Second pass: fill the table, `slot` always points at the next free entry. */
	for (p = 0; p < primers->size; p++)
	{
		/* Direct strand matches. */
		if (primers->primers[p].directCount[seqid] == 1)
		{
			slot->word     = primers->primers[p].word;
			slot->strand   = TRUE;
			slot->good     = primers->primers[p].good;
			slot->position = primers->primers[p].directPos[seqid].value;
			slot++;
		}
		else
		{
			/* Zero matches simply means zero iterations. */
			for (hit = 0; hit < primers->primers[p].directCount[seqid]; hit++)
			{
				slot->word     = primers->primers[p].word;
				slot->strand   = TRUE;
				slot->good     = primers->primers[p].good;
				slot->position = primers->primers[p].directPos[seqid].pointer[hit];
				slot++;
			}
		}

		/* Reverse strand matches. */
		if (primers->primers[p].reverseCount[seqid] == 1)
		{
			slot->word     = primers->primers[p].word;
			slot->strand   = FALSE;
			slot->good     = primers->primers[p].good;
			slot->position = primers->primers[p].reversePos[seqid].value;
			slot++;
		}
		else
		{
			for (hit = 0; hit < primers->primers[p].reverseCount[seqid]; hit++)
			{
				slot->word     = primers->primers[p].word;
				slot->strand   = FALSE;
				slot->good     = primers->primers[p].good;
				slot->position = primers->primers[p].reversePos[seqid].pointer[hit];
				slot++;
			}
		}
	}

	sortmatch(table, total);

	result.matches    = table;
	result.matchcount = total;

	return result;
}
|