Merge of eric-test branche to the trunk
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@200 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
BIN
src/ecoPrimer
Executable file
BIN
src/ecoPrimer
Executable file
Binary file not shown.
237
src/ecoprimer.c
237
src/ecoprimer.c
@ -28,6 +28,56 @@ static void PrintHelp()
|
|||||||
PP "------------------------------------------\n");
|
PP "------------------------------------------\n");
|
||||||
PP " ecoPrimer Version %s\n", VERSION);
|
PP " ecoPrimer Version %s\n", VERSION);
|
||||||
PP "------------------------------------------\n");
|
PP "------------------------------------------\n");
|
||||||
|
PP "synopsis : finding primers and measureing the quality of primers and barcode region\n");
|
||||||
|
PP "usage: ./ecoPrimer [options] \n");
|
||||||
|
PP "------------------------------------------\n");
|
||||||
|
PP "options:\n");
|
||||||
|
PP "-d : [D]atabase : to match the expected format, the database\n");
|
||||||
|
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
|
||||||
|
PP " in the ecoPCR/tools directory.\n");
|
||||||
|
PP " ecoPCRFormat.py creates three file types :\n");
|
||||||
|
PP " .sdx : contains the sequences\n");
|
||||||
|
PP " .tdx : contains information concerning the taxonomy\n");
|
||||||
|
PP " .rdx : contains the taxonomy rank\n\n");
|
||||||
|
PP " ecoPrimer needs all the file type. As a result, you have to write the\n");
|
||||||
|
PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n");
|
||||||
|
PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
|
||||||
|
PP "-h : [H]elp - print <this> help\n\n");
|
||||||
|
PP "-i : [I]gnore the given taxonomy id.\n\n");
|
||||||
|
PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
|
||||||
|
PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
|
||||||
|
PP "-r : [R]estricts the search to the given taxonomic id.\n\n");
|
||||||
|
PP "-c : Consider that the database sequences are [c]ircular\n\n");
|
||||||
|
PP "-3 : Three prime strict match\n\n");
|
||||||
|
PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n");
|
||||||
|
PP "-s : [S]ensitivity quorum\n\n");
|
||||||
|
PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n");
|
||||||
|
PP "-x : false positive quorum\n\n");
|
||||||
|
PP "-D : set in [d]ouble strand mode\n\n");
|
||||||
|
PP "-S : Set in [s]ingle strand mode\n\n");
|
||||||
|
PP "-U : No multi match\n\n");
|
||||||
|
PP "\n");
|
||||||
|
PP "------------------------------------------\n");
|
||||||
|
PP "Table result description : \n");
|
||||||
|
PP "column 1 : serial number\n");
|
||||||
|
PP "column 2 : primer1\n");
|
||||||
|
PP "column 3 : primer2\n");
|
||||||
|
PP "column 4 : good/bad\n");
|
||||||
|
PP "column 5 : in sequence count\n");
|
||||||
|
PP "column 6 : out sequence count\n");
|
||||||
|
PP "column 7 : yule\n");
|
||||||
|
PP "column 8 : in taxa count\n");
|
||||||
|
PP "column 9 : out taxa count\n");
|
||||||
|
PP "column 10 : coverage\n");
|
||||||
|
PP "column 11 : specificity\n");
|
||||||
|
PP "column 12 : minimum amplified length\n");
|
||||||
|
PP "column 13 : maximum amplified length\n");
|
||||||
|
PP "column 14 : average amplified length\n");
|
||||||
|
PP "------------------------------------------\n");
|
||||||
|
PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n");
|
||||||
|
PP "------------------------------------------\n\n");
|
||||||
|
PP "\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ExitUsage(int stat)
|
static void ExitUsage(int stat)
|
||||||
@ -56,7 +106,7 @@ void initoptions(poptions_t options)
|
|||||||
options->strict_exclude_quorum=0.1;
|
options->strict_exclude_quorum=0.1;
|
||||||
options->sensitivity_quorum=0.9;
|
options->sensitivity_quorum=0.9;
|
||||||
options->false_positive_quorum=0.1;
|
options->false_positive_quorum=0.1;
|
||||||
options->strict_three_prime=2;
|
options->strict_three_prime=0;
|
||||||
options->r=0;
|
options->r=0;
|
||||||
options->g=0;
|
options->g=0;
|
||||||
options->no_multi_match=FALSE;
|
options->no_multi_match=FALSE;
|
||||||
@ -75,7 +125,7 @@ void printcurrenttime ()
|
|||||||
/* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
|
/* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
|
||||||
ts = localtime(&now);
|
ts = localtime(&now);
|
||||||
strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
|
strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
|
||||||
fprintf(stderr,"#%d#, %s\n",now, buf);
|
fprintf(stderr,"#%d#, %s\n",(int)now, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void printcurrenttimeinmilli()
|
void printcurrenttimeinmilli()
|
||||||
@ -90,7 +140,125 @@ void printcurrenttimeinmilli()
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
|
|
||||||
|
/**
 * Print one result row describing a primer pair on stdout.
 *
 * The row is tab-separated and newline-terminated. Columns, in order:
 * serial number, primer 1, primer 2, good/bad flag of each primer,
 * in-example sequence count, out-example sequence count, yule,
 * in-taxa count, out-taxa count, coverage, specificity, minimum,
 * maximum and average amplified length.
 *
 * @param index    serial number printed in the first column
 * @param pair     primer pair to describe; its statistics fields are read
 * @param options  supplies primer_length and the intaxa/outtaxa totals
 *                 used as denominators for coverage and specificity
 *
 * NOTE(review): the ratios divide by options->intaxa,
 * options->intaxa + options->outtaxa and pair->inexample without
 * checking for zero - confirm callers guarantee non-zero values.
 */
void printapair(int32_t index,ppair_t pair, poptions_t options)
{
	uint32_t wellidentifiedtaxa;

	printf("%6d\t",(int)index);

	/*
	 * Each primer is printed immediately after being decoded.
	 * NOTE(review): ecoUnhashWord presumably returns a shared buffer, so
	 * the two decoded strings must not be held at the same time - confirm.
	 * A primer that is not used "as direct" is printed as its complement.
	 */
	if (pair->asdirect1)
		printf("%s\t",ecoUnhashWord(pair->p1->word,options->primer_length));
	else
		printf("%s\t",ecoUnhashWord(ecoComplementWord(pair->p1->word,
							options->primer_length),options->primer_length));

	if (pair->asdirect2)
		printf("%s",ecoUnhashWord(pair->p2->word,options->primer_length));
	else
		printf("%s",ecoUnhashWord(ecoComplementWord(pair->p2->word,
							options->primer_length),options->primer_length));

	/* One letter per primer: 'G' when its good flag is set, 'b' otherwise. */
	printf("\t%c%c", "bG"[(int)pair->p1->good],"bG"[(int)pair->p2->good]);

	/* The counters are unsigned, so they are printed with %u rather than %d. */
	printf("\t%u", (unsigned int)pair->inexample);
	printf("\t%u", (unsigned int)pair->outexample);
	printf("\t%4.3f", pair->yule);

	printf("\t%u", (unsigned int)pair->intaxa);
	printf("\t%u", (unsigned int)pair->outtaxa);

	/* Coverage: fraction of the example taxa matched by this pair. */
	printf("\t%4.3f", (float)pair->intaxa/options->intaxa);

	/* Specificity: well identified taxa over all taxa of the database. */
	wellidentifiedtaxa = (pair->intaxa + pair->outtaxa) - pair->notwellidentifiedtaxa;
	printf("\t%4.3f", (float)wellidentifiedtaxa/(options->intaxa + options->outtaxa));

	printf("\t%u", (unsigned int)pair->mind);
	printf("\t%u", (unsigned int)pair->maxd);

	/* Average amplified length over the example sequences. */
	printf("\t%3.2f\n", (float)pair->sumd/pair->inexample);
}
|
||||||
|
|
||||||
|
/**
 * Score every pair and compact the table so that the pairs passing the
 * quorum filters occupy its first slots.
 *
 * For each pair the function stores:
 *   - quorumin  : inexample / options->insamples   (1.0 when insamples is 0)
 *   - quorumout : outexample / options->outsamples (0.0 when outsamples is 0)
 *   - yule      : quorumin - quorumout
 *
 * A pair is kept when quorumin is strictly greater than
 * options->sensitivity_quorum and quorumout is strictly lower than
 * options->false_positive_quorum. taxonomycoverage() and
 * taxonomyspecificity() are called on kept pairs only.
 *
 * Despite its name, this function does not reorder the kept pairs: they
 * stay in their original relative order.
 *
 * @param sortedpairs  table of pair pointers, compacted in place
 * @param count        number of entries in sortedpairs
 * @param options      sample totals and quorum thresholds
 * @return             number of pairs kept (valid entries at the front)
 */
uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
{
	uint32_t kept = 0;
	uint32_t pos;

	for (pos = 0; pos < count; pos++)
	{
		ppair_t candidate = sortedpairs[pos];
		float   sensitivity   = 1.0;
		float   falsepositive = 0.0;

		if (options->insamples)
			sensitivity = (float)candidate->inexample/options->insamples;

		if (options->outsamples)
			falsepositive = (float)candidate->outexample/options->outsamples;

		candidate->quorumin  = sensitivity;
		candidate->quorumout = falsepositive;
		candidate->yule      = sensitivity - falsepositive;

		/* The slot is written unconditionally; it is only validated
		 * (kept incremented) when the pair passes both quorums. */
		sortedpairs[kept] = candidate;

		if (!(sensitivity > options->sensitivity_quorum &&
		      falsepositive < options->false_positive_quorum))
			continue;

		(void)taxonomycoverage(candidate,options);
		taxonomyspecificity(candidate);
		kept++;
	}

	return kept;
}
|
||||||
|
|
||||||
|
/**
 * Print every primer pair that passes the quorum filters.
 *
 * The pairs are stored as a linked list of chunks (pairs->first, chained
 * through ->next). The function flattens that list into a temporary table
 * of pointers, filters it with filterandsortpairs(), prints one row per
 * kept pair with printapair(), and releases the temporary table.
 *
 * The total pair count is reported on stderr before anything else.
 *
 * @param pairs    pair collection to print; pairs->count is the expected
 *                 total number of pairs held by the chunk list
 * @param options  program options forwarded to the filter and the printer
 */
void printpairs (ppairtree_t pairs, poptions_t options)
{
	ppair_t*    sortedpairs;
	ppairlist_t pl;
	size_t      total;
	size_t      filled = 0;
	size_t      i;
	uint32_t    kept;

	fprintf(stderr,"Total pair count : %d\n",pairs->count);

	/* Nothing to flatten or print: avoid a zero-sized allocation. */
	if (pairs->count <= 0)
		return;

	total = (size_t)pairs->count;

	sortedpairs = ECOMALLOC(total*sizeof(ppair_t),"Cannot Allocate ordered pairs");

	/* Walk every chunk, the last one included. An empty list (first is
	 * NULL) is tolerated, and the table is never written past the
	 * 'total' slots that were allocated. */
	for (pl = pairs->first; pl != NULL; pl = pl->next)
		for (i = 0; i < pl->paircount && filled < total; i++, filled++)
			sortedpairs[filled] = pl->pairs + i;

	/* Only the slots actually filled are handed to the filter. */
	kept = filterandsortpairs(sortedpairs,(uint32_t)filled,options);

	for (i = 0; i < kept; i++)
		printapair((int32_t)i,sortedpairs[i],options);

	/* Only the pointer table is released; the pairs themselves still
	 * belong to the chunk list. */
	ECOFREE(sortedpairs,"Free ordered pairs");
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef MASKEDCODE
|
||||||
void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize)
|
void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize)
|
||||||
|
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t wordsize = options->primer_length;
|
uint32_t wordsize = options->primer_length;
|
||||||
@ -121,8 +289,11 @@ void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, ui
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif /* MASKEDCODE */
|
||||||
|
|
||||||
/*updateseqparams: This function counts the insample and outsample sequences
|
/*updateseqparams: This function counts the insample and outsample sequences
|
||||||
* and with each sequence adds a tag of the taxon to which the sequence belongs*/
|
* and with each sequence adds a tag of the taxon to which the sequence belongs*/
|
||||||
|
|
||||||
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
||||||
poptions_t options, int32_t *insamples, int32_t *outsamples)
|
poptions_t options, int32_t *insamples, int32_t *outsamples)
|
||||||
{
|
{
|
||||||
@ -168,47 +339,10 @@ void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* to get db stats, totals of species, genus etc....*/
|
/* to get db stats, totals of species, genus etc....*/
|
||||||
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
|
||||||
poptions_t options)
|
|
||||||
{
|
|
||||||
uint32_t i;
|
|
||||||
uint32_t j;
|
|
||||||
uint32_t nameslots = 500;
|
|
||||||
uint32_t namesindex = 0;
|
|
||||||
int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation");
|
|
||||||
int32_t taxid;
|
|
||||||
|
|
||||||
ecotx_t *tmptaxon;
|
#ifdef MASKEDCODE
|
||||||
|
|
||||||
for (i=0;i<seqdbsize;i++)
|
void setoktaxforspecificity (ppairtree_t pairs)
|
||||||
{
|
|
||||||
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
|
|
||||||
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
|
||||||
if (tmptaxon)
|
|
||||||
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
|
|
||||||
if (tmptaxon)
|
|
||||||
{
|
|
||||||
for (j = 0; j < namesindex; j++)
|
|
||||||
{
|
|
||||||
if (tmptaxon->taxid == ranktaxonids[j]) break;
|
|
||||||
}
|
|
||||||
if (j < namesindex) continue; /* name is already in list, so no need to add it*/
|
|
||||||
|
|
||||||
if (namesindex == nameslots)
|
|
||||||
{
|
|
||||||
nameslots += 500;
|
|
||||||
ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table");
|
|
||||||
}
|
|
||||||
ranktaxonids[namesindex] = tmptaxon->taxid;
|
|
||||||
namesindex++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ECOFREE(ranktaxonids, "free rank taxon table");
|
|
||||||
|
|
||||||
return namesindex;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setoktaxforspecificity (ppairscount_t pairs)
|
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
@ -251,6 +385,8 @@ void setoktaxforspecificity (ppairscount_t pairs)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pecodnadb_t seqdb; /* of type ecoseq_t */
|
pecodnadb_t seqdb; /* of type ecoseq_t */
|
||||||
@ -267,7 +403,7 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
pwordcount_t words;
|
pwordcount_t words;
|
||||||
pprimercount_t primers;
|
pprimercount_t primers;
|
||||||
pairscount_t pairs;
|
ppairtree_t pairs;
|
||||||
|
|
||||||
int32_t rankdbstats = 0;
|
int32_t rankdbstats = 0;
|
||||||
|
|
||||||
@ -407,11 +543,16 @@ int main(int argc, char **argv)
|
|||||||
fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
|
fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
|
||||||
|
|
||||||
updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
|
updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
|
||||||
|
options.dbsize=seqdbsize;
|
||||||
|
options.insamples=insamples;
|
||||||
|
options.outsamples=outsamples;
|
||||||
|
|
||||||
rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
|
rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
|
||||||
|
|
||||||
fprintf(stderr,"Database is constituted of %5d examples\n",insamples);
|
fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples,
|
||||||
fprintf(stderr," and %5d counterexamples\n",outsamples);
|
options.intaxa,options.taxonrank);
|
||||||
|
fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples,
|
||||||
|
options.outtaxa,options.taxonrank);
|
||||||
fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
|
fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
|
||||||
|
|
||||||
fprintf(stderr,"\nIndexing words in sequences\n");
|
fprintf(stderr,"\nIndexing words in sequences\n");
|
||||||
@ -460,13 +601,15 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
|
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
|
||||||
setoktaxforspecificity (&pairs);
|
|
||||||
|
|
||||||
printpairs (pairs, &options, rankdbstats, seqdbsize);
|
|
||||||
|
// setoktaxforspecificity (&pairs);
|
||||||
|
|
||||||
|
printpairs (pairs, &options);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ECOFREE(pairs.pairs,"Free pairs table");
|
//ECOFREE(pairs.pairs,"Free pairs table");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
15
src/libecoPCR/ecoError.P
Normal file
15
src/libecoPCR/ecoError.P
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
ecoError.o ecoError.P : ecoError.c ecoPCR.h /usr/include/stdio.h \
|
||||||
|
/usr/include/_types.h /usr/include/sys/_types.h \
|
||||||
|
/usr/include/sys/cdefs.h /usr/include/machine/_types.h \
|
||||||
|
/usr/include/i386/_types.h /usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
|
||||||
|
/usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
|
||||||
|
/usr/include/machine/signal.h /usr/include/i386/signal.h \
|
||||||
|
/usr/include/i386/_structs.h /usr/include/sys/_structs.h \
|
||||||
|
/usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
|
||||||
|
/usr/include/sys/resource.h /usr/include/machine/endian.h \
|
||||||
|
/usr/include/i386/endian.h /usr/include/sys/_endian.h \
|
||||||
|
/usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h
|
15
src/libecoPCR/ecoIOUtils.P
Normal file
15
src/libecoPCR/ecoIOUtils.P
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
ecoIOUtils.o ecoIOUtils.P : ecoIOUtils.c ecoPCR.h /usr/include/stdio.h \
|
||||||
|
/usr/include/_types.h /usr/include/sys/_types.h \
|
||||||
|
/usr/include/sys/cdefs.h /usr/include/machine/_types.h \
|
||||||
|
/usr/include/i386/_types.h /usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
|
||||||
|
/usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
|
||||||
|
/usr/include/machine/signal.h /usr/include/i386/signal.h \
|
||||||
|
/usr/include/i386/_structs.h /usr/include/sys/_structs.h \
|
||||||
|
/usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
|
||||||
|
/usr/include/sys/resource.h /usr/include/machine/endian.h \
|
||||||
|
/usr/include/i386/endian.h /usr/include/sys/_endian.h \
|
||||||
|
/usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h
|
15
src/libecoPCR/ecoMalloc.P
Normal file
15
src/libecoPCR/ecoMalloc.P
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
ecoMalloc.o ecoMalloc.P : ecoMalloc.c ecoPCR.h /usr/include/stdio.h \
|
||||||
|
/usr/include/_types.h /usr/include/sys/_types.h \
|
||||||
|
/usr/include/sys/cdefs.h /usr/include/machine/_types.h \
|
||||||
|
/usr/include/i386/_types.h /usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
|
||||||
|
/usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
|
||||||
|
/usr/include/machine/signal.h /usr/include/i386/signal.h \
|
||||||
|
/usr/include/i386/_structs.h /usr/include/sys/_structs.h \
|
||||||
|
/usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
|
||||||
|
/usr/include/sys/resource.h /usr/include/machine/endian.h \
|
||||||
|
/usr/include/i386/endian.h /usr/include/sys/_endian.h \
|
||||||
|
/usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h
|
5
src/libecoPCR/ecodna.P
Normal file
5
src/libecoPCR/ecodna.P
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
ecodna.o ecodna.P : ecodna.c /usr/include/string.h /usr/include/_types.h \
|
||||||
|
/usr/include/sys/_types.h /usr/include/sys/cdefs.h \
|
||||||
|
/usr/include/machine/_types.h /usr/include/i386/_types.h ecoPCR.h \
|
||||||
|
/usr/include/stdio.h /usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h
|
5
src/libecoPCR/ecofilter.P
Normal file
5
src/libecoPCR/ecofilter.P
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
ecofilter.o ecofilter.P : ecofilter.c ecoPCR.h /usr/include/stdio.h \
|
||||||
|
/usr/include/_types.h /usr/include/sys/_types.h \
|
||||||
|
/usr/include/sys/cdefs.h /usr/include/machine/_types.h \
|
||||||
|
/usr/include/i386/_types.h /usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h
|
15
src/libecoPCR/econame.P
Normal file
15
src/libecoPCR/econame.P
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
econame.o econame.P : econame.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
|
||||||
|
/usr/include/sys/_types.h /usr/include/sys/cdefs.h \
|
||||||
|
/usr/include/machine/_types.h /usr/include/i386/_types.h \
|
||||||
|
/usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \
|
||||||
|
/usr/include/sys/wait.h /usr/include/sys/signal.h \
|
||||||
|
/usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \
|
||||||
|
/usr/include/i386/signal.h /usr/include/i386/_structs.h \
|
||||||
|
/usr/include/sys/_structs.h /usr/include/machine/_structs.h \
|
||||||
|
/usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \
|
||||||
|
/usr/include/machine/endian.h /usr/include/i386/endian.h \
|
||||||
|
/usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h
|
15
src/libecoPCR/ecorank.P
Normal file
15
src/libecoPCR/ecorank.P
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
ecorank.o ecorank.P : ecorank.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
|
||||||
|
/usr/include/sys/_types.h /usr/include/sys/cdefs.h \
|
||||||
|
/usr/include/machine/_types.h /usr/include/i386/_types.h \
|
||||||
|
/usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \
|
||||||
|
/usr/include/sys/wait.h /usr/include/sys/signal.h \
|
||||||
|
/usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \
|
||||||
|
/usr/include/i386/signal.h /usr/include/i386/_structs.h \
|
||||||
|
/usr/include/sys/_structs.h /usr/include/machine/_structs.h \
|
||||||
|
/usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \
|
||||||
|
/usr/include/machine/endian.h /usr/include/i386/endian.h \
|
||||||
|
/usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h
|
19
src/libecoPCR/ecoseq.P
Normal file
19
src/libecoPCR/ecoseq.P
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
ecoseq.o ecoseq.P : ecoseq.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
|
||||||
|
/usr/include/sys/_types.h /usr/include/sys/cdefs.h \
|
||||||
|
/usr/include/machine/_types.h /usr/include/i386/_types.h \
|
||||||
|
/usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
|
||||||
|
/usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
|
||||||
|
/usr/include/machine/signal.h /usr/include/i386/signal.h \
|
||||||
|
/usr/include/i386/_structs.h /usr/include/sys/_structs.h \
|
||||||
|
/usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
|
||||||
|
/usr/include/sys/resource.h /usr/include/machine/endian.h \
|
||||||
|
/usr/include/i386/endian.h /usr/include/sys/_endian.h \
|
||||||
|
/usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h \
|
||||||
|
/usr/include/string.h /usr/include/zlib.h /usr/include/zconf.h \
|
||||||
|
/usr/include/sys/types.h /usr/include/unistd.h \
|
||||||
|
/usr/include/sys/unistd.h /usr/include/sys/select.h \
|
||||||
|
/usr/include/sys/_select.h
|
15
src/libecoPCR/ecotax.P
Normal file
15
src/libecoPCR/ecotax.P
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
ecotax.o ecotax.P : ecotax.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
|
||||||
|
/usr/include/sys/_types.h /usr/include/sys/cdefs.h \
|
||||||
|
/usr/include/machine/_types.h /usr/include/i386/_types.h \
|
||||||
|
/usr/include/inttypes.h \
|
||||||
|
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
|
||||||
|
/usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \
|
||||||
|
/usr/include/sys/wait.h /usr/include/sys/signal.h \
|
||||||
|
/usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \
|
||||||
|
/usr/include/i386/signal.h /usr/include/i386/_structs.h \
|
||||||
|
/usr/include/sys/_structs.h /usr/include/machine/_structs.h \
|
||||||
|
/usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \
|
||||||
|
/usr/include/machine/endian.h /usr/include/i386/endian.h \
|
||||||
|
/usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \
|
||||||
|
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
|
||||||
|
/usr/include/machine/types.h /usr/include/i386/types.h
|
@ -10,7 +10,9 @@ SOURCES = goodtaxon.c \
|
|||||||
queue.c \
|
queue.c \
|
||||||
libstki.c \
|
libstki.c \
|
||||||
sortmatch.c \
|
sortmatch.c \
|
||||||
|
pairtree.c \
|
||||||
pairs.c \
|
pairs.c \
|
||||||
|
taxstats.c \
|
||||||
apat_search.c
|
apat_search.c
|
||||||
|
|
||||||
SRCS=$(SOURCES)
|
SRCS=$(SOURCES)
|
||||||
|
@ -61,7 +61,7 @@ void encodeSequence(ecoseq_t *seq)
|
|||||||
|
|
||||||
for (i=0;i<seq->SQ_length;i++,data++,cseq++)
|
for (i=0;i<seq->SQ_length;i++,data++,cseq++)
|
||||||
{
|
{
|
||||||
*data = encoder[(IS_UPPER(*cseq) ? *cseq - 'A' : 'Z')];
|
*data = encoder[(IS_UPPER(*cseq) ? *cseq : 'Z') - 'A'];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,28 +79,39 @@ typedef union {
|
|||||||
uint32_t value;
|
uint32_t value;
|
||||||
} poslist_t, *ppostlist_t;
|
} poslist_t, *ppostlist_t;
|
||||||
|
|
||||||
typedef struct {
|
/**
|
||||||
word_t word;
|
* primer_t structure store fuzzy match positions for a primer
|
||||||
uint32_t *directCount;
|
* on all sequences
|
||||||
ppostlist_t directPos;
|
*/
|
||||||
|
|
||||||
uint32_t *reverseCount;
|
typedef struct {
|
||||||
ppostlist_t reversePos;
|
word_t word; //< code for the primer
|
||||||
bool_t good;
|
uint32_t *directCount; //< Occurrence count on direct strand
|
||||||
uint32_t inexample;
|
ppostlist_t directPos; //< list of position list on direct strand
|
||||||
uint32_t outexample;
|
|
||||||
|
uint32_t *reverseCount; //< Occurrence count on reverse strand
|
||||||
|
ppostlist_t reversePos; //< list of position list on reverse strand
|
||||||
|
|
||||||
|
bool_t good; //< primer match more than quorum example and no
|
||||||
|
// more counterexample quorum.
|
||||||
|
|
||||||
|
uint32_t inexample; //< count of example sequences matching primer
|
||||||
|
uint32_t outexample; //< count of counterexample sequences matching primer
|
||||||
} primer_t, *pprimer_t;
|
} primer_t, *pprimer_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* primercount_t structure store fuzzy match positions for all primers
|
||||||
|
* on all sequences as a list of primer_t
|
||||||
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
pprimer_t primers;
|
pprimer_t primers;
|
||||||
uint32_t size;
|
uint32_t size;
|
||||||
} primercount_t, *pprimercount_t;
|
} primercount_t, *pprimercount_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
word_t word;
|
pprimer_t primer;
|
||||||
uint32_t position;
|
uint32_t position;
|
||||||
bool_t strand;
|
bool_t strand;
|
||||||
bool_t good; /*TR: Added*/
|
|
||||||
} primermatch_t, *pprimermatch_t;
|
} primermatch_t, *pprimermatch_t;
|
||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
@ -109,6 +120,19 @@ typedef struct {
|
|||||||
uint32_t matchcount;
|
uint32_t matchcount;
|
||||||
} primermatchcount_t, *pprimermatchcount_t;
|
} primermatchcount_t, *pprimermatchcount_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
pecoseq_t sequence;
|
||||||
|
bool_t strand;
|
||||||
|
const char *amplifia;
|
||||||
|
int32_t length;
|
||||||
|
} amplifia_t, *pamplifia_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
pamplifia_t amplifias;
|
||||||
|
uint32_t ampcount;
|
||||||
|
uint32_t ampslot;
|
||||||
|
} amplifiacount_t, *pamplifiacount_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
char *amplifia;
|
char *amplifia;
|
||||||
int32_t *taxonids;
|
int32_t *taxonids;
|
||||||
@ -124,30 +148,52 @@ typedef struct {
|
|||||||
} taxampset_t, *ptaxampset_t;
|
} taxampset_t, *ptaxampset_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
word_t w1;
|
pprimer_t p1;
|
||||||
word_t w2;
|
bool_t asdirect1;
|
||||||
uint32_t inexample; /*inexample count*/
|
pprimer_t p2;
|
||||||
uint32_t outexample; /*outexample count*/
|
bool_t asdirect2;
|
||||||
|
|
||||||
uint32_t mind;
|
amplifiacount_t pcr;
|
||||||
uint32_t maxd;
|
|
||||||
|
|
||||||
uint32_t ampsetcount;
|
uint32_t inexample; //< example sequence count
|
||||||
uint32_t ampsetindex;
|
uint32_t outexample; //< counterexample sequence count
|
||||||
pampseqset_t ampset;
|
uint32_t intaxa; //< example taxa count
|
||||||
|
uint32_t outtaxa; //< counterexample taxa count
|
||||||
|
uint32_t notwellidentifiedtaxa;
|
||||||
|
|
||||||
uint32_t taxsetcount;
|
|
||||||
uint32_t taxsetindex;
|
|
||||||
ptaxampset_t taxset;
|
|
||||||
|
|
||||||
uint32_t oktaxoncount;
|
// these statistics are relative to inexample sequences
|
||||||
} pairs_t, *ppairs_t;
|
|
||||||
|
uint32_t mind; //< minimum distance between primers
|
||||||
|
uint32_t maxd; //< maximum distance between primers
|
||||||
|
uint32_t sumd; //< distance sum
|
||||||
|
float yule;
|
||||||
|
float quorumin;
|
||||||
|
float quorumout;
|
||||||
|
//
|
||||||
|
// uint32_t taxsetcount;
|
||||||
|
// uint32_t taxsetindex;
|
||||||
|
// ptaxampset_t taxset;
|
||||||
|
//
|
||||||
|
// uint32_t oktaxoncount;
|
||||||
|
|
||||||
|
} pair_t, *ppair_t;
|
||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ppairs_t pairs;
|
size_t paircount;
|
||||||
uint32_t paircount;
|
size_t pairslots;
|
||||||
}pairscount_t, *ppairscount_t;
|
void* next;
|
||||||
|
pair_t pairs[1];
|
||||||
|
} pairlist_t, *ppairlist_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
ppairlist_t first;
|
||||||
|
ppairlist_t last;
|
||||||
|
void *tree;
|
||||||
|
int32_t count;
|
||||||
|
} pairtree_t, *ppairtree_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
pword_t words;
|
pword_t words;
|
||||||
@ -168,6 +214,18 @@ typedef struct {
|
|||||||
uint32_t size;
|
uint32_t size;
|
||||||
} merge_t, *pmerge_t;
|
} merge_t, *pmerge_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
const char *amplifia;
|
||||||
|
bool_t strand;
|
||||||
|
int32_t length;
|
||||||
|
int32_t taxoncount;
|
||||||
|
void *taxontree;
|
||||||
|
}amptotaxon_t, *pamptotaxon_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t taxid;
|
||||||
|
void *amptree;
|
||||||
|
}taxontoamp_t, *ptaxontoamp_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t lmin; //**< Amplifia minimal length
|
uint32_t lmin; //**< Amplifia minimal length
|
||||||
@ -189,6 +247,14 @@ typedef struct {
|
|||||||
bool_t no_multi_match;
|
bool_t no_multi_match;
|
||||||
char taxonrank[20]; //TR to count ranks against a pair
|
char taxonrank[20]; //TR to count ranks against a pair
|
||||||
int32_t taxonrankidx; //TR to count ranks against a pair
|
int32_t taxonrankidx; //TR to count ranks against a pair
|
||||||
|
|
||||||
|
// Some statistics useful for options filters
|
||||||
|
|
||||||
|
int32_t dbsize;
|
||||||
|
int32_t insamples;
|
||||||
|
int32_t outsamples;
|
||||||
|
int32_t intaxa;
|
||||||
|
int32_t outtaxa;
|
||||||
} options_t, *poptions_t;
|
} options_t, *poptions_t;
|
||||||
|
|
||||||
typedef ecoseq_t **pecodnadb_t;
|
typedef ecoseq_t **pecodnadb_t;
|
||||||
@ -232,7 +298,21 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
|
|||||||
|
|
||||||
void sortmatch(pprimermatch_t table,uint32_t N);
|
void sortmatch(pprimermatch_t table,uint32_t N);
|
||||||
|
|
||||||
|
ppairtree_t initpairtree(ppairtree_t tree);
|
||||||
|
ppair_t pairintree (pair_t key,ppairtree_t pairlist);
|
||||||
|
ppair_t insertpair(pair_t key,ppairtree_t list);
|
||||||
|
|
||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
|
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
|
||||||
|
|
||||||
|
int32_t counttaxon(int32_t taxid);
|
||||||
|
int32_t getrankdbstats(pecodnadb_t seqdb,
|
||||||
|
uint32_t seqdbsize,
|
||||||
|
ecotaxonomy_t *taxonomy,
|
||||||
|
poptions_t options);
|
||||||
|
float taxonomycoverage(ppair_t pair, poptions_t options);
|
||||||
|
char ecoComplementChar(char base);
|
||||||
|
void taxonomyspecificity (ppair_t pair);
|
||||||
|
|
||||||
#endif /* EPSORT_H_ */
|
#endif /* EPSORT_H_ */
|
||||||
|
@ -201,3 +201,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word)
|
|||||||
return ~0;
|
return ~0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Complement a single encoded nucleotide.
 *
 * @param base  nucleotide code; values 0..3 are complemented, anything
 *              else is treated as "not a nucleotide".
 * @return      3 - base for a code in 0..3, otherwise 4.
 *
 * The previous expression used the logical NOT (`!base & 3`), which yields
 * 1 for code 0 and 0 for every other code.  The bitwise complement masked
 * to two bits is used instead.
 * NOTE(review): this assumes the 2-bit coding pairs complementary bases as
 * (0,3) and (1,2) -- confirm against the sequence encoder.
 */
char ecoComplementChar(char base)
{
	if (base < 0 || base >= 4)
		return 4;

	return (char)(~base & 3);
}
|
||||||
|
|
||||||
|
@ -7,22 +7,28 @@
|
|||||||
|
|
||||||
#include "ecoprimer.h"
|
#include "ecoprimer.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options);
|
static void buildPrimerPairsForOneSeq(uint32_t seqid,
|
||||||
|
pecodnadb_t seqdb,
|
||||||
|
pprimercount_t primers,
|
||||||
|
ppairtree_t pairs,
|
||||||
|
poptions_t options);
|
||||||
|
|
||||||
int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size)
|
|
||||||
{
|
|
||||||
uint32_t i;
|
|
||||||
|
|
||||||
for (i = 0; i < size; i++)
|
|
||||||
{
|
|
||||||
if (w1 == pairlist[i].w1 && w2 == pairlist[i].w2) return i;
|
|
||||||
if (w1 == pairlist[i].w2 && w2 == pairlist[i].w1) return i;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************
|
||||||
|
*
|
||||||
|
* pair collection management
|
||||||
|
*
|
||||||
|
*************************************/
|
||||||
|
|
||||||
|
#ifdef MASKEDCODE
|
||||||
|
|
||||||
|
char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
@ -79,7 +85,7 @@ char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
|
|||||||
return ampused;
|
return ampused;
|
||||||
}
|
}
|
||||||
|
|
||||||
void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
|
void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
@ -135,6 +141,7 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
|
|||||||
|
|
||||||
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
|
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
|
||||||
{
|
{
|
||||||
|
fprintf(stderr,"start : %d length : %d\n",start,len);
|
||||||
char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
|
char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
|
||||||
char *seqc = &seq->SQ[start];
|
char *seqc = &seq->SQ[start];
|
||||||
|
|
||||||
@ -142,125 +149,44 @@ char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
|
|||||||
return amplifia;
|
return amplifia;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
|
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
ppairtree_t primerpairs;
|
||||||
uint32_t k;
|
|
||||||
uint32_t d;
|
|
||||||
uint32_t strt;
|
|
||||||
uint32_t end;
|
|
||||||
uint32_t paircount = 0;
|
|
||||||
uint32_t pairslots = 500;
|
|
||||||
int32_t foundindex;
|
|
||||||
ppairs_t pairs;
|
|
||||||
pairscount_t primerpairs;
|
|
||||||
primermatchcount_t seqmatchcount;
|
|
||||||
word_t w1;
|
|
||||||
word_t w2;
|
|
||||||
char *amplifia;
|
|
||||||
char *oldamp;
|
|
||||||
|
|
||||||
|
primerpairs = initpairtree(NULL);
|
||||||
pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table");
|
|
||||||
|
|
||||||
for (i=0; i < seqdbsize; i++)
|
for (i=0; i < seqdbsize; i++)
|
||||||
{
|
{
|
||||||
seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options);
|
buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options);
|
||||||
if (seqmatchcount.matchcount == 0) continue;
|
|
||||||
|
|
||||||
for (j=0; j < seqmatchcount.matchcount; j++)
|
|
||||||
{
|
|
||||||
strt = 0;
|
|
||||||
w1 = seqmatchcount.matches[j].word;
|
|
||||||
/*first word should b on direct strand*/
|
|
||||||
if (!seqmatchcount.matches[j].strand)
|
|
||||||
w1 = ecoComplementWord(w1, options->primer_length);
|
|
||||||
else
|
|
||||||
strt = options->primer_length;
|
|
||||||
|
|
||||||
for (k=j+1; k < seqmatchcount.matchcount; k++)
|
|
||||||
{
|
|
||||||
end = 0;
|
|
||||||
w2 = seqmatchcount.matches[k].word;
|
|
||||||
/*second word should be on reverse strand*/
|
|
||||||
if (seqmatchcount.matches[k].strand)
|
|
||||||
w2 = ecoComplementWord(w2, options->primer_length);
|
|
||||||
else
|
|
||||||
end = options->primer_length;
|
|
||||||
|
|
||||||
if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue;
|
|
||||||
if (w1 == w2) continue;
|
|
||||||
|
|
||||||
d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position;
|
|
||||||
if (d >= options->lmin && d <= options->lmax)
|
|
||||||
{
|
|
||||||
/*get amplified string*/
|
|
||||||
amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end);
|
|
||||||
|
|
||||||
foundindex = pairinlist(pairs, w1, w2, paircount);
|
|
||||||
if (foundindex != -1) /*pair is found*/
|
|
||||||
{
|
|
||||||
if (seqdb[i]->isexample)
|
|
||||||
pairs[foundindex].inexample++;
|
|
||||||
else
|
|
||||||
pairs[foundindex].outexample++;
|
|
||||||
|
|
||||||
if (pairs[foundindex].mind > d) pairs[foundindex].mind = d;
|
|
||||||
else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d;
|
|
||||||
|
|
||||||
oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid);
|
|
||||||
/*if exact same string is already in amplifia set then use that for taxon set, it will help for
|
|
||||||
* calculating the fully identified taxons i.e specificity, we will compare pointrs instead of strings
|
|
||||||
* because same string means same pointer*/
|
|
||||||
if (oldamp)
|
|
||||||
{
|
|
||||||
ECOFREE (amplifia, "free amplifia");
|
|
||||||
amplifia = oldamp;
|
|
||||||
}
|
|
||||||
addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia);
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (paircount == pairslots)
|
|
||||||
{
|
|
||||||
pairslots += 500;
|
|
||||||
pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table");
|
|
||||||
}
|
|
||||||
pairs[paircount].w1 = w1;
|
|
||||||
pairs[paircount].w2 = w2;
|
|
||||||
if (seqdb[i]->isexample) pairs[paircount].inexample = 1;
|
|
||||||
else pairs[paircount].outexample = 1;
|
|
||||||
pairs[paircount].mind = d;
|
|
||||||
pairs[paircount].maxd = d;
|
|
||||||
oldamp = addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid);
|
|
||||||
addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia);
|
|
||||||
|
|
||||||
paircount++;
|
|
||||||
}
|
|
||||||
else if (d > options->lmax)
|
|
||||||
break; /*once if the distance is greater than lmax then it will keep on increasing*/
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ECOFREE(seqmatchcount.matches, "Cannot free matches table");
|
|
||||||
}
|
|
||||||
primerpairs.pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table");
|
|
||||||
primerpairs.paircount = paircount;
|
|
||||||
return primerpairs;
|
return primerpairs;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options)
|
#define DMAX (2000000000)
|
||||||
|
|
||||||
|
static void buildPrimerPairsForOneSeq(uint32_t seqid,
|
||||||
|
pecodnadb_t seqdb,
|
||||||
|
pprimercount_t primers,
|
||||||
|
ppairtree_t pairs,
|
||||||
|
poptions_t options)
|
||||||
{
|
{
|
||||||
|
static uint32_t paircount=0;
|
||||||
uint32_t i,j,k;
|
uint32_t i,j,k;
|
||||||
uint32_t matchcount=0;
|
uint32_t matchcount=0;
|
||||||
pprimermatch_t matches = NULL;
|
pprimermatch_t matches = NULL;
|
||||||
primermatchcount_t seqmatchcount;
|
primermatchcount_t seqmatchcount;
|
||||||
|
ppair_t pcurrent;
|
||||||
seqmatchcount.matchcount = 0;
|
pair_t current;
|
||||||
seqmatchcount.matches = NULL;
|
pprimer_t wswp;
|
||||||
|
bool_t bswp;
|
||||||
|
size_t distance;
|
||||||
|
bool_t strand;
|
||||||
|
|
||||||
for (i=0;i < primers->size; i++)
|
for (i=0;i < primers->size; i++)
|
||||||
{
|
{
|
||||||
@ -268,7 +194,9 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
|
|||||||
matchcount+=primers->primers[i].reverseCount[seqid];
|
matchcount+=primers->primers[i].reverseCount[seqid];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matchcount <= 0) return seqmatchcount;
|
if (matchcount <= 0)
|
||||||
|
return;
|
||||||
|
|
||||||
matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
|
matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
|
||||||
|
|
||||||
for (i=0,j=0;i < primers->size; i++)
|
for (i=0,j=0;i < primers->size; i++)
|
||||||
@ -277,17 +205,15 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
|
|||||||
{
|
{
|
||||||
if (primers->primers[i].directCount[seqid]==1)
|
if (primers->primers[i].directCount[seqid]==1)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=TRUE;
|
matches[j].strand=TRUE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].directPos[seqid].value;
|
matches[j].position=primers->primers[i].directPos[seqid].value;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
|
else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=TRUE;
|
matches[j].strand=TRUE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
|
matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -296,26 +222,144 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
|
|||||||
{
|
{
|
||||||
if (primers->primers[i].reverseCount[seqid]==1)
|
if (primers->primers[i].reverseCount[seqid]==1)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=FALSE;
|
matches[j].strand=FALSE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].reversePos[seqid].value;
|
matches[j].position=primers->primers[i].reversePos[seqid].value;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
|
else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=FALSE;
|
matches[j].strand=FALSE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
|
matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sortmatch(matches,matchcount); // sort in asscending order by position
|
if (matchcount>1)
|
||||||
|
{
|
||||||
|
// fprintf(stderr,"\n====================================\n");
|
||||||
|
|
||||||
|
sortmatch(matches,matchcount); // sort in ascending order by position
|
||||||
|
|
||||||
|
for (i=0; i < matchcount;i++)
|
||||||
|
{
|
||||||
|
// For all primers matching the sequence
|
||||||
|
|
||||||
|
for(j=i+1;
|
||||||
|
(j<matchcount)
|
||||||
|
&& ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
|
||||||
|
j++
|
||||||
|
)
|
||||||
|
|
||||||
|
// For all not too far primers
|
||||||
|
|
||||||
|
if ( (matches[i].primer->good || matches[j].primer->good)
|
||||||
|
&& (distance > options->lmin)
|
||||||
|
)
|
||||||
|
{
|
||||||
|
|
||||||
|
// If possible primer pair
|
||||||
|
|
||||||
|
current.p1 = matches[i].primer;
|
||||||
|
current.asdirect1=matches[i].strand;
|
||||||
|
current.p2 = matches[j].primer;
|
||||||
|
current.asdirect2= !matches[j].strand;
|
||||||
|
current.maxd=DMAX;
|
||||||
|
current.mind=DMAX;
|
||||||
|
current.sumd=0;
|
||||||
|
current.inexample=0;
|
||||||
|
current.outexample=0;
|
||||||
|
|
||||||
|
|
||||||
|
// Standardize the pair
|
||||||
|
|
||||||
|
strand = current.p2->word > current.p1->word;
|
||||||
|
if (!strand)
|
||||||
|
{
|
||||||
|
wswp = current.p1;
|
||||||
|
current.p1=current.p2;
|
||||||
|
current.p2=wswp;
|
||||||
|
|
||||||
|
bswp = current.asdirect1;
|
||||||
|
current.asdirect1=current.asdirect2;
|
||||||
|
current.asdirect2=bswp;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Look for the new pair in already seen pairs
|
||||||
|
|
||||||
|
pcurrent = insertpair(current,pairs);
|
||||||
|
|
||||||
|
|
||||||
|
if (seqdb[seqid]->isexample)
|
||||||
|
|
||||||
|
{
|
||||||
|
pcurrent->inexample++;
|
||||||
|
pcurrent->sumd+=distance;
|
||||||
|
|
||||||
|
if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd))
|
||||||
|
pcurrent->maxd = distance;
|
||||||
|
|
||||||
|
if (distance < pcurrent->mind)
|
||||||
|
pcurrent->mind = distance;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
pcurrent->outexample++;
|
||||||
|
|
||||||
|
if ((pcurrent->outexample+pcurrent->inexample)==1)
|
||||||
|
{
|
||||||
|
paircount++;
|
||||||
|
pcurrent->pcr.ampslot=200;
|
||||||
|
pcurrent->pcr.ampcount=0;
|
||||||
|
pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot,
|
||||||
|
"Cannot allocate amplifia table");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount)
|
||||||
|
{
|
||||||
|
pcurrent->pcr.ampslot+=200;
|
||||||
|
pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias,
|
||||||
|
sizeof(amplifia_t)*pcurrent->pcr.ampslot,
|
||||||
|
"Cannot allocate amplifia table");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance;
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid];
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand;
|
||||||
|
|
||||||
|
if (strand)
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[i].position + options->primer_length;
|
||||||
|
else
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[j].position - 1 ;
|
||||||
|
|
||||||
|
pcurrent->pcr.ampcount++;
|
||||||
|
// fprintf(stderr,"%c%c W1 : %s direct : %c",
|
||||||
|
// "bG"[(int)pcurrent->p1->good],
|
||||||
|
// "bG"[(int)pcurrent->p2->good],
|
||||||
|
// ecoUnhashWord(pcurrent->p1->word, options->primer_length),
|
||||||
|
// "><"[(int)pcurrent->asdirect1]
|
||||||
|
// );
|
||||||
|
//
|
||||||
|
// fprintf(stderr," W2 : %s direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n",
|
||||||
|
// ecoUnhashWord(pcurrent->p2->word, options->primer_length),
|
||||||
|
// "><"[(int)pcurrent->asdirect2],
|
||||||
|
// distance,
|
||||||
|
// pcurrent->mind,pcurrent->maxd,
|
||||||
|
// (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0,
|
||||||
|
// pcurrent->inexample,pcurrent->outexample,
|
||||||
|
// " N"[(pcurrent->outexample+pcurrent->inexample)==1],
|
||||||
|
// paircount
|
||||||
|
//
|
||||||
|
// );
|
||||||
|
//
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pairs->count=paircount;
|
||||||
|
|
||||||
/*TR: Added*/
|
|
||||||
seqmatchcount.matches = matches;
|
|
||||||
seqmatchcount.matchcount = matchcount;
|
|
||||||
return seqmatchcount;
|
|
||||||
}
|
}
|
||||||
|
651
tools/ecoPCRFormat.py
Executable file
651
tools/ecoPCRFormat.py
Executable file
@ -0,0 +1,651 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re
|
||||||
|
import gzip
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import getopt
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psycopg2
|
||||||
|
_dbenable=True
|
||||||
|
except ImportError:
|
||||||
|
_dbenable=False
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Generic file function
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def universalOpen(file):
    """Return a readable stream for `file`.

    A string is taken as a path: it is opened through gzip when the name
    ends with '.gz', and with the builtin open() otherwise.  Any other
    object is assumed to be a stream already and is returned unchanged.
    """
    if not isinstance(file, str):
        return file
    if file.endswith('.gz'):
        return gzip.open(file)
    return open(file)
|
||||||
|
|
||||||
|
def universalTell(file):
    """Return the current position of `file`.

    For a gzip.GzipFile the position is read from its `myfileobj`
    attribute rather than from the GzipFile itself.
    NOTE(review): `myfileobj` is presumably the underlying compressed
    file, which makes the result comparable with fileSize() -- confirm.
    """
    if isinstance(file, gzip.GzipFile):
        return file.myfileobj.tell()
    return file.tell()
|
||||||
|
|
||||||
|
def fileSize(file):
    """Return the size of `file` without disturbing its read position.

    The stream is moved to its end to read the size, then moved back to
    where it was.  For a gzip.GzipFile the measurement is made on its
    `myfileobj` attribute.
    """
    stream = file
    if isinstance(stream, gzip.GzipFile):
        stream = stream.myfileobj

    saved = stream.tell()
    stream.seek(0, 2)          # whence=2: relative to the end of the file
    size = stream.tell()
    stream.seek(saved, 0)      # whence=0: absolute position
    return size
|
||||||
|
|
||||||
|
def progressBar(pos,max,reset=False,delta=[]):
    """Draw a one-line text progress bar on stderr.

    pos   -- current position in the work, in the same unit as `max`.
    max   -- position that corresponds to 100 %.
    reset -- when true, forget the start time kept from earlier calls.
    delta -- internal state.  The mutable default is deliberate: it keeps
             [start time, time of the latest call] from one call to the
             next.

    The remaining time is extrapolated linearly from the time elapsed
    since the first call (or since the last reset).
    """
    if reset:
        del delta[:]
    if not delta:
        delta.append(time.time())
        delta.append(time.time())

    delta[1] = time.time()
    elapsed = delta[1] - delta[0]

    # An empty input (max == 0) is reported as complete instead of raising
    # ZeroDivisionError.
    if max:
        percent = float(pos) / max * 100
    else:
        percent = 100.0

    # No estimate exists before any progress has been made; the former
    # expression divided by zero in that case.
    if percent > 0:
        remain = time.strftime('%H:%M:%S',
                               time.gmtime(elapsed / percent * (100 - percent)))
    else:
        remain = '--:--:--'

    bar = '#' * int(percent / 2)
    bar += '|/-\\-'[pos % 5]                  # spinner character
    bar += ' ' * (50 - int(percent / 2))
    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain))
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# NCBI Dump Taxonomy reader
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def endLessIterator(endedlist):
    """Yield every item of `endedlist`, then its last item forever.

    The last item is looked up again on every step, so `endedlist` must
    support indexing with -1 and must not be empty once the first pass is
    over.
    """
    for item in endedlist:
        yield item
    while True:
        yield endedlist[-1]
|
||||||
|
|
||||||
|
class ColumnFile(object):
    """Iterate over a delimited text stream, one list of fields per line.

    stream -- a file name (opened for reading) or an object exposing a
              `next` method that returns one line per call.
    sep    -- separator handed to str.split (None: runs of whitespace).
    strip  -- when true, surrounding whitespace is removed from each field.
    types  -- optional sequence of converters applied to the fields by
              position; the last converter is reused for every extra
              field.  `bool` is replaced by ColumnFile.str2bool.
    """

    def __init__(self,stream,sep=None,strip=True,types=None):
        if isinstance(stream,str):
            self._stream = open(stream)
        elif hasattr(stream,'next'):
            self._stream = stream
        else:
            raise ValueError('stream must be string or an iterator')

        self._delimiter = sep
        self._strip = strip
        self._types = None
        if types:
            converters = []
            for kind in types:
                if kind is bool:
                    kind = ColumnFile.str2bool
                converters.append(kind)
            self._types = converters

    def str2bool(x):
        """Convert the first non-blank character of `x` to a boolean.

        'T' and 'V' give True, 'F' gives False (case-insensitive); any
        other character is evaluated as a Python expression, so a digit
        is true when it is not '0'.
        """
        # NOTE(review): eval() runs on text read from the input file.  Only
        # one character is evaluated, which limits the exposure, but an
        # explicit lookup table would be safer.
        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))

    str2bool = staticmethod(str2bool)

    def __iter__(self):
        return self

    def next(self):
        """Return the fields of the next line, stripped and converted."""
        fields = self._stream.next().split(self._delimiter)
        if self._strip or self._types:
            fields = [field.strip() for field in fields]
        if self._types:
            last = len(self._types) - 1
            converted = []
            for position, text in enumerate(fields):
                # fields beyond the declared types reuse the last converter
                converted.append(self._types[min(position, last)](text))
            fields = converted
        return fields
|
||||||
|
|
||||||
|
def taxonCmp(t1,t2):
    """Three-way comparison of two taxon records on their first element.

    Returns -1, +1 or 0 when t1[0] is lower than, greater than or equal
    to t2[0].
    """
    left, right = t1[0], t2[0]
    if left < right:
        return -1
    if left > right:
        return +1
    return 0
|
||||||
|
|
||||||
|
def bsearchTaxon(taxonomy,taxid):
    """Binary search of `taxid` in a taxonomy sorted on its first column.

    taxonomy -- sequence of records whose element 0 is the taxid, sorted
                in ascending taxid order.
    taxid    -- identifier to look for.

    Returns the position of the record, or None when `taxid` is absent
    (including when `taxonomy` is empty, which used to raise IndexError).
    """
    taxCount = len(taxonomy)
    if taxCount == 0:
        return None

    begin = 0
    end = taxCount
    oldcheck = taxCount
    # The first probe was written `begin + end / 2`; it only worked because
    # begin is 0.  Integer division is spelled out with `//`.
    check = (begin + end) // 2

    # The search stops on a hit, or when the probe no longer moves.
    while check != oldcheck and taxonomy[check][0] != taxid:
        if taxonomy[check][0] < taxid:
            begin = check
        else:
            end = check
        oldcheck = check
        check = (begin + end) // 2

    if taxonomy[check][0] == taxid:
        return check
    return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def readNodeTable(file):
    """Load a '|'-separated taxonomy node table.

    file -- path or stream accepted by universalOpen().

    Returns (taxonomy, ranks, index):
      taxonomy -- list of [taxid, rank number, parent position] records
                  sorted by taxid (built from columns 0, 2 and 1 of the
                  table);
      ranks    -- dict mapping each rank name to its number in the
                  alphabetically sorted list of rank names;
      index    -- dict mapping each taxid to its position in `taxonomy`.
    Progress messages are written to stderr.
    """
    nodes = ColumnFile(universalOpen(file),
                       sep='|',
                       types=(int,int,str,
                              str,str,bool,
                              int,bool,int,
                              bool,bool,bool,str))

    sys.stderr.write("Reading taxonomy dump file...\n")
    taxonomy = [[row[0], row[2], row[1]] for row in nodes]

    sys.stderr.write("List all taxonomy rank...\n")
    rankNames = list(set(taxon[1] for taxon in taxonomy))
    rankNames.sort()
    ranks = dict((name, number) for number, name in enumerate(rankNames))

    sys.stderr.write("Sorting taxons...\n")
    # same ordering as a stable sort with taxonCmp: taxid only
    taxonomy.sort(key=lambda taxon: taxon[0])

    sys.stderr.write("Indexing taxonomy...\n")
    index = {}
    for taxon in taxonomy:
        index[taxon[0]] = bsearchTaxon(taxonomy, taxon[0])

    sys.stderr.write("Indexing parent and rank...\n")
    for taxon in taxonomy:
        taxon[1] = ranks[taxon[1]]     # rank name   -> rank number
        taxon[2] = index[taxon[2]]     # parent taxid -> parent position

    return taxonomy,ranks,index
|
||||||
|
|
||||||
|
def nameIterator(file):
    """Yield (taxid, name, classname) for each row of a '|'-separated
    names table.

    Each row must split into exactly five fields; the third one (unique
    name) and the trailing one are dropped.
    """
    rows = ColumnFile(universalOpen(file),
                      sep='|',
                      types=(int,str,
                             str,str))
    for taxid,name,unique,classname,white in rows:
        yield taxid,name,classname
|
||||||
|
|
||||||
|
def mergedNodeIterator(file):
    """Yield (taxid, current) for each row of a '|'-separated merged-node
    table.

    Each row must split into exactly three fields; the trailing one is
    dropped.
    """
    rows = ColumnFile(universalOpen(file),
                      sep='|',
                      types=(int,int,str))
    for taxid,current,white in rows:
        yield taxid,current
|
||||||
|
|
||||||
|
def deletedNodeIterator(file):
    """Yield the taxid of each row of a '|'-separated deleted-node table.

    Each row must split into exactly two fields; the trailing one is
    dropped.
    """
    rows = ColumnFile(universalOpen(file),
                      sep='|',
                      types=(int,str))
    for taxid,white in rows:
        yield taxid
|
||||||
|
|
||||||
|
def readTaxonomyDump(taxdir):
    """Load a taxonomy from the dump files found in directory `taxdir`.

    Reads nodes.dmp, names.dmp, merged.dmp and delnodes.dmp.

    Returns (taxonomy, ranks, alternativeName, index):
      taxonomy        -- records from readNodeTable(); the scientific name
                         is appended to each record that has one;
      ranks           -- rank name -> rank number;
      alternativeName -- list of (name, classname, taxon position);
      index           -- taxid -> taxon position; merged taxids point to
                         their current taxon, deleted taxids map to None.
    Progress messages are written to stderr.
    """
    taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)

    sys.stderr.write("Adding scientific name...\n")
    alternativeName = []
    for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
        position = index[taxid]
        alternativeName.append((name,classname,position))
        if classname == 'scientific name':
            taxonomy[position].append(name)

    sys.stderr.write("Adding taxid alias...\n")
    for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
        index[taxid] = index[current]

    sys.stderr.write("Adding deleted taxid...\n")
    for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
        index[taxid] = None

    return taxonomy,ranks,alternativeName,index
|
||||||
|
|
||||||
|
def readTaxonomyDB(dbname):
    """Load a taxonomy from the `ncbi_taxonomy` schema of database `dbname`.

    Returns (taxonomy, ranks, alternativeName, index) with the same layout
    as readTaxonomyDump():
      taxonomy        -- [taxid, rank number, parent position] records
                         sorted by taxid, followed by the scientific name
                         when there is one;
      ranks           -- rank name -> rank number;
      alternativeName -- list of (name, classname, taxon position);
      index           -- taxid -> taxon position; aliases point to their
                         current taxon, or to None when they have none.

    A KeyError is raised when a parent taxid is unknown, except for taxon 1
    with a NULL parent, which is made its own parent.
    The connection is now closed before returning, including on error; it
    used to be left open.
    """
    connection = psycopg2.connect(database=dbname)
    try:
        cursor = connection.cursor()

        cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
        taxonomy = [list(row) for row in cursor]

        cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
        rankRows = cursor.fetchall()
        ranks = dict((row[0], number) for number, row in enumerate(rankRows))

        sys.stderr.write("Sorting taxons...\n")
        # same ordering as a stable sort with taxonCmp: taxid only
        taxonomy.sort(key=lambda taxon: taxon[0])

        sys.stderr.write("Indexing taxonomy...\n")
        index = {}
        for taxon in taxonomy:
            index[taxon[0]] = bsearchTaxon(taxonomy, taxon[0])

        sys.stderr.write("Indexing parent and rank...\n")
        for taxon in taxonomy:
            taxon[1] = ranks[taxon[1]]
            try:
                taxon[2] = index[taxon[2]]
            except KeyError:
                # only taxon 1 may have no parent: it becomes its own parent
                if taxon[2] is None and taxon[0] == 1:
                    taxon[2] = index[taxon[0]]
                else:
                    raise

        cursor.execute("select taxid,name,category from ncbi_taxonomy.name")

        alternativeName = []
        for taxid,name,classname in cursor:
            position = index[taxid]
            alternativeName.append((name,classname,position))
            if classname == 'scientific name':
                taxonomy[position].append(name)

        cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")

        sys.stderr.write("Adding taxid alias...\n")
        for taxid,current in cursor:
            if current is not None:
                index[taxid] = index[current]
            else:
                index[taxid] = None
    finally:
        connection.close()

    return taxonomy,ranks,alternativeName,index
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Genbank/EMBL sequence reader
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def entryIterator(file):
    """Yield the flat-file entries of `file` one by one, as strings.

    An entry is every line up to and including a line made of '//' alone.
    Lines found after the last terminator are not yielded.
    """
    pending = []
    for line in universalOpen(file):
        pending.append(line)
        if line == '//\n':
            yield ''.join(pending)
            pending = []
|
||||||
|
|
||||||
|
def fastaEntryIterator(file):
    """Yield the fasta entries of `file` one by one, as strings.

    A new entry starts at each line beginning with '>'; lines found before
    the first '>' are grouped with the first entry.
    """
    pending = []
    for line in universalOpen(file):
        if pending and line[0] == '>':
            yield ''.join(pending)
            pending = []
        pending.append(line)
    if pending:
        yield ''.join(pending)
|
||||||
|
|
||||||
|
# Runs of spaces, newlines and digits: the layout of a flat-file sequence.
_cleanSeq = re.compile('[ \n0-9]+')

def cleanSeq(seq):
    """Return `seq` without any space, newline or digit."""
    return ''.join(_cleanSeq.split(seq))
|
||||||
|
|
||||||
|
|
||||||
|
# Field extractors for one GenBank flat-file entry.
_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )', re.MULTILINE)
_gbParseDE = re.compile(r'(?<=^DEFINITION {2}).+?\. *$(?=[^ ])', re.MULTILINE | re.DOTALL)
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)', re.MULTILINE | re.DOTALL)
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')

def genbankEntryParser(entry):
    """Parse one GenBank entry given as a single string.

    Returns a dict with keys 'id', 'taxid', 'definition' and 'sequence'.
    The definition has its whitespace runs collapsed to single spaces, the
    sequence is upper-cased and stripped of layout by cleanSeq(), and
    'taxid' is the first taxon cross-reference as an int, or None when the
    entry has none.  A missing LOCUS, DEFINITION or ORIGIN field raises
    IndexError.
    """
    identifier = _gbParseID.findall(entry)[0]
    definition = ' '.join(_gbParseDE.findall(entry)[0].split())
    sequence = cleanSeq(_gbParseSQ.findall(entry)[0].upper())

    taxa = _gbParseTX.findall(entry)
    if taxa:
        taxid = int(taxa[0])
    else:
        taxid = None

    return {'id':identifier,'taxid':taxid,'definition':definition,'sequence':sequence}
|
||||||
|
|
||||||
|
######################
|
||||||
|
|
||||||
|
# A line break, together with the 'DE' tag that opens a continuation line.
# The former pattern was the character class '[\nDE]', which deleted every
# 'D' and every 'E' of the definition text as well.
_cleanDef = re.compile('\n(?:DE)?')

def cleanDef(definition):
    """Join a multi-line definition field into one line.

    Line breaks are removed, along with the 'DE' tag that starts each
    continuation line; the rest of the text is left untouched.
    """
    return _cleanDef.sub('',definition)
|
||||||
|
|
||||||
|
# Field extractors for one EMBL flat-file entry.
_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)', re.MULTILINE)
_emblParseDE = re.compile(r'(?<=^DE {3}).+?\. *$(?=[^ ])', re.MULTILINE | re.DOTALL)
_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)', re.MULTILINE | re.DOTALL)
_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')

def emblEntryParser(entry):
    """Parse one EMBL entry given as a single string.

    Returns a dict with keys 'id', 'taxid', 'definition' and 'sequence'.
    The definition goes through cleanDef() and has its whitespace runs
    collapsed, the sequence is upper-cased and stripped of layout by
    cleanSeq(), and 'taxid' is the first taxon cross-reference as an int,
    or None when the entry has none.  A missing ID, DE or sequence field
    raises IndexError.
    """
    identifier = _emblParseID.findall(entry)[0]
    definition = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
    sequence = cleanSeq(_emblParseSQ.findall(entry)[0].upper())

    taxa = _emblParseTX.findall(entry)
    if taxa:
        taxid = int(taxa[0])
    else:
        taxid = None

    return {'id':identifier,'taxid':taxid,'definition':definition,'sequence':sequence}
|
||||||
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
|
||||||
|
# Separator of the ';'-delimited items of a fasta title line.
_fastaSplit = re.compile(r';\W*')

def parseFasta(seq):
    """Split one fasta entry into (id, sequence, definition, info).

    The title line is '>id' optionally followed by ';'-separated items.
    Items of the form key=value are collected in the `info` dict; the
    other items, joined with single spaces, make the definition.  The
    sequence is the concatenation of the stripped remaining lines, in
    upper case.
    """
    lines = seq.split('\n')
    title = lines[0].strip()[1:].split(None,1)
    identifier = title[0]

    items = []
    if len(title) == 2:
        items = _fastaSplit.split(title[1])

    info = dict(item.split('=',1) for item in items if '=' in item)
    definition = ' '.join([item for item in items if '=' not in item])
    sequence = ''.join([line.strip() for line in lines[1:]]).upper()
    return identifier,sequence,definition,info
|
||||||
|
|
||||||
|
|
||||||
|
def fastaEntryParser(entry):
    """Parse one fasta entry given as a single string.

    Returns a dict with keys 'id', 'taxid', 'definition' and 'sequence';
    'taxid' is the integer value of the title's taxid=... item, or None
    when the title has no such item.
    """
    id,seq,definition,info = parseFasta(entry)
    taxid = info.get('taxid')
    if taxid is not None:
        taxid = int(taxid)
    return {'id':id,'taxid':taxid,'definition':definition,'sequence':seq}
|
||||||
|
|
||||||
|
|
||||||
|
def sequenceIteratorFactory(entryParser,entryIterator):
    """Combine an entry splitter and an entry parser into one reader.

    entryIterator -- callable taking a file and yielding raw entries.
    entryParser   -- callable turning one raw entry into a parsed record.

    Returns a generator function taking a file and yielding the parsed
    record of each entry, lazily.
    """
    def sequenceIterator(file):
        for rawEntry in entryIterator(file):
            yield entryParser(rawEntry)

    return sequenceIterator
|
||||||
|
|
||||||
|
|
||||||
|
def taxonomyInfo(entry,connection):
    """Complete `entry` with taxonomic information read from a database.

    entry      -- dict holding an integer under 'taxid'; it is modified in
                  place and returned.
    connection -- object whose cursor() gives access to the `taxonomy`
                  schema queried below.

    The keys 'current_taxid', 'species', 'genus', 'family',
    'scientific_name', 'species_sn', 'genus_sn' and 'family_sn' are set
    from the columns of the single row fetched, in that order.
    """
    query = """
select taxid,species,genus,family,
taxonomy.scientificName(taxid) as sn,
taxonomy.scientificName(species) as species_sn,
taxonomy.scientificName(genus) as genus_sn,
taxonomy.scientificName(family) as family_sn
from
(
select alias as taxid,
taxonomy.getSpecies(alias) as species,
taxonomy.getGenus(alias) as genus,
taxonomy.getFamily(alias) as family
from taxonomy.aliases
where id=%d ) as tax
""" % entry['taxid']

    curseur = connection.cursor()
    curseur.execute(query)
    rep = curseur.fetchone()

    keys = ('current_taxid', 'species', 'genus', 'family',
            'scientific_name', 'species_sn', 'genus_sn', 'family_sn')
    for column, key in enumerate(keys):
        entry[key] = rep[column]
    return entry
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Binary writer
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def ecoSeqPacker(sq):
    """Serialize one sequence record to its big-endian binary form.

    sq -- dict with keys 'taxid', 'id', 'definition' and 'sequence'.

    Layout: record size (excluding this first field), taxid, id stored in
    20 bytes, definition length, sequence length, compressed length,
    definition text, zlib-compressed sequence (level 9).
    """
    definition = sq['definition']
    compressed = gzip.zlib.compress(sq['sequence'],9)
    deflen = len(definition)
    cptlen = len(compressed)

    # everything that follows the leading size field
    totalSize = 4 + 20 + 4 + 4 + 4 + cptlen + deflen

    layout = '> I I 20s I I I %ds %ds' % (deflen,cptlen)
    packed = struct.pack(layout,
                         totalSize,
                         sq['taxid'],
                         sq['id'],
                         deflen,
                         len(sq['sequence']),
                         cptlen,
                         definition,
                         compressed)

    assert len(packed) == totalSize+4, "error in sequence packing"

    return packed
|
||||||
|
|
||||||
|
def ecoTaxPacker(tx):
    """Serialize one taxon record to its big-endian binary form.

    tx -- sequence whose elements 0, 1 and 2 are unsigned integers and
          whose element 3 is the taxon name.

    Layout: record size (excluding this first field), tx[0], tx[1], tx[2],
    name length, name.
    """
    name = tx[3]
    namelength = len(name)
    totalSize = 4 + 4 + 4 + 4 + namelength

    return struct.pack('> I I I I I %ds' % namelength,
                       totalSize,
                       tx[0],
                       tx[1],
                       tx[2],
                       namelength,
                       name)
|
||||||
|
|
||||||
|
def ecoRankPacker(rank):
    """Serialize a rank name as its big-endian length followed by its text."""
    size = len(rank)
    return struct.pack('> I %ds' % size, size, rank)
|
||||||
|
|
||||||
|
def ecoNamePacker(name):
    """Serialize one (name, classname, taxon position) triple.

    Layout, big-endian: record size (excluding this first field), a flag
    set to 1 when the class name is 'scientific name', name length, class
    name length, taxon position, name, class name.
    """
    text, classname, position = name[0], name[1], name[2]
    namelength = len(text)
    classlength = len(classname)
    totalSize = namelength + classlength + 4 + 4 + 4 + 4

    return struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
                       totalSize,
                       int(classname=='scientific name'),
                       namelength,
                       classlength,
                       position,
                       text,
                       classname)
|
||||||
|
|
||||||
|
def ecoSeqWriter(file,input,taxindex,parser):
    """Write the sequence entries read from input into the binary file `file`.

    file     -- path of the output file, opened in binary write mode
    input    -- sequence source handed to universalOpen()
    taxindex -- mapping used to translate each entry's 'taxid'
    parser   -- callable returning an iterable of entries for the opened input

    The file starts with the number of sequences written as a big-endian
    unsigned 32-bit integer. A placeholder of 0 is written first and
    overwritten once all entries have been processed.

    Entries without a taxid, or whose taxid is not a key of taxindex, are
    not written; their ids are collected instead.

    Returns the list of skipped entry ids.
    """
    output = open(file,'wb')
    # try/finally so the output file is closed even when reading, parsing
    # or packing raises.
    try:
        input = universalOpen(input)
        inputsize = fileSize(input)
        entries = parser(input)
        seqcount = 0
        skipped = []

        # Placeholder for the sequence count, fixed up after the loop.
        output.write(struct.pack('> I',seqcount))

        progressBar(1, inputsize,reset=True)
        for entry in entries:
            if entry['taxid'] is not None:
                try:
                    entry['taxid'] = taxindex[entry['taxid']]
                except KeyError:
                    # taxid unknown to the taxonomy index
                    entry['taxid'] = None
                if entry['taxid'] is not None:
                    seqcount += 1
                    output.write(ecoSeqPacker(entry))
                else:
                    skipped.append(entry['id'])
                where = universalTell(input)
                progressBar(where, inputsize)
                print >>sys.stderr," Readed sequences : %d " % seqcount,
            else:
                skipped.append(entry['id'])

        print >>sys.stderr
        # Go back to the start of the file and store the real count.
        output.seek(0,0)
        output.write(struct.pack('> I',seqcount))
    finally:
        output.close()

    return skipped
|
||||||
|
|
||||||
|
|
||||||
|
def ecoTaxWriter(file,taxonomy):
    """Write the taxonomy entries into the binary file `file`.

    file     -- path of the output file, opened in binary write mode
    taxonomy -- sized iterable of entries accepted by ecoTaxPacker()

    The file holds the number of entries as a big-endian unsigned 32-bit
    integer, followed by one packed record per entry in iteration order.
    """
    output = open(file,'wb')
    # try/finally so the file is closed even when packing an entry raises.
    try:
        output.write(struct.pack('> I',len(taxonomy)))

        for tx in taxonomy:
            output.write(ecoTaxPacker(tx))
    finally:
        output.close()
|
||||||
|
|
||||||
|
def ecoRankWriter(file,ranks):
    """Write the rank names into the binary file `file`.

    file  -- path of the output file, opened in binary write mode
    ranks -- mapping whose keys are the rank names

    The file holds the number of ranks as a big-endian unsigned 32-bit
    integer, followed by one packed record per rank name in sorted order.
    """
    output = open(file,'wb')
    # try/finally so the file is closed even when packing a name raises.
    try:
        output.write(struct.pack('> I',len(ranks)))

        # list() keeps the in-place sort valid whatever keys() returns.
        rankNames = list(ranks.keys())
        rankNames.sort()

        for rank in rankNames:
            output.write(ecoRankPacker(rank))
    finally:
        output.close()
|
||||||
|
|
||||||
|
def nameCmp(n1,n2):
    """Three-way, case-insensitive comparison of two name entries.

    Only the first item of each entry is compared, after upper-casing.
    Returns -1, 0 or 1 when n1 sorts before, equal to or after n2.
    """
    left = n1[0].upper()
    right = n2[0].upper()
    # Booleans subtract as integers: (True - False) == 1, (False - True) == -1.
    return (left > right) - (left < right)
|
||||||
|
|
||||||
|
|
||||||
|
def ecoNameWriter(file,names):
    """Write the name entries into the binary file `file`.

    file  -- path of the output file, opened in binary write mode
    names -- list of entries accepted by ecoNamePacker(); it is sorted
             IN PLACE, case-insensitively, on the first item of each entry

    The file holds the number of names as a big-endian unsigned 32-bit
    integer, followed by one packed record per name in sorted order.
    """
    output = open(file,'wb')
    # try/finally so the file is closed even when packing a name raises.
    try:
        output.write(struct.pack('> I',len(names)))

        # Same ordering as the nameCmp comparison function (upper-cased first
        # item, stable), expressed as a sort key: the key is computed once per
        # entry and does not rely on the cmp-function form of list.sort().
        names.sort(key=lambda entry: entry[0].upper())

        for name in names:
            output.write(ecoNamePacker(name))
    finally:
        output.close()
|
||||||
|
|
||||||
|
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
    """Write every file of the database named after prefix.

    prefix       -- common path prefix of the generated files
    taxonomy     -- indexable object: [0] goes to ecoTaxWriter, [1] to
                    ecoRankWriter, [2] to ecoNameWriter and [3] is the taxid
                    index handed to ecoSeqWriter
    seqFileNames -- iterable of sequence inputs, one .sdx file is written
                    per input, numbered from 001
    parser       -- entry parser forwarded to ecoSeqWriter

    The ids skipped by ecoSeqWriter are reported on stderr after each input.
    """
    ranks = taxonomy[1]
    ecoRankWriter('%s.rdx' % prefix, ranks)

    taxa = taxonomy[0]
    ecoTaxWriter('%s.tdx' % prefix, taxa)

    names = taxonomy[2]
    ecoNameWriter('%s.ndx' % prefix, names)

    for position, filename in enumerate(seqFileNames):
        # Output files are numbered starting at 1.
        sdxName = '%s_%03d.sdx' % (prefix, position + 1)
        sk = ecoSeqWriter(sdxName, filename, taxonomy[3], parser)
        if not sk:
            continue
        print >>sys.stderr,"Skipped entry :"
        print >>sys.stderr,sk
|
||||||
|
|
||||||
|
def ecoParseOptions(arguments):
    """Parse the command-line arguments of the formatter.

    arguments -- list of argument strings (without the program name)

    Recognized options:
        -h, --help          print the help and exit
        -t, --taxonomy      taxonomy mode 'dump', value stored in 'taxdir'
        -T, --taxonomy_db   taxonomy mode 'db', value stored in 'taxdb'
        -n, --name          prefix of the generated database files
        -g, --genbank       parse the inputs with the GenBank parser
        -f, --fasta         parse the inputs with the fasta parser
        -e, --embl          parse the inputs with the EMBL parser

    Defaults: prefix 'ecodb', taxonomy mode 'dump' on directory 'taxdump',
    GenBank parser.

    Returns a tuple (opt, filenames): the option dictionary and the list of
    remaining positional arguments.
    Raises getopt.GetoptError on a malformed command line.
    """
    opt = {
           'prefix' : 'ecodb',
           'taxdir' : 'taxdump',
           # Without this default the main script fails with KeyError on
           # opt['taxmod'] when neither -t nor -T is given, although a
           # default 'taxdir' is provided just above.
           'taxmod' : 'dump',
           'parser' : sequenceIteratorFactory(genbankEntryParser,
                                              entryIterator)
          }

    o,filenames = getopt.getopt(arguments,
                                'ht:T:n:gfe',
                                ['help',
                                 'taxonomy=',
                                 'taxonomy_db=',
                                 'name=',
                                 'genbank',
                                 'fasta',
                                 'embl'])

    for name,value in o:
        if name in ('-h','--help'):
            printHelp()
            # sys.exit() rather than exit(): the latter is injected by the
            # site module and is not guaranteed to exist.
            sys.exit()
        elif name in ('-t','--taxonomy'):
            opt['taxmod']='dump'
            opt['taxdir']=value
        elif name in ('-T','--taxonomy_db'):
            opt['taxmod']='db'
            opt['taxdb']=value
        elif name in ('-n','--name'):
            opt['prefix']=value
        elif name in ('-g','--genbank'):
            opt['parser']=sequenceIteratorFactory(genbankEntryParser,
                                                  entryIterator)
        elif name in ('-f','--fasta'):
            opt['parser']=sequenceIteratorFactory(fastaEntryParser,
                                                  fastaEntryIterator)
        elif name in ('-e','--embl'):
            opt['parser']=sequenceIteratorFactory(emblEntryParser,
                                                  entryIterator)
        else:
            # Defensive: getopt already rejects options it was not given.
            raise ValueError('Unknown option %s' % name)

    return opt,filenames
|
||||||
|
|
||||||
|
def printHelp():
|
||||||
|
print "-----------------------------------"
|
||||||
|
print " ecoPCRFormat.py"
|
||||||
|
print "-----------------------------------"
|
||||||
|
print "ecoPCRFormat.py [option] <argument>"
|
||||||
|
print "-----------------------------------"
|
||||||
|
print "-e --embl :[E]mbl format"
|
||||||
|
print "-f --fasta :[F]asta format"
|
||||||
|
print "-g --genbank :[G]enbank format"
|
||||||
|
print "-h --help :[H]elp - print this help"
|
||||||
|
print "-n --name :[N]ame of the new database created"
|
||||||
|
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
|
||||||
|
print " :bcp-like dump from GenBank taxonomy database."
|
||||||
|
print "-----------------------------------"
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Script entry point: parse the command line, load the taxonomy with
    # the selected mode, then write every database file.
    opt,filenames = ecoParseOptions(sys.argv[1:])

    # Fall back on the dump mode when no taxonomy option was given; the
    # options always carry a default 'taxdir'.
    taxmod = opt.get('taxmod','dump')

    if taxmod=='dump':
        taxonomy = readTaxonomyDump(opt['taxdir'])
    elif taxmod=='db':
        taxonomy = readTaxonomyDB(opt['taxdb'])
    else:
        # Fail with an explicit message instead of a NameError on the
        # unbound 'taxonomy' variable below.
        raise ValueError('Unknown taxonomy mode %s' % taxmod)

    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||||||
|
|
Reference in New Issue
Block a user