Files
ecopcr/src/ecopcr.c

475 lines
13 KiB
C
Raw Normal View History

#include "libecoPCR/ecoPCR.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>
#define VERSION "0.1"
/* ----------------------------------------------- */
/* printout help */ /* ----------------------------------------------- */
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "------------------------------------------\n");
PP " ecoPCR Version %s\n", VERSION);
PP "------------------------------------------\n");
PP "synopsis : searching for sequence and taxonomy hybridingwith given primers\n");
PP "usage: ecoPCR [options] datafile\n");
PP "------------------------------------------\n");
PP "options:\n");
PP "-1 : [FIRST] oligonucleotide for direct strand\n\n");
PP "-2 : [SECOND] oligonucleotide for reverse strand\n\n");
PP "-e : [E]rror \n");
PP " : max error allowed by oligonucleotide\n\n");
PP "-h : [H]elp - print <this> help\n\n");
PP "-i : [I]gnore the given taxonomy id.\n");
PP " taxonomy id are available using the ecofind program.\n");
PP " see its help typing ecofind -h for more information.\n");
PP "-k : [K]ingdom mode\n");
PP " set the kingdom mode\n");
PP " super kingdom mode by default.\n\n");
PP "-l : minimum [L]ength\n");
PP " define the minimum amplication length. \n\n");
PP "-L : maximum [L]ength\n");
PP " define the maximum amplicationlength. \n\n");
PP "-r : [R]estricts the search to the given taxonomy id.\n");
PP " taxonomy id are available using the ecofind program.\n");
PP " see its help typing ecofind -h for more information.\n");
PP "\n");
PP "------------------------------------------\n");
PP "datafile : to match the expected format, the database\n");
PP "has to be formated first by the ecoPCRFormat.py program located.\n");
PP "in the tools directory.\n");
PP "ecoPCRFormat.py creates three file types :");
PP " .sdx : contains the sequences\n");
PP " .tdx : contains information concerning the taxonomy\n");
PP " .rdx : contains the taxonomy rank\n\n");
PP "ecoPCR needs all the file type. As a result, you have to write the\n");
PP "datafile radical without any extension. For example /database/gbmam\n");
PP "------------------------------------------\n");
PP "\n");
}
#undef PP
/* ----------------------------------------------- */
/* printout usage and exit */
/* ----------------------------------------------- */
#define PP fprintf(stderr,
static void ExitUsage(stat)
int stat;
{
PP "usage: ecoPCR [-1 oligo1] [-2 oligo2] [-l value] [-L value] [-e value] [-k] datafile\n");
PP "type \"apat -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
void printRepeat(ecoseq_t *seq,
PatternPtr o1, PatternPtr o2,
char strand,
char kingdom,
int32_t pos1, int32_t pos2,
int32_t err1, int32_t err2,
ecotaxonomy_t *taxonomy)
{
char *AC;
int32_t seqlength;
int32_t taxid;
int32_t species_taxid;
int32_t genus_taxid;
int32_t family_taxid;
int32_t superkingdom_taxid;
char *rank;
char *scientificName;
char *genus_name;
char *family_name;
char *superkingdom_name;
ecotx_t *taxon;
ecotx_t *main_taxon;
char oligo1[MAX_PAT_LEN+1];
char oligo2[MAX_PAT_LEN+1];
int32_t error1;
int32_t error2;
char *amplifia = NULL;
int32_t amplength;
AC = seq->AC;
seqlength = seq->SQ_length;
main_taxon = &taxonomy->taxons->taxon[seq->taxid];
taxid = main_taxon->taxid;
scientificName= main_taxon->name;
rank = taxonomy->ranks->label[main_taxon->rank];
taxon = eco_getspecies(main_taxon,taxonomy);
if (taxon)
{
species_taxid = taxon->taxid;
scientificName= taxon->name;
}
else
species_taxid = -1;
taxon = eco_getgenus((taxon) ? taxon:main_taxon,taxonomy);
if (taxon)
{
genus_taxid = taxon->taxid;
genus_name= taxon->name;
}
else
{
genus_taxid = -1;
genus_name = "###";
}
taxon = eco_getfamily((taxon) ? taxon:main_taxon,taxonomy);
if (taxon)
{
family_taxid = taxon->taxid;
family_name= taxon->name;
}
else
{
family_taxid = -1;
family_name = "###";
}
if (kingdom)
taxon = eco_getkingdom((taxon) ? taxon:main_taxon,taxonomy);
else
taxon = eco_getsuperkingdom((taxon) ? taxon:main_taxon,taxonomy);
if (taxon)
{
superkingdom_taxid = taxon->taxid;
superkingdom_name= taxon->name;
}
else
{
superkingdom_taxid = -1;
superkingdom_name = "###";
}
amplength = pos2-pos1;
amplifia = getSubSequence(seq->SQ,pos1,pos2);
if (strand=='R')
{
ecoComplementSequence(amplifia);
strncpy(oligo1,amplifia,o2->patlen);
oligo1[o2->patlen]=0;
error1=err2;
strncpy(oligo2,amplifia + amplength - o1->patlen,o1->patlen);
oligo2[o1->patlen]=0;
error2=err1;
amplifia+=o2->patlen;
}
else
{
strncpy(oligo1,amplifia,o1->patlen);
oligo1[o1->patlen]=0;
error1=err1;
strncpy(oligo2,amplifia + amplength - o2->patlen,o2->patlen);
oligo2[o2->patlen]=0;
error2=err2;
amplifia+=o1->patlen;
}
ecoComplementSequence(oligo2);
amplifia[amplength - o2->patlen - o1->patlen]=0;
printf("%-15s | %9d | %8d | %-20s | %8d | %-30s | %8d | %-30s | %8d | %-30s | %8d | %-30s | %c | %-32s | %2d | %-32s | %2d | %5d | %s | %s\n",
AC,
seqlength,
taxid,
rank,
species_taxid,
scientificName,
genus_taxid,
genus_name,
family_taxid,
family_name,
superkingdom_taxid,
superkingdom_name,
strand,
oligo1,
error1,
oligo2,
error2,
amplength,
amplifia,
seq->DE
);
}
int main(int argc, char **argv)
{
ecoseq_t *seq;
ecotaxonomy_t *taxonomy;
char *scname;
char head[11];
char tail[11];
int carg;
char *oligo1=NULL;
char *oligo2=NULL;
PatternPtr o1;
PatternPtr o2;
PatternPtr o1c;
PatternPtr o2c;
int32_t lmin=0;
int32_t lmax=0;
int32_t error_max=0;
int32_t errflag=0;
char kingdom_mode=0;
char *prefix;
int32_t checkedSequence = 0;
int32_t positiveSequence= 0;
int32_t amplifiatCount = 0;
int32_t o1Hits;
int32_t o2Hits;
int32_t o1cHits;
int32_t o2cHits;
int32_t begin;
int32_t length;
SeqPtr apatseq=NULL;
StackiPtr stktmp;
int32_t i;
int32_t j;
int32_t posi;
int32_t posj;
int32_t erri;
int32_t errj;
while ((carg = getopt(argc, argv, "h1:2:l:L:e:k")) != -1) {
switch (carg) {
/* -------------------- */
case '1': /* first primer */
/* -------------------- */
oligo1 = ECOMALLOC(strlen(optarg)+1,
"Error on oligo 1 allocation");
strcpy(oligo1,optarg);
break;
/* -------------------- */
case '2': /* second primer */
/* -------------------- */
oligo2 = ECOMALLOC(strlen(optarg)+1,
"Error on oligo 1 allocation");
strcpy(oligo2,optarg);
break;
/* -------------------- */
case 'h': /* help */
/* -------------------- */
PrintHelp();
exit(0);
break;
/* ------------------------- */
case 'l': /* min amplification lenght */
/* ------------------------- */
sscanf(optarg,"%d",&lmin);
break;
/* -------------------------- */
case 'L': /* max amplification lenght */
/* -------------------------- */
sscanf(optarg,"%d",&lmax);
break;
/* -------------------- */
case 'e': /* error max */
/* -------------------- */
sscanf(optarg,"%d",&error_max);
break;
/* -------------------- */
case 'k': /* set the kingdom mode */
/* -------------------- */
kingdom_mode = 1;
break;
/* -------------------- */
case '?': /* bad option */
/* -------------------- */
errflag++;
}
}
/**
* check the path to the database is given as last argument
*/
if ((argc -= optind) != 1)
errflag++;
if (!oligo1 || !oligo2)
errflag++;
if (errflag)
ExitUsage(errflag);
prefix = argv[optind];
o1 = buildPattern(oligo1,error_max);
o2 = buildPattern(oligo2,error_max);
o1c = complementPattern(o1);
o2c = complementPattern(o2);
printf("########\n");
printf("# ecoPCR version %s\n",VERSION);
printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat);
printf("# reverse strand oligo2 : %-32s ; oligo1c : %32s\n", o2->cpat,o1c->cpat);
printf("# max error count by oligonucleotide : %d\n",error_max);
printf("# database : %s\n",prefix);
if (lmin && lmax)
printf("# amplifiat length between [%d,%d] bp\n",lmin,lmax);
else if (lmin)
printf("# amplifiat length larger than %d bp\n",lmin);
else if (lmax)
printf("# amplifiat length smaller than %d bp\n",lmax);
if (kingdom_mode)
printf("# output in kingdom mode\n");
else
printf("# output in superkingdom mode\n");
printf("#\n");
taxonomy = read_taxonomy(prefix);
seq = ecoseq_iterator(prefix);
checkedSequence = 0;
positiveSequence= 0;
amplifiatCount = 0;
while(seq)
{
checkedSequence++;
scname = taxonomy->taxons->taxon[seq->taxid].name;
strncpy(head,seq->SQ,10);
head[10]=0;
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
tail[10]=0;
apatseq=ecoseq2apatseq(seq,apatseq);
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
o2cHits= 0;
if (o1Hits)
{
stktmp = apatseq->hitpos[0];
begin = stktmp->val[0] + o1->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
else
length= apatseq->seqlen - begin;
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
if (o2cHits)
for (i=0; i < o1Hits;i++)
{
posi = apatseq->hitpos[0]->val[i];
erri = apatseq->hiterr[0]->val[i];
for (j=0; j < o2cHits; j++)
{
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
errj =apatseq->hiterr[1]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
}
}
}
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
o1cHits= 0;
if (o2Hits)
{
stktmp = apatseq->hitpos[2];
begin = stktmp->val[0] + o2->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
else
length= apatseq->seqlen - begin;
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
if (o1cHits)
for (i=0; i < o2Hits;i++)
{
posi = apatseq->hitpos[2]->val[i];
erri = apatseq->hiterr[2]->val[i];
for (j=0; j < o1cHits; j++)
{
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
errj=apatseq->hiterr[3]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
}
}
}
delete_ecoseq(seq);
seq = ecoseq_iterator(NULL);
}
return 0;
}