git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@114 60f365c0-8329-0410-b2a4-ec073aeeaa1d

This commit is contained in:
2007-09-24 09:48:08 +00:00
parent 6cd1141130
commit bc4c7656c6
15 changed files with 1494 additions and 240 deletions

View File

@ -8,104 +8,69 @@
#define VERSION "0.1"
/* ----------------------------------------------- */
/* printout help */ /* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout,
/*
 * PrintHelp - write the program's long help text to stdout.
 *
 * Takes no arguments and returns nothing. Every line is emitted through
 * the PP macro, which expands to the opening of an fprintf(stdout, ...)
 * call; each statement supplies the format string (and arguments) and
 * closes the call itself.
 *
 * VERSION is substituted into the banner lines. MAX_PATTERN and
 * MAX_PAT_LEN are referenced in the notes section and are not defined
 * in this part of the file.
 *
 * NOTE(review): the body contains both apat-style text (patfile/datafile,
 * -a/-c/-m/... options) and ecoPCR text (-d/-e/-i/-k/-l/-L/-r options,
 * result table description), including two version banners and two usage
 * lines. This looks like removed and added lines of a commit shown
 * together - confirm against the checked-in file before relying on it.
 */
static void PrintHelp()
{
PP "------------------------------------------\n");
PP " Apat Version %s\n", VERSION);
PP " ecoPCR Version %s\n", VERSION);
PP "------------------------------------------\n");
PP "synopsis : pattern(s) searching program\n");
PP "usage: apat [options] patfile datafile\n");
PP "synopsis : searching for sequence and taxonomy hybriding with given primers\n");
PP "usage: ecoPCR [options] <nucleotidic patterns>\n");
PP "------------------------------------------\n");
PP "options:\n");
PP "-a code : [A]lphabet encoding for pattern\n");
PP " code is one of : \n");
PP " dna: use IUPAC equivalences for dna/rna\n");
PP " prot: use IUPAC equivalences for proteins\n");
PP " alpha: no equivalences, just treat plain symbols\n");
PP " note: the equivalences are used in pattern only\n");
PP " *not* in sequence(s) (see note (4) below)\n");
PP " dft: alpha\n");
PP "-c : [C]ooccurences\n");
PP " print patterns cooccurence matrix \n");
PP " dft: off\n");
PP "-h : [H]elp - print <this> help\n");
PP "-m : [M]ultiple occurences\n");
PP " see -q option \n");
PP " dft: off\n");
PP "-o file : [O]utput sequences\n");
PP " additionaly output sequence(s) that match into\n");
PP " 'file' in fasta format\n");
PP " dft: off\n");
PP "-p : no [Print] - don't printout hits\n");
PP " when just counts are needed\n");
PP " dft: off\n");
PP "-q nn : [Quorum]\n");
PP " printout result if at least nn\n");
PP " different patterns are found on the sequence\n");
PP " (with -m : at least nn different <hits>)\n");
PP " dft: # of patterns read\n");
PP "-s : no [Sort] - don't sort hits before printing\n");
PP " usually hits are printed by increasing position\n");
PP " this option will list them by pattern\n");
PP " dft: off\n");
PP "-t : [T]est sequence\n");
PP " additionnaly check if sequences are uppercase\n");
PP " this is mostly used for testing\n");
PP " dft: off\n");
PP "-u : [U]pper\n");
PP " force lower->upper sequence conversion\n");
PP " without this option lowercase symbols in sequence\n");
PP " will not be considered to as matches\n");
PP " dft: off\n");
PP "-v : [V]erbose\n");
PP " just display a kind of progress clock on stderr\n");
PP " (this is only useful if you redirect stdout)\n");
PP "\n");
PP "patfile : pattern file (see below)\n");
PP "datafile : database file (see below)\n");
PP "------------------------------------------\n");
PP "pattern file format :\n");
PP " one pattern/line\n");
PP " format : <pattern> <space> #errors\n");
PP " <pattern> := pattern<symbol>\n");
PP " or !pattern<symbol>\n");
PP " or pattern<symbol>#\n");
PP " or !pattern<symbol>#\n");
PP " <symbol> := <letter>\n");
PP " or [<letter>....<letter>]\n");
PP " <letter> := uppercase letter (A-Z)\n");
PP " <number> := a positive number indicates max number of mismatches\n");
PP " a negative number indicates max number of mismatches or indels\n");
PP " # means that no error is allowed at this position\n");
PP " ! complement the <symbol>\n");
PP " [...] means that all symbols within [] are allowed\n");
PP " in addition IUPAC equivalences may be used as symbols\n");
PP " with the -a option\n");
PP "\n");
PP "example: G[DE]S#[GIV]!HP![DE]# 1\n");
PP "-d : [D]atabase : to match the expected format, the database\n");
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
PP " in the tools directory.\n");
PP " ecoPCRFormat.py creates three file types :\n");
PP " .sdx : contains the sequences\n");
PP " .tdx : contains information concerning the taxonomy\n");
PP " .rdx : contains the taxonomy rank\n\n");
PP " ecoPCR needs all the file type. As a result, you have to write the\n");
PP " database radical without any extension. For example /ecoPCRDB/gbmam\n\n");
PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
PP "-h : [H]elp - print <this> help\n\n");
PP "-i : [I]gnore the given taxonomy id.\n");
PP " Taxonomy id are available using the ecofind program.\n");
PP " see its help typing ecofind -h for more information.\n\n");
PP "-k : [K]ingdom mode : set the kingdom mode\n");
PP " super kingdom mode by default.\n\n");
PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
PP "-r : [R]estricts the search to the given taxonomic id.\n");
PP " Taxonomy id are available using the ecofind program.\n");
PP " see its help typing ecofind -h for more information.\n");
PP "\n");
PP "------------------------------------------\n");
PP "datafile contains one or more sequences in\n");
PP "Fasta format, with *uppercase* symbols \n");
PP "\n");
PP "first argument : oligonucleotide for direct strand\n\n");
PP "second argument : oligonucleotide for reverse strand\n\n");
PP "------------------------------------------\n");
/* Limits reported to the user; both constants come from outside this view. */
PP "note (1): the maximum number of patterns is %d\n", MAX_PATTERN);
PP "\n");
PP "note (2): the maximum length for one pattern is %d\n", MAX_PAT_LEN);
PP "\n");
PP "note (3): indels are still experimental and are :\n");
PP " not handled gracefully with the # syntax\n");
PP " and hits are not printed very nicely\n");
PP "\n");
PP "note (4): the IUPAC equivalences (-a option) are used\n");
PP " in pattern only *not* in sequence(s).\n");
PP " for instance GATN (with option -a dna) is equivalent to GAT[ACGT]\n");
PP " and will match GATA/GATC/GATG/GATC but will not match GATN\n");
PP " (nor NNNN) in sequence.\n");
/* Description of the 20 columns listed in the help for the result table. */
PP "Table result description : \n");
PP "column 1 : accession number\n");
PP "column 2 : sequence length\n");
PP "column 3 : taxonomic id\n");
PP "column 4 : rank\n");
PP "column 5 : species taxonomic id\n");
PP "column 6 : scientific name\n");
PP "column 7 : genus taxonomic id\n");
PP "column 8 : genus name\n");
PP "column 9 : family taxonomic id\n");
PP "column 10 : family name\n");
PP "column 11 : super kingdom taxonomic id\n");
PP "column 12 : super kingdom name\n");
PP "column 13 : strand (direct or reverse)\n");
PP "column 14 : first oligonucleotide\n");
PP "column 15 : number of errors for the first strand\n");
PP "column 16 : second oligonucleotide\n");
PP "column 17 : number of errors for the second strand\n");
PP "column 18 : amplification length\n");
PP "column 19 : sequence\n");
PP "column 20 : definition\n");
PP "------------------------------------------\n");
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
PP "------------------------------------------\n\n");
PP "\n");
}
@ -121,10 +86,8 @@ static void PrintHelp()
static void ExitUsage(stat)
int stat;
{
PP "usage: apat [-a dna|prot] [-c] [-h] [-m] [-o file] [-p]\n");
PP " [-q nn] [-t] [-u] [-v]\n");
PP " patfile datafile\n");
PP "type \"apat -h\" for help\n");
PP "usage: ecoPCR [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-k] oligo1 oligo2\n");
PP "type \"ecoPCR -h\" for help\n");
if (stat)
exit(stat);
@ -306,7 +269,7 @@ int main(int argc, char **argv)
int32_t errflag=0;
char kingdom_mode=0;
char *prefix;
char *prefix = NULL;
int32_t checkedSequence = 0;
int32_t positiveSequence= 0;
@ -330,43 +293,39 @@ int main(int argc, char **argv)
int32_t erri;
int32_t errj;
int32_t *restricted_taxid = NULL;
int32_t *ignored_taxid = NULL;
int32_t r=0;
int32_t g=0;
while ((carg = getopt(argc, argv, "h1:2:l:L:e:k")) != -1) {
while ((carg = getopt(argc, argv, "hd:l:L:e:i:r:k")) != -1) {
switch (carg) {
/* -------------------- */
case '1': /* prenier oligo */
case 'd': /* database name */
/* -------------------- */
oligo1 = ECOMALLOC(strlen(optarg)+1,
"Error on oligo 1 allocation");
strcpy(oligo1,optarg);
break;
/* -------------------- */
case '2': /* coocurence option */
/* -------------------- */
oligo2 = ECOMALLOC(strlen(optarg)+1,
"Error on oligo 1 allocation");
strcpy(oligo2,optarg);
prefix = ECOMALLOC(strlen(optarg)+1,
"Error on prefix allocation");
strcpy(prefix,optarg);
break;
/* -------------------- */
case 'h': /* help */
/* -------------------- */
PrintHelp();
exit(0);
break;
/* -------------------- */
case 'l': /* lmin amplification */
/* -------------------- */
/* ------------------------- */
case 'l': /* min amplification lenght */
/* ------------------------- */
sscanf(optarg,"%d",&lmin);
break;
/* -------------------- */
case 'L': /* lmax amplification */
/* -------------------- */
/* -------------------------- */
case 'L': /* max amplification lenght */
/* -------------------------- */
sscanf(optarg,"%d",&lmax);
break;
@ -374,11 +333,29 @@ int main(int argc, char **argv)
case 'e': /* error max */
/* -------------------- */
sscanf(optarg,"%d",&error_max);
break;
/* -------------------- */
case 'k': /* set the kingdom mode */
kingdom_mode = 1; /* -------------------- */
break;
/* ------------------------------------------ */
case 'r': /* stores the restricting search taxonomic id */
/* ------------------------------------------ */
restricted_taxid = ECOREALLOC(restricted_taxid,sizeof(int32_t)*(r+1),
"Error on restricted_taxid reallocation");
sscanf(optarg,"%d",&restricted_taxid[r]);
r++;
break;
case 'k': /* error max */
/* -------------------- */
kingdom_mode = 1;
/* --------------------------------- */
case 'i': /* stores the taxonomic id to ignore */
/* --------------------------------- */
ignored_taxid = ECOREALLOC(ignored_taxid,sizeof(int32_t)*(g+1),
"Error on excluded_taxid reallocation");
sscanf(optarg,"%d",&ignored_taxid[g]);
g++;
break;
/* -------------------- */
@ -389,25 +366,43 @@ int main(int argc, char **argv)
}
if ((argc -= optind) != 1)
errflag++;
/**
* check the path to the database is given as last argument
*/
if ((argc -= optind) == 2)
{
oligo1 = ECOMALLOC(strlen(argv[optind])+1,
"Error on oligo1 allocation");
strcpy(oligo1,argv[optind]);
optind++;
oligo2 = ECOMALLOC(strlen(argv[optind])+1,
"Error on oligo1 allocation");
strcpy(oligo2,argv[optind]);
}
else
errflag++;
if (prefix == NULL)
{
prefix = getenv("ECOPCRDB");
if (prefix == NULL)
errflag++;
}
if (!oligo1 || !oligo2)
errflag++;
if (errflag)
ExitUsage(errflag);
prefix = argv[optind];
o1 = buildPattern(oligo1,error_max);
o2 = buildPattern(oligo2,error_max);
o1c = complementPattern(o1);
o2c = complementPattern(o2);
printf("#\n");
printf("# ecoPCR version %s\n",VERSION);
printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat);
@ -427,102 +422,120 @@ int main(int argc, char **argv)
printf("# output in superkingdom mode\n");
printf("#\n");
taxonomy = read_taxonomy(prefix);
taxonomy = read_taxonomy(prefix,0);
seq = ecoseq_iterator(prefix);
checkedSequence = 0;
positiveSequence= 0;
amplifiatCount = 0;
while(seq)
{
{
checkedSequence++;
scname = taxonomy->taxons->taxon[seq->taxid].name;
strncpy(head,seq->SQ,10);
head[10]=0;
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
tail[10]=0;
/**
* check if current sequence should be included
**/
if ( (r == 0) ||
(eco_is_taxid_included(taxonomy,
restricted_taxid,
r,
taxonomy->taxons->taxon[seq->taxid].taxid)
)
)
if ((g == 0) ||
!(eco_is_taxid_included(taxonomy,
ignored_taxid,
g,
taxonomy->taxons->taxon[seq->taxid].taxid)
)
)
{
apatseq=ecoseq2apatseq(seq,apatseq);
scname = taxonomy->taxons->taxon[seq->taxid].name;
strncpy(head,seq->SQ,10);
head[10]=0;
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
tail[10]=0;
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
o2cHits= 0;
if (o1Hits)
{
stktmp = apatseq->hitpos[0];
begin = stktmp->val[0] + o1->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
else
length= apatseq->seqlen - begin;
apatseq=ecoseq2apatseq(seq,apatseq);
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
if (o2cHits)
for (i=0; i < o1Hits;i++)
{
posi = apatseq->hitpos[0]->val[i];
erri = apatseq->hiterr[0]->val[i];
for (j=0; j < o2cHits; j++)
{
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
errj =apatseq->hiterr[1]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
}
}
}
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
o1cHits= 0;
if (o2Hits)
{
stktmp = apatseq->hitpos[2];
begin = stktmp->val[0] + o2->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
else
length= apatseq->seqlen - begin;
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
o2cHits= 0;
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
if (o1cHits)
for (i=0; i < o2Hits;i++)
if (o1Hits)
{
posi = apatseq->hitpos[2]->val[i];
erri = apatseq->hiterr[2]->val[i];
for (j=0; j < o1cHits; j++)
{
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
errj=apatseq->hiterr[3]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
stktmp = apatseq->hitpos[0];
begin = stktmp->val[0] + o1->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
else
length= apatseq->seqlen - begin;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
}
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
if (o2cHits)
for (i=0; i < o1Hits;i++)
{
posi = apatseq->hitpos[0]->val[i];
erri = apatseq->hiterr[0]->val[i];
for (j=0; j < o2cHits; j++)
{
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
errj =apatseq->hiterr[1]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
}
}
}
}
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
o1cHits= 0;
if (o2Hits)
{
stktmp = apatseq->hitpos[2];
begin = stktmp->val[0] + o2->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
else
length= apatseq->seqlen - begin;
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
if (o1cHits)
for (i=0; i < o2Hits;i++)
{
posi = apatseq->hitpos[2]->val[i];
erri = apatseq->hiterr[2]->val[i];
for (j=0; j < o1cHits; j++)
{
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
errj=apatseq->hiterr[3]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
}
}
}
} /* End of taxonomic selection */
delete_ecoseq(seq);
seq = ecoseq_iterator(NULL);
}
ECOFREE(restricted_taxid, "Error: could not free restricted_taxid\n");
ECOFREE(ignored_taxid, "Error: could not free excluded_taxid\n");
return 0;
}