Initial version of ecoPCR
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@5 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
512
src/ecopcr.c
Normal file
512
src/ecopcr.c
Normal file
@ -0,0 +1,512 @@
|
||||
#include "libecoPCR/ecoPCR.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <getopt.h>
|
||||
|
||||
|
||||
#define VERSION "0.1"
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout help */ /* ----------------------------------------------- */
|
||||
|
||||
#define PP fprintf(stdout,
|
||||
|
||||
static void PrintHelp()
|
||||
{
|
||||
PP "------------------------------------------\n");
|
||||
PP " Apat Version %s\n", VERSION);
|
||||
PP "------------------------------------------\n");
|
||||
PP "synopsis : pattern(s) searching program\n");
|
||||
PP "usage: apat [options] patfile datafile\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "options:\n");
|
||||
PP "-a code : [A]lphabet encoding for pattern\n");
|
||||
PP " code is one of : \n");
|
||||
PP " dna: use IUPAC equivalences for dna/rna\n");
|
||||
PP " prot: use IUPAC equivalences for proteins\n");
|
||||
PP " alpha: no equivalences, just treat plain symbols\n");
|
||||
PP " note: the equivalences are used in pattern only\n");
|
||||
PP " *not* in sequence(s) (see note (4) below)\n");
|
||||
PP " dft: alpha\n");
|
||||
PP "-c : [C]ooccurences\n");
|
||||
PP " print patterns cooccurence matrix \n");
|
||||
PP " dft: off\n");
|
||||
PP "-h : [H]elp - print <this> help\n");
|
||||
PP "-m : [M]ultiple occurences\n");
|
||||
PP " see -q option \n");
|
||||
PP " dft: off\n");
|
||||
PP "-o file : [O]utput sequences\n");
|
||||
PP " additionaly output sequence(s) that match into\n");
|
||||
PP " 'file' in fasta format\n");
|
||||
PP " dft: off\n");
|
||||
PP "-p : no [Print] - don't printout hits\n");
|
||||
PP " when just counts are needed\n");
|
||||
PP " dft: off\n");
|
||||
PP "-q nn : [Quorum]\n");
|
||||
PP " printout result if at least nn\n");
|
||||
PP " different patterns are found on the sequence\n");
|
||||
PP " (with -m : at least nn different <hits>)\n");
|
||||
PP " dft: # of patterns read\n");
|
||||
PP "-s : no [Sort] - don't sort hits before printing\n");
|
||||
PP " usually hits are printed by increasing position\n");
|
||||
PP " this option will list them by pattern\n");
|
||||
PP " dft: off\n");
|
||||
PP "-t : [T]est sequence\n");
|
||||
PP " additionnaly check if sequences are uppercase\n");
|
||||
PP " this is mostly used for testing\n");
|
||||
PP " dft: off\n");
|
||||
PP "-u : [U]pper\n");
|
||||
PP " force lower->upper sequence conversion\n");
|
||||
PP " without this option lowercase symbols in sequence\n");
|
||||
PP " will not be considered to as matches\n");
|
||||
PP " dft: off\n");
|
||||
PP "-v : [V]erbose\n");
|
||||
PP " just display a kind of progress clock on stderr\n");
|
||||
PP " (this is only useful if you redirect stdout)\n");
|
||||
PP "\n");
|
||||
PP "patfile : pattern file (see below)\n");
|
||||
PP "datafile : database file (see below)\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "pattern file format :\n");
|
||||
PP " one pattern/line\n");
|
||||
PP " format : <pattern> <space> #errors\n");
|
||||
PP " <pattern> := pattern<symbol>\n");
|
||||
PP " or !pattern<symbol>\n");
|
||||
PP " or pattern<symbol>#\n");
|
||||
PP " or !pattern<symbol>#\n");
|
||||
PP " <symbol> := <letter>\n");
|
||||
PP " or [<letter>....<letter>]\n");
|
||||
PP " <letter> := uppercase letter (A-Z)\n");
|
||||
PP " <number> := a positive number indicates max number of mismatches\n");
|
||||
PP " a negative number indicates max number of mismatches or indels\n");
|
||||
PP " # means that no error is allowed at this position\n");
|
||||
PP " ! complement the <symbol>\n");
|
||||
PP " [...] means that all symbols within [] are allowed\n");
|
||||
PP " in addition IUPAC equivalences may be used as symbols\n");
|
||||
PP " with the -a option\n");
|
||||
PP "\n");
|
||||
PP "example: G[DE]S#[GIV]!HP![DE]# 1\n");
|
||||
PP "\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "datafile contains one or more sequences in\n");
|
||||
PP "Fasta format, with *uppercase* symbols \n");
|
||||
PP "\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "note (1): the maximum number of patterns is %d\n", MAX_PATTERN);
|
||||
PP "\n");
|
||||
PP "note (2): the maximum length for one pattern is %d\n", MAX_PAT_LEN);
|
||||
PP "\n");
|
||||
PP "note (3): indels are still experimental and are :\n");
|
||||
PP " not handled gracefully with the # syntax\n");
|
||||
PP " and hits are not printed very nicely\n");
|
||||
PP "\n");
|
||||
PP "note (4): the IUPAC equivalences (-a option) are used\n");
|
||||
PP " in pattern only *not* in sequence(s).\n");
|
||||
PP " for instance GATN (with option -a dna) is equivalent to GAT[ACGT]\n");
|
||||
PP " and will match GATA/GATC/GATG/GATC but will not match GATN\n");
|
||||
PP " (nor NNNN) in sequence.\n");
|
||||
PP "\n");
|
||||
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout usage and exit */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define PP fprintf(stderr,
|
||||
|
||||
static void ExitUsage(stat)
|
||||
int stat;
|
||||
{
|
||||
PP "usage: apat [-a dna|prot] [-c] [-h] [-m] [-o file] [-p]\n");
|
||||
PP " [-q nn] [-t] [-u] [-v]\n");
|
||||
PP " patfile datafile\n");
|
||||
PP "type \"apat -h\" for help\n");
|
||||
|
||||
if (stat)
|
||||
exit(stat);
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
void printRepeat(ecoseq_t *seq,
|
||||
PatternPtr o1, PatternPtr o2,
|
||||
char strand,
|
||||
int32_t pos1, int32_t pos2,
|
||||
int32_t err1, int32_t err2,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
char *AC;
|
||||
int32_t seqlength;
|
||||
int32_t taxid;
|
||||
int32_t species_taxid;
|
||||
int32_t genus_taxid;
|
||||
int32_t family_taxid;
|
||||
int32_t superkingdom_taxid;
|
||||
char *rank;
|
||||
char *scientificName;
|
||||
char *genus_name;
|
||||
char *family_name;
|
||||
char *superkingdom_name;
|
||||
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *main_taxon;
|
||||
|
||||
char oligo1[MAX_PAT_LEN+1];
|
||||
char oligo2[MAX_PAT_LEN+1];
|
||||
|
||||
int32_t error1;
|
||||
int32_t error2;
|
||||
|
||||
char *amplifia = NULL;
|
||||
int32_t amplength;
|
||||
|
||||
AC = seq->AC;
|
||||
seqlength = seq->SQ_length;
|
||||
|
||||
main_taxon = &taxonomy->taxons->taxon[seq->taxid];
|
||||
taxid = main_taxon->taxid;
|
||||
scientificName= main_taxon->name;
|
||||
rank = taxonomy->ranks->label[main_taxon->rank];
|
||||
taxon = eco_getspecies(main_taxon,taxonomy);
|
||||
if (taxon)
|
||||
{
|
||||
species_taxid = taxon->taxid;
|
||||
scientificName= taxon->name;
|
||||
}
|
||||
else
|
||||
species_taxid = -1;
|
||||
|
||||
taxon = eco_getgenus((taxon) ? taxon:main_taxon,taxonomy);
|
||||
if (taxon)
|
||||
{
|
||||
genus_taxid = taxon->taxid;
|
||||
genus_name= taxon->name;
|
||||
}
|
||||
else
|
||||
{
|
||||
genus_taxid = -1;
|
||||
genus_name = "###";
|
||||
}
|
||||
|
||||
taxon = eco_getfamily((taxon) ? taxon:main_taxon,taxonomy);
|
||||
if (taxon)
|
||||
{
|
||||
family_taxid = taxon->taxid;
|
||||
family_name= taxon->name;
|
||||
}
|
||||
else
|
||||
{
|
||||
family_taxid = -1;
|
||||
family_name = "###";
|
||||
}
|
||||
|
||||
taxon = eco_getsuperkingdom((taxon) ? taxon:main_taxon,taxonomy);
|
||||
if (taxon)
|
||||
{
|
||||
superkingdom_taxid = taxon->taxid;
|
||||
superkingdom_name= taxon->name;
|
||||
}
|
||||
else
|
||||
{
|
||||
superkingdom_taxid = -1;
|
||||
superkingdom_name = "###";
|
||||
}
|
||||
|
||||
amplength = pos2-pos1;
|
||||
|
||||
amplifia = getSubSequence(seq->SQ,pos1,pos2);
|
||||
|
||||
|
||||
if (strand=='R')
|
||||
{
|
||||
ecoComplementSequence(amplifia);
|
||||
strncpy(oligo1,amplifia,o2->patlen);
|
||||
oligo1[o2->patlen]=0;
|
||||
error1=err2;
|
||||
|
||||
strncpy(oligo2,amplifia + amplength - o1->patlen,o1->patlen);
|
||||
oligo2[o1->patlen]=0;
|
||||
error2=err1;
|
||||
|
||||
amplifia+=o2->patlen;
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(oligo1,amplifia,o1->patlen);
|
||||
oligo1[o1->patlen]=0;
|
||||
error1=err1;
|
||||
|
||||
strncpy(oligo2,amplifia + amplength - o2->patlen,o2->patlen);
|
||||
oligo2[o2->patlen]=0;
|
||||
error2=err2;
|
||||
|
||||
amplifia+=o1->patlen;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
ecoComplementSequence(oligo2);
|
||||
amplifia[amplength - o2->patlen - o1->patlen]=0;
|
||||
|
||||
printf("%-15s | %9d | %8d | %-20s | %8d | %-30s | %8d | %-30s | %8d | %-30s | %8d | %-30s | %c | %-32s | %2d | %-32s | %2d | %5d | %s | %s\n",
|
||||
AC,
|
||||
seqlength,
|
||||
taxid,
|
||||
rank,
|
||||
species_taxid,
|
||||
scientificName,
|
||||
genus_taxid,
|
||||
genus_name,
|
||||
family_taxid,
|
||||
family_name,
|
||||
superkingdom_taxid,
|
||||
superkingdom_name,
|
||||
strand,
|
||||
oligo1,
|
||||
error1,
|
||||
oligo2,
|
||||
error2,
|
||||
amplength,
|
||||
amplifia,
|
||||
seq->DE
|
||||
);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
ecoseq_t *seq;
|
||||
ecotaxonomy_t *taxonomy;
|
||||
char *scname;
|
||||
char head[11];
|
||||
char tail[11];
|
||||
|
||||
int carg;
|
||||
|
||||
char *oligo1=NULL;
|
||||
char *oligo2=NULL;
|
||||
|
||||
PatternPtr o1;
|
||||
PatternPtr o2;
|
||||
PatternPtr o1c;
|
||||
PatternPtr o2c;
|
||||
|
||||
int32_t lmin=0;
|
||||
int32_t lmax=0;
|
||||
int32_t error_max=0;
|
||||
int32_t errflag=0;
|
||||
|
||||
char *prefix;
|
||||
|
||||
int32_t checkedSequence = 0;
|
||||
int32_t positiveSequence= 0;
|
||||
int32_t amplifiatCount = 0;
|
||||
|
||||
int32_t o1Hits;
|
||||
int32_t o2Hits;
|
||||
int32_t o1cHits;
|
||||
int32_t o2cHits;
|
||||
|
||||
int32_t begin;
|
||||
int32_t length;
|
||||
|
||||
SeqPtr apatseq=NULL;
|
||||
StackiPtr stktmp;
|
||||
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
int32_t posi;
|
||||
int32_t posj;
|
||||
int32_t erri;
|
||||
int32_t errj;
|
||||
|
||||
|
||||
while ((carg = getopt(argc, argv, "h1:2:l:L:e:")) != -1) {
|
||||
|
||||
switch (carg) {
|
||||
/* -------------------- */
|
||||
case '1': /* prenier oligo */
|
||||
/* -------------------- */
|
||||
oligo1 = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on oligo 1 allocation");
|
||||
strcpy(oligo1,optarg);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case '2': /* coocurence option */
|
||||
/* -------------------- */
|
||||
oligo2 = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on oligo 1 allocation");
|
||||
strcpy(oligo2,optarg);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'h': /* help */
|
||||
/* -------------------- */
|
||||
|
||||
PrintHelp();
|
||||
exit(0);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'l': /* lmin amplification */
|
||||
/* -------------------- */
|
||||
sscanf(optarg,"%d",&lmin);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'L': /* lmax amplification */
|
||||
/* -------------------- */
|
||||
sscanf(optarg,"%d",&lmax);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'e': /* error max */
|
||||
/* -------------------- */
|
||||
sscanf(optarg,"%d",&error_max);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case '?': /* bad option */
|
||||
/* -------------------- */
|
||||
errflag++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ((argc -= optind) != 1)
|
||||
errflag++;
|
||||
|
||||
if (!oligo1 || !oligo2)
|
||||
errflag++;
|
||||
|
||||
if (errflag)
|
||||
ExitUsage(errflag);
|
||||
|
||||
prefix = argv[optind];
|
||||
|
||||
o1 = buildPattern(oligo1,error_max);
|
||||
o2 = buildPattern(oligo2,error_max);
|
||||
|
||||
o1c = complementPattern(o1);
|
||||
o2c = complementPattern(o2);
|
||||
|
||||
|
||||
|
||||
printf("#\n");
|
||||
printf("# ecoPCR version %s\n",VERSION);
|
||||
printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat);
|
||||
printf("# reverse strand oligo2 : %-32s ; oligo1c : %32s\n", o2->cpat,o1c->cpat);
|
||||
printf("# max error count by oligonucleotide : %d\n",error_max);
|
||||
printf("# database : %s\n",prefix);
|
||||
if (lmin && lmax)
|
||||
printf("# amplifiat length between [%d,%d] bp\n",lmin,lmax);
|
||||
else if (lmin)
|
||||
printf("# amplifiat length larger than %d bp\n",lmin);
|
||||
else if (lmax)
|
||||
printf("# amplifiat length smaller than %d bp\n",lmax);
|
||||
printf("#\n");
|
||||
|
||||
taxonomy = read_taxonomy(prefix);
|
||||
|
||||
seq = ecoseq_iterator(prefix);
|
||||
|
||||
|
||||
checkedSequence = 0;
|
||||
positiveSequence= 0;
|
||||
amplifiatCount = 0;
|
||||
|
||||
while(seq)
|
||||
{
|
||||
checkedSequence++;
|
||||
|
||||
scname = taxonomy->taxons->taxon[seq->taxid].name;
|
||||
strncpy(head,seq->SQ,10);
|
||||
head[10]=0;
|
||||
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
|
||||
tail[10]=0;
|
||||
|
||||
|
||||
apatseq=ecoseq2apatseq(seq,apatseq);
|
||||
|
||||
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
|
||||
o2cHits= 0;
|
||||
|
||||
|
||||
if (o1Hits)
|
||||
{
|
||||
stktmp = apatseq->hitpos[0];
|
||||
begin = stktmp->val[0] + o1->patlen;
|
||||
|
||||
if (lmax)
|
||||
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
|
||||
else
|
||||
length= apatseq->seqlen - begin;
|
||||
|
||||
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
|
||||
|
||||
if (o2cHits)
|
||||
for (i=0; i < o1Hits;i++)
|
||||
{
|
||||
posi = apatseq->hitpos[0]->val[i];
|
||||
erri = apatseq->hiterr[0]->val[i];
|
||||
for (j=0; j < o2cHits; j++)
|
||||
{
|
||||
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
|
||||
errj =apatseq->hiterr[1]->val[j];
|
||||
length=posj - posi + 1 - o1->patlen - o2->patlen;
|
||||
|
||||
if ((!lmin || (length >= lmin)) &&
|
||||
(!lmax || (length <= lmax)))
|
||||
printRepeat(seq,o1,o2c,'D',posi,posj,erri,errj,taxonomy);
|
||||
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
|
||||
o1cHits= 0;
|
||||
if (o2Hits)
|
||||
{
|
||||
stktmp = apatseq->hitpos[2];
|
||||
begin = stktmp->val[0] + o2->patlen;
|
||||
|
||||
if (lmax)
|
||||
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
|
||||
else
|
||||
length= apatseq->seqlen - begin;
|
||||
|
||||
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
|
||||
|
||||
if (o1cHits)
|
||||
for (i=0; i < o2Hits;i++)
|
||||
{
|
||||
posi = apatseq->hitpos[2]->val[i];
|
||||
erri = apatseq->hiterr[2]->val[i];
|
||||
for (j=0; j < o1cHits; j++)
|
||||
{
|
||||
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
|
||||
errj=apatseq->hiterr[3]->val[j];
|
||||
length=posj - posi + 1 - o1->patlen - o2->patlen;
|
||||
|
||||
if ((!lmin || (length >= lmin)) &&
|
||||
(!lmax || (length <= lmax)))
|
||||
printRepeat(seq,o2,o1c,'R',posi,posj,erri,errj,taxonomy);
|
||||
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
delete_ecoseq(seq);
|
||||
|
||||
seq = ecoseq_iterator(NULL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user