#include "libecoPCR/ecoPCR.h" #include #include #include #include #define VERSION "0.1" /* ----------------------------------------------- */ /* printout help */ /* ----------------------------------------------- */ #define PP fprintf(stdout, static void PrintHelp() { PP "------------------------------------------\n"); PP " Apat Version %s\n", VERSION); PP "------------------------------------------\n"); PP "synopsis : pattern(s) searching program\n"); PP "usage: apat [options] patfile datafile\n"); PP "------------------------------------------\n"); PP "options:\n"); PP "-a code : [A]lphabet encoding for pattern\n"); PP " code is one of : \n"); PP " dna: use IUPAC equivalences for dna/rna\n"); PP " prot: use IUPAC equivalences for proteins\n"); PP " alpha: no equivalences, just treat plain symbols\n"); PP " note: the equivalences are used in pattern only\n"); PP " *not* in sequence(s) (see note (4) below)\n"); PP " dft: alpha\n"); PP "-c : [C]ooccurences\n"); PP " print patterns cooccurence matrix \n"); PP " dft: off\n"); PP "-h : [H]elp - print help\n"); PP "-m : [M]ultiple occurences\n"); PP " see -q option \n"); PP " dft: off\n"); PP "-o file : [O]utput sequences\n"); PP " additionaly output sequence(s) that match into\n"); PP " 'file' in fasta format\n"); PP " dft: off\n"); PP "-p : no [Print] - don't printout hits\n"); PP " when just counts are needed\n"); PP " dft: off\n"); PP "-q nn : [Quorum]\n"); PP " printout result if at least nn\n"); PP " different patterns are found on the sequence\n"); PP " (with -m : at least nn different )\n"); PP " dft: # of patterns read\n"); PP "-s : no [Sort] - don't sort hits before printing\n"); PP " usually hits are printed by increasing position\n"); PP " this option will list them by pattern\n"); PP " dft: off\n"); PP "-t : [T]est sequence\n"); PP " additionnaly check if sequences are uppercase\n"); PP " this is mostly used for testing\n"); PP " dft: off\n"); PP "-u : [U]pper\n"); PP " force lower->upper sequence conversion\n"); PP " without this option lowercase symbols in sequence\n"); PP " will not be considered to as matches\n"); PP " dft: off\n"); PP "-v : [V]erbose\n"); PP " just display a kind of progress clock on stderr\n"); PP " (this is only useful if you redirect stdout)\n"); PP "\n"); PP "patfile : pattern file (see below)\n"); PP "datafile : database file (see below)\n"); PP "------------------------------------------\n"); PP "pattern file format :\n"); PP " one pattern/line\n"); PP " format : #errors\n"); PP " := pattern\n"); PP " or !pattern\n"); PP " or pattern#\n"); PP " or !pattern#\n"); PP " := \n"); PP " or [....]\n"); PP " := uppercase letter (A-Z)\n"); PP " := a positive number indicates max number of mismatches\n"); PP " a negative number indicates max number of mismatches or indels\n"); PP " # means that no error is allowed at this position\n"); PP " ! complement the \n"); PP " [...] means that all symbols within [] are allowed\n"); PP " in addition IUPAC equivalences may be used as symbols\n"); PP " with the -a option\n"); PP "\n"); PP "example: G[DE]S#[GIV]!HP![DE]# 1\n"); PP "\n"); PP "------------------------------------------\n"); PP "datafile contains one or more sequences in\n"); PP "Fasta format, with *uppercase* symbols \n"); PP "\n"); PP "------------------------------------------\n"); PP "note (1): the maximum number of patterns is %d\n", MAX_PATTERN); PP "\n"); PP "note (2): the maximum length for one pattern is %d\n", MAX_PAT_LEN); PP "\n"); PP "note (3): indels are still experimental and are :\n"); PP " not handled gracefully with the # syntax\n"); PP " and hits are not printed very nicely\n"); PP "\n"); PP "note (4): the IUPAC equivalences (-a option) are used\n"); PP " in pattern only *not* in sequence(s).\n"); PP " for instance GATN (with option -a dna) is equivalent to GAT[ACGT]\n"); PP " and will match GATA/GATC/GATG/GATC but will not match GATN\n"); PP " (nor NNNN) in sequence.\n"); PP "\n"); } #undef PP /* ----------------------------------------------- */ /* printout usage and exit */ /* ----------------------------------------------- */ #define PP fprintf(stderr, static void ExitUsage(stat) int stat; { PP "usage: apat [-a dna|prot] [-c] [-h] [-m] [-o file] [-p]\n"); PP " [-q nn] [-t] [-u] [-v]\n"); PP " patfile datafile\n"); PP "type \"apat -h\" for help\n"); if (stat) exit(stat); } #undef PP void printRepeat(ecoseq_t *seq, PatternPtr o1, PatternPtr o2, char strand, int32_t pos1, int32_t pos2, int32_t err1, int32_t err2, ecotaxonomy_t *taxonomy) { char *AC; int32_t seqlength; int32_t taxid; int32_t species_taxid; int32_t genus_taxid; int32_t family_taxid; int32_t superkingdom_taxid; char *rank; char *scientificName; char *genus_name; char *family_name; char *superkingdom_name; ecotx_t *taxon; ecotx_t *main_taxon; char oligo1[MAX_PAT_LEN+1]; char oligo2[MAX_PAT_LEN+1]; int32_t error1; int32_t error2; char *amplifia = NULL; int32_t amplength; AC = seq->AC; seqlength = seq->SQ_length; main_taxon = &taxonomy->taxons->taxon[seq->taxid]; taxid = main_taxon->taxid; scientificName= main_taxon->name; rank = taxonomy->ranks->label[main_taxon->rank]; taxon = eco_getspecies(main_taxon,taxonomy); if (taxon) { species_taxid = taxon->taxid; scientificName= taxon->name; } else species_taxid = -1; taxon = eco_getgenus((taxon) ? taxon:main_taxon,taxonomy); if (taxon) { genus_taxid = taxon->taxid; genus_name= taxon->name; } else { genus_taxid = -1; genus_name = "###"; } taxon = eco_getfamily((taxon) ? taxon:main_taxon,taxonomy); if (taxon) { family_taxid = taxon->taxid; family_name= taxon->name; } else { family_taxid = -1; family_name = "###"; } taxon = eco_getsuperkingdom((taxon) ? taxon:main_taxon,taxonomy); if (taxon) { superkingdom_taxid = taxon->taxid; superkingdom_name= taxon->name; } else { superkingdom_taxid = -1; superkingdom_name = "###"; } amplength = pos2-pos1; amplifia = getSubSequence(seq->SQ,pos1,pos2); if (strand=='R') { ecoComplementSequence(amplifia); strncpy(oligo1,amplifia,o2->patlen); oligo1[o2->patlen]=0; error1=err2; strncpy(oligo2,amplifia + amplength - o1->patlen,o1->patlen); oligo2[o1->patlen]=0; error2=err1; amplifia+=o2->patlen; } else { strncpy(oligo1,amplifia,o1->patlen); oligo1[o1->patlen]=0; error1=err1; strncpy(oligo2,amplifia + amplength - o2->patlen,o2->patlen); oligo2[o2->patlen]=0; error2=err2; amplifia+=o1->patlen; } ecoComplementSequence(oligo2); amplifia[amplength - o2->patlen - o1->patlen]=0; printf("%-15s | %9d | %8d | %-20s | %8d | %-30s | %8d | %-30s | %8d | %-30s | %8d | %-30s | %c | %-32s | %2d | %-32s | %2d | %5d | %s | %s\n", AC, seqlength, taxid, rank, species_taxid, scientificName, genus_taxid, genus_name, family_taxid, family_name, superkingdom_taxid, superkingdom_name, strand, oligo1, error1, oligo2, error2, amplength, amplifia, seq->DE ); } int main(int argc, char **argv) { ecoseq_t *seq; ecotaxonomy_t *taxonomy; char *scname; char head[11]; char tail[11]; int carg; char *oligo1=NULL; char *oligo2=NULL; PatternPtr o1; PatternPtr o2; PatternPtr o1c; PatternPtr o2c; int32_t lmin=0; int32_t lmax=0; int32_t error_max=0; int32_t errflag=0; char *prefix; int32_t checkedSequence = 0; int32_t positiveSequence= 0; int32_t amplifiatCount = 0; int32_t o1Hits; int32_t o2Hits; int32_t o1cHits; int32_t o2cHits; int32_t begin; int32_t length; SeqPtr apatseq=NULL; StackiPtr stktmp; int32_t i; int32_t j; int32_t posi; int32_t posj; int32_t erri; int32_t errj; while ((carg = getopt(argc, argv, "h1:2:l:L:e:")) != -1) { switch (carg) { /* -------------------- */ case '1': /* prenier oligo */ /* -------------------- */ oligo1 = ECOMALLOC(strlen(optarg)+1, "Error on oligo 1 allocation"); strcpy(oligo1,optarg); break; /* -------------------- */ case '2': /* coocurence option */ /* -------------------- */ oligo2 = ECOMALLOC(strlen(optarg)+1, "Error on oligo 1 allocation"); strcpy(oligo2,optarg); break; /* -------------------- */ case 'h': /* help */ /* -------------------- */ PrintHelp(); exit(0); break; /* -------------------- */ case 'l': /* lmin amplification */ /* -------------------- */ sscanf(optarg,"%d",&lmin); break; /* -------------------- */ case 'L': /* lmax amplification */ /* -------------------- */ sscanf(optarg,"%d",&lmax); break; /* -------------------- */ case 'e': /* error max */ /* -------------------- */ sscanf(optarg,"%d",&error_max); break; /* -------------------- */ case '?': /* bad option */ /* -------------------- */ errflag++; } } if ((argc -= optind) != 1) errflag++; if (!oligo1 || !oligo2) errflag++; if (errflag) ExitUsage(errflag); prefix = argv[optind]; o1 = buildPattern(oligo1,error_max); o2 = buildPattern(oligo2,error_max); o1c = complementPattern(o1); o2c = complementPattern(o2); printf("#\n"); printf("# ecoPCR version %s\n",VERSION); printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat); printf("# reverse strand oligo2 : %-32s ; oligo1c : %32s\n", o2->cpat,o1c->cpat); printf("# max error count by oligonucleotide : %d\n",error_max); printf("# database : %s\n",prefix); if (lmin && lmax) printf("# amplifiat length between [%d,%d] bp\n",lmin,lmax); else if (lmin) printf("# amplifiat length larger than %d bp\n",lmin); else if (lmax) printf("# amplifiat length smaller than %d bp\n",lmax); printf("#\n"); taxonomy = read_taxonomy(prefix); seq = ecoseq_iterator(prefix); checkedSequence = 0; positiveSequence= 0; amplifiatCount = 0; while(seq) { checkedSequence++; scname = taxonomy->taxons->taxon[seq->taxid].name; strncpy(head,seq->SQ,10); head[10]=0; strncpy(tail,seq->SQ+seq->SQ_length-10,10); tail[10]=0; apatseq=ecoseq2apatseq(seq,apatseq); o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen); o2cHits= 0; if (o1Hits) { stktmp = apatseq->hitpos[0]; begin = stktmp->val[0] + o1->patlen; if (lmax) length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen; else length= apatseq->seqlen - begin; o2cHits = ManberAll(apatseq,o2c,1,begin,length); if (o2cHits) for (i=0; i < o1Hits;i++) { posi = apatseq->hitpos[0]->val[i]; erri = apatseq->hiterr[0]->val[i]; for (j=0; j < o2cHits; j++) { posj =apatseq->hitpos[1]->val[j] + o2c->patlen; errj =apatseq->hiterr[1]->val[j]; length=posj - posi + 1 - o1->patlen - o2->patlen; if ((!lmin || (length >= lmin)) && (!lmax || (length <= lmax))) printRepeat(seq,o1,o2c,'D',posi,posj,erri,errj,taxonomy); //printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname); } } } o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen); o1cHits= 0; if (o2Hits) { stktmp = apatseq->hitpos[2]; begin = stktmp->val[0] + o2->patlen; if (lmax) length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen; else length= apatseq->seqlen - begin; o1cHits = ManberAll(apatseq,o1c,3,begin,length); if (o1cHits) for (i=0; i < o2Hits;i++) { posi = apatseq->hitpos[2]->val[i]; erri = apatseq->hiterr[2]->val[i]; for (j=0; j < o1cHits; j++) { posj=apatseq->hitpos[3]->val[j] + o1c->patlen; errj=apatseq->hiterr[3]->val[j]; length=posj - posi + 1 - o1->patlen - o2->patlen; if ((!lmin || (length >= lmin)) && (!lmax || (length <= lmax))) printRepeat(seq,o2,o1c,'R',posi,posj,erri,errj,taxonomy); //printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname); } } } delete_ecoseq(seq); seq = ecoseq_iterator(NULL); } return 0; }