From 9f4aa93249132072211373fb1c2c7aae096b4e99 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 14 Jun 2007 13:01:41 +0000 Subject: [PATCH] git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@63 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- src/ecogrep.c | 318 +++++++++++++++++++++++++++----------------------- 1 file changed, 171 insertions(+), 147 deletions(-) diff --git a/src/ecogrep.c b/src/ecogrep.c index de687ec..5752568 100644 --- a/src/ecogrep.c +++ b/src/ecogrep.c @@ -6,8 +6,41 @@ #include #include +typedef enum +{ + FALSE = 0, + TRUE = 1 +} BOOL; + #define VERSION "0.1" +void getLineContent(char *stream, ecoseq_t *seq){ + + int i; + char *buffer; + + for( i=0, buffer = strtok(stream,"|"); + buffer != NULL; + i++, buffer = strtok(NULL,"|")) + { + switch (i) { + case 0: + seq->AC = strdup(buffer); + break; + case 4: + sscanf(buffer,"%d",&seq->taxid); + break; + case 18: + seq->SQ = strdup(buffer); + seq->SQ_length = strlen(buffer); + break; + default: + break; + } + } +} + + void freememory(char **tab, int32_t num){ int32_t i; for (i=0;itaxid); - seq->AC = strdup(line[0]); - seq->SQ = strdup(line[18]); - seq->SQ_length = strlen(line[18]); - - apatseq=ecoseq2apatseq(seq,apatseq); - - for (i=0; i < numpattern ;i++){ - current_patt = buildPattern(pattern[i],error_max); - if(ManberAll(apatseq,current_patt,0,0,apatseq->seqlen)) - return 1; +int ispatternmatching(ecoseq_t *seq, PatternPtr pattern){ + if (pattern != NULL) + { + SeqPtr apatseq = NULL; + apatseq=ecoseq2apatseq(seq,apatseq); + return ManberAll(apatseq,pattern,0,0,apatseq->seqlen) > 0; } - return 0; -} - -/** - *returns the result on standard output - * @param line array containing sequence information - * @param i length of line - **/ -void printline(char **line, int32_t i){ - int32_t k=0; - for (k=0; k < i; k++) - printf("%s |",line[k]); - printf("\n\n"); + else return 0; } /* ----------------------------------------------- */ @@ -75,13 +83,13 @@ static void PrintHelp() PP "------------------------------------------\n"); PP " options:\n"); PP " -d : [D]atabase containing taxonomic information\n\n"); - PP " -f : [F]ile name : ecoPCR ouput file\n\n"); + PP " -p : [P]attern oligonucleotide pattern\n\n"); PP " -h : [H]elp - print help\n\n"); PP " -i : [I]gnore taxonomic id\n\n"); PP " -r : [R]estrict taxomic id\n\n"); PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n"); PP "------------------------------------------\n"); - PP " Pattern : oligonucleotide pattern\n"); + PP "ecoPCR ouput file name\n"); PP "------------------------------------------\n\n"); PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki\n"); PP "------------------------------------------\n\n"); @@ -98,7 +106,7 @@ static void PrintHelp() static void ExitUsage(stat) int stat; { - PP "usage: ecogrep [-d database] [-f filename] [-i taxid] [-r taxid] [-v] [-h] \n"); + PP "usage: ecogrep [-d database] [-p pattern] [-i taxid] [-r taxid] [-v] [-h] \n"); PP "type \"ecogrep -h\" for help\n"); if (stat) @@ -111,41 +119,44 @@ static void ExitUsage(stat) /* MAIN */ /* ----------------------------------------------- */ +#define LINE_BUFF_SIZE 10000 + int main(int argc, char **argv){ int32_t carg = 0; int32_t r = 0; // number of restricted taxid - int32_t g = 0; // number of ignored taxid + int32_t i = 0; // number of ignored taxid int32_t v = 0; // stores if -v mode is active - int32_t p = 0; // number of pattern - int32_t i = 0; - int32_t errflag = 0; + int32_t k = 0; // file counter + int32_t errflag = 0; int32_t error_max = 0; // stores the error rate allowed by the user int32_t matchingresult = 0; // stores number of matching result ecotaxonomy_t *taxonomy; // stores the taxonomy - + ecoseq_t *seq = NULL; // stores sequence info + + + char *p = NULL; // number of pattern char *database = NULL; // stores the database path (for taxonomy) - char **pattern = NULL; // stores the regex pattern - char *line[19] = {0}; // stores the line + PatternPtr pattern = NULL; // stores the build pattern int32_t *restricted_taxid = NULL; // stores the restricted taxid int32_t *ignored_taxid = NULL; // stores the ignored taxid - int32_t current_taxid; - + FILE *file = NULL; // stores the data stream, stdin by default - char *stream = ECOMALLOC(sizeof(char *)*10000,"error stream buffer allocation"); - char *buffer; + char *stream = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error stream buffer allocation"); + char *orig = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error orig buffer allocation"); + + int is_ignored = 0; + int is_included = 0; + int is_matching = 0; + + seq = new_ecoseq(); /** * Parse commande line options **/ - while ((carg = getopt(argc, argv, "f:d:i:r:e:vh")) != -1) { + while ((carg = getopt(argc, argv, "p:d:i:r:e:vh")) != -1) { switch (carg) { - case 'f': - if ( (file = fopen(optarg, "r")) == NULL) - errflag++; - break; - case 'd': database = ECOMALLOC(strlen(optarg)+1, "Error on datafile allocation"); @@ -154,10 +165,10 @@ int main(int argc, char **argv){ case 'i': ignored_taxid = ECOREALLOC( ignored_taxid, - sizeof(int32_t)*(g+1), + sizeof(int32_t)*(i+1), "Error on ignored_taxid reallocation"); - sscanf(optarg,"%d",&ignored_taxid[g]); - g++; + sscanf(optarg,"%d",&ignored_taxid[i]); + i++; break; case 'r': @@ -178,41 +189,33 @@ int main(int argc, char **argv){ break; case 'e': - sscanf(optarg,"%d",&error_max); - break; + sscanf(optarg,"%d",&error_max); + break; + + case 'p': + p = ECOMALLOC(strlen(optarg)+1, + "Error on pattern allocation"); + strcpy(p,optarg); + break; case '?': errflag++; } } - + /** - * Get the left-over command line arguments back - * and check the pattern is not more than 32 character long - */ - pattern = ECOMALLOC(sizeof *pattern * (argc - optind), "Error in pattern allocation"); - for (p=0 ; argc > optind ; optind++, p++){ - if (strlen(argv[optind]) <= 32) - pattern[p] = strdup(argv[optind]); - else - { - printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\ - \n# Please check it out : %s\n",argv[optind]); - exit(0); - } - } - - /** - * check standard input if no file name given in -f option + * Check pattern length and build it in PatternPtr format **/ - if (file == NULL) - { - if (isatty(fileno(stdin))) - errflag++; - else - file = stdin; - } - + if(p && strlen(p) > 32) + { + printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\ + \n# Please check it out : %s\n",p); + exit(EXIT_FAILURE); + } + else if (p) + if ( (pattern = buildPattern(p,error_max)) == NULL) + exit(EXIT_FAILURE); + /** * try to get the database name from environment variable * if no database name specified in the -d option @@ -228,9 +231,10 @@ int main(int argc, char **argv){ * check at leat one processing is asked * either patterns or taxid filters */ - if ( !p && restricted_taxid == NULL && ignored_taxid == NULL ) + if ( p == NULL && restricted_taxid == NULL && ignored_taxid == NULL ) + { errflag++; - + } if (errflag) ExitUsage(errflag); @@ -241,73 +245,93 @@ int main(int argc, char **argv){ /** * Parse the stream - */ - while( fgets(stream, 10000, file) != NULL ){ - - if (stream[0]!= '#') - { - for( i=0, buffer = strtok(stream,"|"); - buffer != NULL; - i++, buffer = strtok(NULL,"|")) + */ + for (k=0 ; argc >= optind ; optind++, k++){ + + matchingresult = 0; + + if ( (file = fopen(argv[optind], "r")) == NULL) + { + if (isatty(fileno(stdin)) == 0) { - printf("¤¤ %s\n",buffer); - line[i] = strdup(buffer); + file = stdin; + printf("# Processing standard input...\n"); } - - sscanf(line[4],"%d",¤t_taxid); - - if(!v) // normal mode - { - if ( (r > 0) && !(eco_is_taxid_included( taxonomy, - restricted_taxid, - r, - current_taxid)) - ) - continue; - - if ( (g > 0) && (eco_is_taxid_included( taxonomy, - ignored_taxid, - g, - current_taxid)) - ) - continue; - } - - else // v mode, invert ignore and restrict options - { - if ( (r > 0) && (eco_is_taxid_included( taxonomy, - restricted_taxid, - r, - current_taxid)) - ) - continue; - if ( (g > 0) && !(eco_is_taxid_included( taxonomy, - ignored_taxid, - g, - current_taxid)) - ) - continue; - } - - if ( p == 0 || (ispatternmatching(line,pattern,p,error_max))) - { - printline(line,i); - matchingresult++; - } - } + else + break; + } + else + printf("# Processing %s...\n",argv[optind]); + + while( fgets(stream, LINE_BUFF_SIZE, file) != NULL ){ + + if (stream[0]!= '#') + { + + stream[LINE_BUFF_SIZE-1]=0; + + strcpy(orig,stream); + + getLineContent(stream,seq); + + /* -----------------------------------------------*/ + /* ignored if : */ + /* - v mode and no ignored */ + /* OR */ + /* - at least one -i option used */ + /* AND */ + /* - -i option is parent of current taxid */ + /* -----------------------------------------------*/ + is_ignored = ( (v && i==0) || + ( (i > 0) && (eco_is_taxid_included( taxonomy, + ignored_taxid, + i, + seq->taxid)) + ) + ); + + /* -----------------------------------------------*/ + /* included if : */ + /* - normal mode and no restriction */ + /* OR */ + /* - is -r option is parent of current taxid */ + /* -----------------------------------------------*/ + is_included = ( (!v && (r == 0)) || (eco_is_taxid_included( taxonomy, + restricted_taxid, + r, + seq->taxid)) + ); + + /* -----------------------------------------------*/ + /* match if no pattern or if function return 1 */ + /* -----------------------------------------------*/ + is_matching = ( (!v && !p) || (ispatternmatching(seq,pattern))); + + if ( + (!v && (!is_ignored && is_included && is_matching) ) || + ( v && (is_ignored && !is_included && !is_matching) ) + ) + { + printf("%s",orig); + matchingresult++; + } + } + + + } + if ( file != stdin ) + fclose(file); + + printf("# %d matching result(s)\n#\n",matchingresult); } - - printf("# %d matching result\n",matchingresult); - + /** - * clean, close and free before leaving - **/ - if ( file != stdin ) - fclose(file); - freememory(line,i); - freememory(pattern,p); - ECOFREE(pattern,"Error in free pattern"); + * clean and free before leaving + **/ + ECOFREE(orig,"Error in free orig"); ECOFREE(stream,"Error in free stream"); + ECOFREE(ignored_taxid,"Error in free stream"); + ECOFREE(restricted_taxid,"Error in free stream"); return 0; } \ No newline at end of file