313 lines
8.8 KiB
C
313 lines
8.8 KiB
C
#include "libecoPCR/ecoPCR.h"
|
||
#include <stdio.h>
|
||
#include <string.h>
|
||
#include <stdlib.h>
|
||
#include <getopt.h>
|
||
#include <stdlib.h>
|
||
#include <sys/stat.h>
|
||
|
||
#define VERSION "0.1"
|
||
|
||
void freememory(char **tab, int32_t num){
|
||
int32_t i;
|
||
for (i=0;i<num-1;i++){
|
||
ECOFREE(tab[i],"Error in freememory function");
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Check if a pattern match a string using mamberall libapat function
|
||
* @param line array containing sequence information
|
||
* @param pattern array containing patterns to test on the sequence
|
||
* @param numpattern number of pattern in pattern array
|
||
* @param error_max error rate allowed by the user
|
||
*
|
||
* @return int 1 if a pattern match, else 0
|
||
**/
|
||
int ispatternmatching(char **line, char **pattern, int32_t numpattern, int32_t error_max){
|
||
int i;
|
||
SeqPtr apatseq = NULL;
|
||
ecoseq_t *seq = NULL;
|
||
PatternPtr current_patt;
|
||
|
||
seq = new_ecoseq();
|
||
|
||
sscanf(line[4],"%d",&seq->taxid);
|
||
seq->AC = strdup(line[0]);
|
||
seq->SQ = strdup(line[18]);
|
||
seq->SQ_length = strlen(line[18]);
|
||
|
||
apatseq=ecoseq2apatseq(seq,apatseq);
|
||
|
||
for (i=0; i < numpattern ;i++){
|
||
current_patt = buildPattern(pattern[i],error_max);
|
||
if(ManberAll(apatseq,current_patt,0,0,apatseq->seqlen))
|
||
return 1;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
/**
|
||
*returns the result on standard output
|
||
* @param line array containing sequence information
|
||
* @param i length of line
|
||
**/
|
||
void printline(char **line, int32_t i){
|
||
int32_t k=0;
|
||
for (k=0; k < i; k++)
|
||
printf("%s |",line[k]);
|
||
printf("\n\n");
|
||
}
|
||
|
||
/* ----------------------------------------------- */
|
||
/* printout help */
|
||
/* ----------------------------------------------- */
|
||
#define PP fprintf(stdout,
|
||
|
||
static void PrintHelp()
|
||
{
|
||
PP "\n------------------------------------------\n");
|
||
PP " ecogrep Version %s\n", VERSION);
|
||
PP "------------------------------------------\n");
|
||
PP " synopsis : filtering ecoPCR result based on\n");
|
||
PP " taxonomic id filter and regular expression pattern\n");
|
||
PP " usage: ecogrep [options] database\n");
|
||
PP "------------------------------------------\n");
|
||
PP " options:\n");
|
||
PP " -d : [D]atabase containing taxonomic information\n\n");
|
||
PP " -f : [F]ile name : ecoPCR ouput file\n\n");
|
||
PP " -h : [H]elp - print <this> help\n\n");
|
||
PP " -i : [I]gnore taxonomic id\n\n");
|
||
PP " -r : [R]estrict taxomic id\n\n");
|
||
PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n");
|
||
PP "------------------------------------------\n");
|
||
PP " Pattern : oligonucleotide pattern\n");
|
||
PP "------------------------------------------\n\n");
|
||
PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki\n");
|
||
PP "------------------------------------------\n\n");
|
||
}
|
||
|
||
#undef PP
|
||
|
||
/* ----------------------------------------------- */
|
||
/* printout usage and exit */
|
||
/* ----------------------------------------------- */
|
||
|
||
#define PP fprintf(stderr,
|
||
|
||
static void ExitUsage(stat)
|
||
int stat;
|
||
{
|
||
PP "usage: ecogrep [-d database] [-f filename] [-i taxid] [-r taxid] [-v] [-h] <pattern>\n");
|
||
PP "type \"ecogrep -h\" for help\n");
|
||
|
||
if (stat)
|
||
exit(stat);
|
||
}
|
||
|
||
#undef PP
|
||
|
||
/* ----------------------------------------------- */
|
||
/* MAIN */
|
||
/* ----------------------------------------------- */
|
||
|
||
int main(int argc, char **argv){
|
||
int32_t carg = 0;
|
||
int32_t r = 0; // number of restricted taxid
|
||
int32_t g = 0; // number of ignored taxid
|
||
int32_t v = 0; // stores if -v mode is active
|
||
int32_t p = 0; // number of pattern
|
||
int32_t i = 0;
|
||
int32_t errflag = 0;
|
||
int32_t error_max = 0; // stores the error rate allowed by the user
|
||
int32_t matchingresult = 0; // stores number of matching result
|
||
|
||
ecotaxonomy_t *taxonomy; // stores the taxonomy
|
||
|
||
char *database = NULL; // stores the database path (for taxonomy)
|
||
char **pattern = NULL; // stores the regex pattern
|
||
char *line[19] = {0}; // stores the line
|
||
int32_t *restricted_taxid = NULL; // stores the restricted taxid
|
||
int32_t *ignored_taxid = NULL; // stores the ignored taxid
|
||
int32_t current_taxid;
|
||
|
||
FILE *file = NULL; // stores the data stream, stdin by default
|
||
char *stream = ECOMALLOC(sizeof(char *)*10000,"error stream buffer allocation");
|
||
char *buffer;
|
||
|
||
/**
|
||
* Parse commande line options
|
||
**/
|
||
while ((carg = getopt(argc, argv, "f:d:i:r:e:vh")) != -1) {
|
||
|
||
switch (carg) {
|
||
case 'f':
|
||
if ( (file = fopen(optarg, "r")) == NULL)
|
||
errflag++;
|
||
break;
|
||
|
||
case 'd':
|
||
database = ECOMALLOC(strlen(optarg)+1,
|
||
"Error on datafile allocation");
|
||
strcpy(database,optarg);
|
||
break;
|
||
|
||
case 'i':
|
||
ignored_taxid = ECOREALLOC( ignored_taxid,
|
||
sizeof(int32_t)*(g+1),
|
||
"Error on ignored_taxid reallocation");
|
||
sscanf(optarg,"%d",&ignored_taxid[g]);
|
||
g++;
|
||
break;
|
||
|
||
case 'r':
|
||
restricted_taxid = ECOREALLOC( restricted_taxid,
|
||
sizeof(int32_t)*(r+1),
|
||
"Error on restricted_taxid reallocation");
|
||
sscanf(optarg,"%d",&restricted_taxid[r]);
|
||
r++;
|
||
break;
|
||
|
||
case 'v':
|
||
v = 1;
|
||
break;
|
||
|
||
case 'h':
|
||
PrintHelp();
|
||
exit(0);
|
||
break;
|
||
|
||
case 'e':
|
||
sscanf(optarg,"%d",&error_max);
|
||
break;
|
||
|
||
case '?':
|
||
errflag++;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Get the left-over command line arguments back
|
||
* and check the pattern is not more than 32 character long
|
||
*/
|
||
pattern = ECOMALLOC(sizeof *pattern * (argc - optind), "Error in pattern allocation");
|
||
for (p=0 ; argc > optind ; optind++, p++){
|
||
if (strlen(argv[optind]) <= 32)
|
||
pattern[p] = strdup(argv[optind]);
|
||
else
|
||
{
|
||
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
|
||
\n# Please check it out : %s\n",argv[optind]);
|
||
exit(0);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* check standard input if no file name given in -f option
|
||
**/
|
||
if (file == NULL)
|
||
{
|
||
if (isatty(fileno(stdin)))
|
||
errflag++;
|
||
else
|
||
file = stdin;
|
||
}
|
||
|
||
/**
|
||
* try to get the database name from environment variable
|
||
* if no database name specified in the -d option
|
||
**/
|
||
if (database == NULL)
|
||
{
|
||
database = getenv("ECOPCRDB");
|
||
if (database == NULL)
|
||
errflag++;
|
||
}
|
||
|
||
/**
|
||
* check at leat one processing is asked
|
||
* either patterns or taxid filters
|
||
*/
|
||
if ( !p && restricted_taxid == NULL && ignored_taxid == NULL )
|
||
errflag++;
|
||
|
||
if (errflag)
|
||
ExitUsage(errflag);
|
||
|
||
/**
|
||
* Get the taxonomy back
|
||
*/
|
||
taxonomy = read_taxonomy(database);
|
||
|
||
/**
|
||
* Parse the stream
|
||
*/
|
||
while( fgets(stream, 10000, file) != NULL ){
|
||
|
||
if (stream[0]!= '#')
|
||
{
|
||
for( i=0, buffer = strtok(stream,"|");
|
||
buffer != NULL;
|
||
i++, buffer = strtok(NULL,"|"))
|
||
{
|
||
printf("<EFBFBD><EFBFBD> %s\n",buffer);
|
||
line[i] = strdup(buffer);
|
||
}
|
||
|
||
sscanf(line[4],"%d",¤t_taxid);
|
||
|
||
if(!v) // normal mode
|
||
{
|
||
if ( (r > 0) && !(eco_is_taxid_included( taxonomy,
|
||
restricted_taxid,
|
||
r,
|
||
current_taxid))
|
||
)
|
||
continue;
|
||
|
||
if ( (g > 0) && (eco_is_taxid_included( taxonomy,
|
||
ignored_taxid,
|
||
g,
|
||
current_taxid))
|
||
)
|
||
continue;
|
||
}
|
||
|
||
else // v mode, invert ignore and restrict options
|
||
{
|
||
if ( (r > 0) && (eco_is_taxid_included( taxonomy,
|
||
restricted_taxid,
|
||
r,
|
||
current_taxid))
|
||
)
|
||
continue;
|
||
if ( (g > 0) && !(eco_is_taxid_included( taxonomy,
|
||
ignored_taxid,
|
||
g,
|
||
current_taxid))
|
||
)
|
||
continue;
|
||
}
|
||
|
||
if ( p == 0 || (ispatternmatching(line,pattern,p,error_max)))
|
||
{
|
||
printline(line,i);
|
||
matchingresult++;
|
||
}
|
||
}
|
||
}
|
||
|
||
printf("# %d matching result\n",matchingresult);
|
||
|
||
/**
|
||
* clean, close and free before leaving
|
||
**/
|
||
if ( file != stdin )
|
||
fclose(file);
|
||
freememory(line,i);
|
||
freememory(pattern,p);
|
||
ECOFREE(pattern,"Error in free pattern");
|
||
ECOFREE(stream,"Error in free stream");
|
||
|
||
return 0;
|
||
} |