git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@63 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
290
src/ecogrep.c
290
src/ecogrep.c
@ -6,8 +6,41 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
/**
 * Two-valued truth type: FALSE is 0 and TRUE is 1, so the constants can be
 * used directly in conditions and compared with the result of C relational
 * operators.
 **/
typedef enum
{
	FALSE = 0,
	TRUE  = 1
} BOOL;
|
||||||
|
|
||||||
#define VERSION "0.1"
|
#define VERSION "0.1"
|
||||||
|
|
||||||
|
/**
 * Extracts the fields ecogrep needs from one '|'-separated line.
 *
 *   - column 0  is duplicated into seq->AC
 *   - column 4  is parsed as a decimal integer into seq->taxid
 *   - column 18 is duplicated into seq->SQ and its length is stored in
 *               seq->SQ_length
 *
 * Every other column is ignored. Columns are counted positionally: an empty
 * column (two adjacent '|') still counts as a column, so the indexes above
 * always designate the same field of the line. strtok() is deliberately not
 * used because it merges runs of delimiters, which would shift the indexes,
 * and because it keeps hidden static state between calls.
 *
 * NOTE(review): the strings stored in seq->AC and seq->SQ are allocated with
 * strdup() and any value previously held there is overwritten without being
 * released. If the same seq is reused for several lines the caller presumably
 * has to free them between calls - confirm how seq is initialised before
 * adding a free() here.
 *
 * @param stream	NUL-terminated line to parse; it is modified in place
 * 			(each '|' separator is overwritten with '\0')
 * @param seq		record receiving the extracted values
 *
 * Nothing is done when stream or seq is NULL, or when stream is empty.
 **/
void getLineContent(char *stream, ecoseq_t *seq){

	int   i;
	char *buffer;
	char *next;

	if (stream == NULL || seq == NULL || *stream == '\0')
		return;

	for (i = 0, buffer = stream; buffer != NULL; i++, buffer = next)
	{
		/* Cut the current column at the next separator, if there is one. */
		next = strchr(buffer, '|');
		if (next != NULL)
			*next++ = '\0';

		switch (i) {
			case 0:
				seq->AC = strdup(buffer);
				break;
			case 4:
				/* On a malformed field sscanf stores nothing and
				 * seq->taxid keeps whatever value it had before. */
				sscanf(buffer,"%d",&seq->taxid);
				break;
			case 18:
				seq->SQ = strdup(buffer);
				seq->SQ_length = strlen(buffer);
				break;
			default:
				break;
		}
	}
}
|
||||||
|
|
||||||
|
|
||||||
void freememory(char **tab, int32_t num){
|
void freememory(char **tab, int32_t num){
|
||||||
int32_t i;
|
int32_t i;
|
||||||
for (i=0;i<num-1;i++){
|
for (i=0;i<num-1;i++){
|
||||||
@ -24,39 +57,14 @@ void freememory(char **tab, int32_t num){
|
|||||||
*
|
*
|
||||||
* @return int 1 if a pattern match, else 0
|
* @return int 1 if a pattern match, else 0
|
||||||
**/
|
**/
|
||||||
int ispatternmatching(char **line, char **pattern, int32_t numpattern, int32_t error_max){
|
int ispatternmatching(ecoseq_t *seq, PatternPtr pattern){
|
||||||
int i;
|
if (pattern != NULL)
|
||||||
SeqPtr apatseq = NULL;
|
{
|
||||||
ecoseq_t *seq = NULL;
|
SeqPtr apatseq = NULL;
|
||||||
PatternPtr current_patt;
|
apatseq=ecoseq2apatseq(seq,apatseq);
|
||||||
|
return ManberAll(apatseq,pattern,0,0,apatseq->seqlen) > 0;
|
||||||
seq = new_ecoseq();
|
|
||||||
|
|
||||||
sscanf(line[4],"%d",&seq->taxid);
|
|
||||||
seq->AC = strdup(line[0]);
|
|
||||||
seq->SQ = strdup(line[18]);
|
|
||||||
seq->SQ_length = strlen(line[18]);
|
|
||||||
|
|
||||||
apatseq=ecoseq2apatseq(seq,apatseq);
|
|
||||||
|
|
||||||
for (i=0; i < numpattern ;i++){
|
|
||||||
current_patt = buildPattern(pattern[i],error_max);
|
|
||||||
if(ManberAll(apatseq,current_patt,0,0,apatseq->seqlen))
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
return 0;
|
else return 0;
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*returns the result on standard output
|
|
||||||
* @param line array containing sequence information
|
|
||||||
* @param i length of line
|
|
||||||
**/
|
|
||||||
void printline(char **line, int32_t i){
|
|
||||||
int32_t k=0;
|
|
||||||
for (k=0; k < i; k++)
|
|
||||||
printf("%s |",line[k]);
|
|
||||||
printf("\n\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------- */
|
/* ----------------------------------------------- */
|
||||||
@ -75,13 +83,13 @@ static void PrintHelp()
|
|||||||
PP "------------------------------------------\n");
|
PP "------------------------------------------\n");
|
||||||
PP " options:\n");
|
PP " options:\n");
|
||||||
PP " -d : [D]atabase containing taxonomic information\n\n");
|
PP " -d : [D]atabase containing taxonomic information\n\n");
|
||||||
PP " -f : [F]ile name : ecoPCR ouput file\n\n");
|
PP " -p : [P]attern oligonucleotide pattern\n\n");
|
||||||
PP " -h : [H]elp - print <this> help\n\n");
|
PP " -h : [H]elp - print <this> help\n\n");
|
||||||
PP " -i : [I]gnore taxonomic id\n\n");
|
PP " -i : [I]gnore taxonomic id\n\n");
|
||||||
PP " -r : [R]estrict taxomic id\n\n");
|
PP " -r : [R]estrict taxomic id\n\n");
|
||||||
PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n");
|
PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n");
|
||||||
PP "------------------------------------------\n");
|
PP "------------------------------------------\n");
|
||||||
PP " Pattern : oligonucleotide pattern\n");
|
PP "ecoPCR ouput file name\n");
|
||||||
PP "------------------------------------------\n\n");
|
PP "------------------------------------------\n\n");
|
||||||
PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki\n");
|
PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki\n");
|
||||||
PP "------------------------------------------\n\n");
|
PP "------------------------------------------\n\n");
|
||||||
@ -98,7 +106,7 @@ static void PrintHelp()
|
|||||||
static void ExitUsage(stat)
|
static void ExitUsage(stat)
|
||||||
int stat;
|
int stat;
|
||||||
{
|
{
|
||||||
PP "usage: ecogrep [-d database] [-f filename] [-i taxid] [-r taxid] [-v] [-h] <pattern>\n");
|
PP "usage: ecogrep [-d database] [-p pattern] [-i taxid] [-r taxid] [-v] [-h] <file name>\n");
|
||||||
PP "type \"ecogrep -h\" for help\n");
|
PP "type \"ecogrep -h\" for help\n");
|
||||||
|
|
||||||
if (stat)
|
if (stat)
|
||||||
@ -111,41 +119,44 @@ static void ExitUsage(stat)
|
|||||||
/* MAIN */
|
/* MAIN */
|
||||||
/* ----------------------------------------------- */
|
/* ----------------------------------------------- */
|
||||||
|
|
||||||
|
#define LINE_BUFF_SIZE 10000
|
||||||
|
|
||||||
int main(int argc, char **argv){
|
int main(int argc, char **argv){
|
||||||
int32_t carg = 0;
|
int32_t carg = 0;
|
||||||
int32_t r = 0; // number of restricted taxid
|
int32_t r = 0; // number of restricted taxid
|
||||||
int32_t g = 0; // number of ignored taxid
|
int32_t i = 0; // number of ignored taxid
|
||||||
int32_t v = 0; // stores if -v mode is active
|
int32_t v = 0; // stores if -v mode is active
|
||||||
int32_t p = 0; // number of pattern
|
int32_t k = 0; // file counter
|
||||||
int32_t i = 0;
|
|
||||||
int32_t errflag = 0;
|
int32_t errflag = 0;
|
||||||
int32_t error_max = 0; // stores the error rate allowed by the user
|
int32_t error_max = 0; // stores the error rate allowed by the user
|
||||||
int32_t matchingresult = 0; // stores number of matching result
|
int32_t matchingresult = 0; // stores number of matching result
|
||||||
|
|
||||||
ecotaxonomy_t *taxonomy; // stores the taxonomy
|
ecotaxonomy_t *taxonomy; // stores the taxonomy
|
||||||
|
ecoseq_t *seq = NULL; // stores sequence info
|
||||||
|
|
||||||
|
|
||||||
|
char *p = NULL; // number of pattern
|
||||||
char *database = NULL; // stores the database path (for taxonomy)
|
char *database = NULL; // stores the database path (for taxonomy)
|
||||||
char **pattern = NULL; // stores the regex pattern
|
PatternPtr pattern = NULL; // stores the build pattern
|
||||||
char *line[19] = {0}; // stores the line
|
|
||||||
int32_t *restricted_taxid = NULL; // stores the restricted taxid
|
int32_t *restricted_taxid = NULL; // stores the restricted taxid
|
||||||
int32_t *ignored_taxid = NULL; // stores the ignored taxid
|
int32_t *ignored_taxid = NULL; // stores the ignored taxid
|
||||||
int32_t current_taxid;
|
|
||||||
|
|
||||||
FILE *file = NULL; // stores the data stream, stdin by default
|
FILE *file = NULL; // stores the data stream, stdin by default
|
||||||
char *stream = ECOMALLOC(sizeof(char *)*10000,"error stream buffer allocation");
|
char *stream = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error stream buffer allocation");
|
||||||
char *buffer;
|
char *orig = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error orig buffer allocation");
|
||||||
|
|
||||||
|
int is_ignored = 0;
|
||||||
|
int is_included = 0;
|
||||||
|
int is_matching = 0;
|
||||||
|
|
||||||
|
seq = new_ecoseq();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse commande line options
|
* Parse commande line options
|
||||||
**/
|
**/
|
||||||
while ((carg = getopt(argc, argv, "f:d:i:r:e:vh")) != -1) {
|
while ((carg = getopt(argc, argv, "p:d:i:r:e:vh")) != -1) {
|
||||||
|
|
||||||
switch (carg) {
|
switch (carg) {
|
||||||
case 'f':
|
|
||||||
if ( (file = fopen(optarg, "r")) == NULL)
|
|
||||||
errflag++;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 'd':
|
case 'd':
|
||||||
database = ECOMALLOC(strlen(optarg)+1,
|
database = ECOMALLOC(strlen(optarg)+1,
|
||||||
"Error on datafile allocation");
|
"Error on datafile allocation");
|
||||||
@ -154,10 +165,10 @@ int main(int argc, char **argv){
|
|||||||
|
|
||||||
case 'i':
|
case 'i':
|
||||||
ignored_taxid = ECOREALLOC( ignored_taxid,
|
ignored_taxid = ECOREALLOC( ignored_taxid,
|
||||||
sizeof(int32_t)*(g+1),
|
sizeof(int32_t)*(i+1),
|
||||||
"Error on ignored_taxid reallocation");
|
"Error on ignored_taxid reallocation");
|
||||||
sscanf(optarg,"%d",&ignored_taxid[g]);
|
sscanf(optarg,"%d",&ignored_taxid[i]);
|
||||||
g++;
|
i++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'r':
|
case 'r':
|
||||||
@ -178,8 +189,14 @@ int main(int argc, char **argv){
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case 'e':
|
case 'e':
|
||||||
sscanf(optarg,"%d",&error_max);
|
sscanf(optarg,"%d",&error_max);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 'p':
|
||||||
|
p = ECOMALLOC(strlen(optarg)+1,
|
||||||
|
"Error on pattern allocation");
|
||||||
|
strcpy(p,optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
case '?':
|
case '?':
|
||||||
errflag++;
|
errflag++;
|
||||||
@ -187,31 +204,17 @@ int main(int argc, char **argv){
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the left-over command line arguments back
|
* Check pattern length and build it in PatternPtr format
|
||||||
* and check the pattern is not more than 32 character long
|
|
||||||
*/
|
|
||||||
pattern = ECOMALLOC(sizeof *pattern * (argc - optind), "Error in pattern allocation");
|
|
||||||
for (p=0 ; argc > optind ; optind++, p++){
|
|
||||||
if (strlen(argv[optind]) <= 32)
|
|
||||||
pattern[p] = strdup(argv[optind]);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
|
|
||||||
\n# Please check it out : %s\n",argv[optind]);
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* check standard input if no file name given in -f option
|
|
||||||
**/
|
**/
|
||||||
if (file == NULL)
|
if(p && strlen(p) > 32)
|
||||||
{
|
{
|
||||||
if (isatty(fileno(stdin)))
|
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
|
||||||
errflag++;
|
\n# Please check it out : %s\n",p);
|
||||||
else
|
exit(EXIT_FAILURE);
|
||||||
file = stdin;
|
}
|
||||||
}
|
else if (p)
|
||||||
|
if ( (pattern = buildPattern(p,error_max)) == NULL)
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* try to get the database name from environment variable
|
* try to get the database name from environment variable
|
||||||
@ -228,9 +231,10 @@ int main(int argc, char **argv){
|
|||||||
* check at leat one processing is asked
|
* check at leat one processing is asked
|
||||||
* either patterns or taxid filters
|
* either patterns or taxid filters
|
||||||
*/
|
*/
|
||||||
if ( !p && restricted_taxid == NULL && ignored_taxid == NULL )
|
if ( p == NULL && restricted_taxid == NULL && ignored_taxid == NULL )
|
||||||
|
{
|
||||||
errflag++;
|
errflag++;
|
||||||
|
}
|
||||||
if (errflag)
|
if (errflag)
|
||||||
ExitUsage(errflag);
|
ExitUsage(errflag);
|
||||||
|
|
||||||
@ -242,72 +246,92 @@ int main(int argc, char **argv){
|
|||||||
/**
|
/**
|
||||||
* Parse the stream
|
* Parse the stream
|
||||||
*/
|
*/
|
||||||
while( fgets(stream, 10000, file) != NULL ){
|
for (k=0 ; argc >= optind ; optind++, k++){
|
||||||
|
|
||||||
if (stream[0]!= '#')
|
matchingresult = 0;
|
||||||
{
|
|
||||||
for( i=0, buffer = strtok(stream,"|");
|
if ( (file = fopen(argv[optind], "r")) == NULL)
|
||||||
buffer != NULL;
|
{
|
||||||
i++, buffer = strtok(NULL,"|"))
|
if (isatty(fileno(stdin)) == 0)
|
||||||
{
|
{
|
||||||
printf("<EFBFBD><EFBFBD> %s\n",buffer);
|
file = stdin;
|
||||||
line[i] = strdup(buffer);
|
printf("# Processing standard input...\n");
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
printf("# Processing %s...\n",argv[optind]);
|
||||||
|
|
||||||
sscanf(line[4],"%d",&current_taxid);
|
while( fgets(stream, LINE_BUFF_SIZE, file) != NULL ){
|
||||||
|
|
||||||
if(!v) // normal mode
|
if (stream[0]!= '#')
|
||||||
{
|
{
|
||||||
if ( (r > 0) && !(eco_is_taxid_included( taxonomy,
|
|
||||||
restricted_taxid,
|
|
||||||
r,
|
|
||||||
current_taxid))
|
|
||||||
)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if ( (g > 0) && (eco_is_taxid_included( taxonomy,
|
stream[LINE_BUFF_SIZE-1]=0;
|
||||||
ignored_taxid,
|
|
||||||
g,
|
|
||||||
current_taxid))
|
|
||||||
)
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
else // v mode, invert ignore and restrict options
|
strcpy(orig,stream);
|
||||||
{
|
|
||||||
if ( (r > 0) && (eco_is_taxid_included( taxonomy,
|
|
||||||
restricted_taxid,
|
|
||||||
r,
|
|
||||||
current_taxid))
|
|
||||||
)
|
|
||||||
continue;
|
|
||||||
if ( (g > 0) && !(eco_is_taxid_included( taxonomy,
|
|
||||||
ignored_taxid,
|
|
||||||
g,
|
|
||||||
current_taxid))
|
|
||||||
)
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( p == 0 || (ispatternmatching(line,pattern,p,error_max)))
|
getLineContent(stream,seq);
|
||||||
{
|
|
||||||
printline(line,i);
|
/* -----------------------------------------------*/
|
||||||
matchingresult++;
|
/* ignored if : */
|
||||||
}
|
/* - v mode and no ignored */
|
||||||
}
|
/* OR */
|
||||||
|
/* - at least one -i option used */
|
||||||
|
/* AND */
|
||||||
|
/* - -i option is parent of current taxid */
|
||||||
|
/* -----------------------------------------------*/
|
||||||
|
is_ignored = ( (v && i==0) ||
|
||||||
|
( (i > 0) && (eco_is_taxid_included( taxonomy,
|
||||||
|
ignored_taxid,
|
||||||
|
i,
|
||||||
|
seq->taxid))
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
/* -----------------------------------------------*/
|
||||||
|
/* included if : */
|
||||||
|
/* - normal mode and no restriction */
|
||||||
|
/* OR */
|
||||||
|
/* - is -r option is parent of current taxid */
|
||||||
|
/* -----------------------------------------------*/
|
||||||
|
is_included = ( (!v && (r == 0)) || (eco_is_taxid_included( taxonomy,
|
||||||
|
restricted_taxid,
|
||||||
|
r,
|
||||||
|
seq->taxid))
|
||||||
|
);
|
||||||
|
|
||||||
|
/* -----------------------------------------------*/
|
||||||
|
/* match if no pattern or if function return 1 */
|
||||||
|
/* -----------------------------------------------*/
|
||||||
|
is_matching = ( (!v && !p) || (ispatternmatching(seq,pattern)));
|
||||||
|
|
||||||
|
if (
|
||||||
|
(!v && (!is_ignored && is_included && is_matching) ) ||
|
||||||
|
( v && (is_ignored && !is_included && !is_matching) )
|
||||||
|
)
|
||||||
|
{
|
||||||
|
printf("%s",orig);
|
||||||
|
matchingresult++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
if ( file != stdin )
|
||||||
|
fclose(file);
|
||||||
|
|
||||||
|
printf("# %d matching result(s)\n#\n",matchingresult);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("# %d matching result\n",matchingresult);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* clean, close and free before leaving
|
* clean and free before leaving
|
||||||
**/
|
**/
|
||||||
if ( file != stdin )
|
ECOFREE(orig,"Error in free orig");
|
||||||
fclose(file);
|
|
||||||
freememory(line,i);
|
|
||||||
freememory(pattern,p);
|
|
||||||
ECOFREE(pattern,"Error in free pattern");
|
|
||||||
ECOFREE(stream,"Error in free stream");
|
ECOFREE(stream,"Error in free stream");
|
||||||
|
ECOFREE(ignored_taxid,"Error in free stream");
|
||||||
|
ECOFREE(restricted_taxid,"Error in free stream");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
Reference in New Issue
Block a user