From a258d334b19b78ba2bad9eea452672ad2bd1ee2b Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 13 Oct 2017 18:54:17 +0200 Subject: [PATCH] Improved the fasta parser for better memory handling and better parsing of the last parts of fasta headers (definitions) --- libfasta/fasta_header_handler.c | 41 ++++++++++++++++------ libfasta/fasta_header_parser.c | 61 ++++++++++++++------------------- libfasta/fasta_header_parser.l | 53 ++++++++++++---------------- libfasta/fasta_seq_writer.c | 7 ++-- libfasta/header_mem_handler.c | 5 ++- 5 files changed, 83 insertions(+), 84 deletions(-) diff --git a/libfasta/fasta_header_handler.c b/libfasta/fasta_header_handler.c index f57d8c7..e3e1722 100644 --- a/libfasta/fasta_header_handler.c +++ b/libfasta/fasta_header_handler.c @@ -69,15 +69,34 @@ char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value) element_from_header* table_header_add_field(element_from_header* header, char* name, char* value) { int nbf; + int i, j; + element_from_header* new_header; + nbf = atoi(header[0].value); - nbf++; - header = (element_from_header*) realloc(header, (nbf+1)*sizeof(element_from_header)); - header[nbf].name = (char*) malloc((1+strlen(name))*sizeof(char)); - strcpy(header[nbf].name, name); - header[nbf].value = (char*) malloc((1+strlen(value))*sizeof(char)); - strcpy(header[nbf].value, value); - sprintf(header[0].value, "%d", nbf); - return(header); + new_header = (element_from_header*) realloc(header, ((nbf+1)*sizeof(element_from_header))); + + i=0; + while ((strcmp(new_header[i].name, "definition") != 0) && (i < nbf)) + i++; + + if (strcmp(new_header[i].name, "definition") == 0) + { + j = nbf-1; + while (strcmp(new_header[j].name, "definition") == 0) + { + new_header[j+1].name = new_header[j].name; + new_header[j+1].value = new_header[j].value; + j--; + } + } + + new_header[i].name = (char*) malloc((1+strlen(name))*sizeof(char)); + strcpy(new_header[i].name, name); + new_header[i].value = (char*) malloc((1+strlen(value))*sizeof(char)); + strcpy(new_header[i].value, value); + sprintf(new_header[0].value, "%d", nbf+1); + + return(new_header); } @@ -86,7 +105,7 @@ void free_header_table(element_from_header* header) int i; int nbf = atoi(header[0].value); - for (i = 0; i <= nbf; i++) + for (i = 0; i < nbf; i++) { free((header[i]).name); free((header[i]).value); @@ -101,7 +120,7 @@ char* getItemFromHeader(char* name, element_from_header* header) int nbf; int i; nbf = atoi(header[0].value); - for (i = 1; i <= nbf; i++) + for (i = 1; i < nbf; i++) { if (strcmp(header[i].name,name)==0) value = header[i].value; @@ -115,7 +134,7 @@ void changeValue(element_from_header* header, char* name, char* newValue) int i; int nbf = atoi(header[0].value); - for (i = 1; i <= nbf; i++) + for (i = 1; i < nbf; i++) { if (strcmp(header[i].name, name)==0) { diff --git a/libfasta/fasta_header_parser.c b/libfasta/fasta_header_parser.c index ad18723..6bf5221 100644 --- a/libfasta/fasta_header_parser.c +++ b/libfasta/fasta_header_parser.c @@ -798,7 +798,7 @@ YY_RULE_SETUP (*p_header)[*nbf].value = (char*) malloc(sizeof(char)*size_needed); strcpy(((*p_header)[*nbf]).value,header_yytext); - (*nbf)++; + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); } YY_BREAK case 3: @@ -820,7 +820,7 @@ YY_RULE_SETUP case 5: YY_RULE_SETUP #line 69 "fasta_header_parser.l" -{ +{ // TODO /*fprintf(stderr,"\n{SPACE} **%s**",header_yytext);*/ if (i != 0) field = store_in_field(field,header_yytext,&free_size,&i); @@ -886,24 +886,21 @@ case YY_STATE_EOF(REGVAL): #line 113 "fasta_header_parser.l" { field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); - p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + (*nbf)++; end_header_table(p_header, *nbf); - free(field); BEGIN(INITIAL); return 0; } YY_BREAK case YY_STATE_EOF(REGNAME): -#line 123 "fasta_header_parser.l" +#line 122 "fasta_header_parser.l" { - /*(*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19); - strcpy((*p_header)[*nbf].name,"other_informations"); + (*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19); + strcpy((*p_header)[*nbf].name,"definition"); field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); - p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); - */ - end_header_table(p_header, *nbf); - + (*nbf)++; + end_header_table(p_header, nbf); free(field); BEGIN(INITIAL); return 0; @@ -911,10 +908,10 @@ case YY_STATE_EOF(REGNAME): YY_BREAK case 12: YY_RULE_SETUP -#line 136 "fasta_header_parser.l" +#line 133 "fasta_header_parser.l" ECHO; YY_BREAK -#line 918 "" +#line 915 "" case YY_STATE_EOF(INITIAL): case YY_STATE_EOF(REGID): yyterminate(); @@ -1912,7 +1909,7 @@ void header_yyfree (void * ptr ) #define YYTABLES_NAME "yytables" -#line 136 "fasta_header_parser.l" +#line 133 "fasta_header_parser.l" @@ -1923,38 +1920,32 @@ int header_yywrap() element_from_header* header_parser_main(char *h) { - int nbfields,memory_allocated; + int nbfields, memory_allocated; element_from_header* header; char* nbfields_n; - char* nbfields_v; - - nbfields_n = (char*) malloc(9*sizeof(char)); - nbfields_v = (char*) malloc(5*sizeof(char)); + YY_BUFFER_STATE state; + + state=header_yy_scan_string(h); memory_allocated=MEMALLOCATED; + header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header)); + + nbfields_n = (char*) malloc(9*sizeof(char)); + strcpy(nbfields_n, "nbfields"); + header[0].name = nbfields_n; + + // Initialize memory to store the number of fields + header[0].value = (char*) malloc(10*sizeof(char)); + nbfields=1; - strcpy(nbfields_n, "nbfields"); - strcpy(nbfields_v, "1"); - - header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header)); - - header[0].name = nbfields_n; - header[0].value = nbfields_v; - - YY_BUFFER_STATE state; - - state=header_yy_scan_string(h); - header_parser(&nbfields, &memory_allocated, &header); - + header_yy_delete_buffer(state); - + return header; } - - diff --git a/libfasta/fasta_header_parser.l b/libfasta/fasta_header_parser.l index c270c64..bb4c5cd 100644 --- a/libfasta/fasta_header_parser.l +++ b/libfasta/fasta_header_parser.l @@ -53,7 +53,7 @@ EQUAL = (*p_header)[*nbf].value = (char*) malloc(sizeof(char)*size_needed); strcpy(((*p_header)[*nbf]).value,yytext); - (*nbf)++; + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); } @@ -66,7 +66,7 @@ EQUAL = field = store_in_field(field,yytext,&free_size,&i); } -{SPACE} { +{SPACE} { // TODO /*fprintf(stderr,"\n{SPACE} **%s**",yytext);*/ if (i != 0) field = store_in_field(field,yytext,&free_size,&i); @@ -112,22 +112,19 @@ EQUAL = <> { field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); - p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + (*nbf)++; end_header_table(p_header, *nbf); - free(field); BEGIN(INITIAL); return 0; } <> { - /*(*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19); - strcpy((*p_header)[*nbf].name,"other_informations"); + (*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19); + strcpy((*p_header)[*nbf].name,"definition"); field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); - p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); - */ - end_header_table(p_header, *nbf); - + (*nbf)++; + end_header_table(p_header, nbf); free(field); BEGIN(INITIAL); return 0; @@ -142,37 +139,31 @@ int header_yywrap() element_from_header* header_parser_main(char *h) { - int nbfields,memory_allocated; + int nbfields, memory_allocated; element_from_header* header; char* nbfields_n; - char* nbfields_v; - - nbfields_n = (char*) malloc(9*sizeof(char)); - nbfields_v = (char*) malloc(5*sizeof(char)); + YY_BUFFER_STATE state; + + state=yy_scan_string(h); memory_allocated=MEMALLOCATED; + header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header)); + + nbfields_n = (char*) malloc(9*sizeof(char)); + strcpy(nbfields_n, "nbfields"); + header[0].name = nbfields_n; + + // Initialize memory to store the number of fields + header[0].value = (char*) malloc(10*sizeof(char)); + nbfields=1; - strcpy(nbfields_n, "nbfields"); - strcpy(nbfields_v, "1"); - - header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header)); - - header[0].name = nbfields_n; - header[0].value = nbfields_v; - - YY_BUFFER_STATE state; - - state=yy_scan_string(h); - header_parser(&nbfields, &memory_allocated, &header); - + yy_delete_buffer(state); - + return header; } - - diff --git a/libfasta/fasta_seq_writer.c b/libfasta/fasta_seq_writer.c index ccee773..5019244 100644 --- a/libfasta/fasta_seq_writer.c +++ b/libfasta/fasta_seq_writer.c @@ -52,7 +52,7 @@ void printOnlyHeaderFromTable(element_from_header* header, FILE* output) fprintf(output,">%s ",header[1].value); - for (i = 2; i <= nbf; i++) + for (i = 2; i < nbf; i++) { if (strcmp(header[i].name, "definition") != 0) { @@ -60,11 +60,10 @@ void printOnlyHeaderFromTable(element_from_header* header, FILE* output) fprintf(output,"="); fprintf(output,"%s; ",header[i].value); } + else if (strcmp(header[i].name, "definition") == 0) + fprintf(output,"%s ", header[i].value); } - if (strcmp(header[nbf].name, "definition") == 0) - fprintf(output,"%s; ",header[nbf].value); - fprintf(output,"\n"); } diff --git a/libfasta/header_mem_handler.c b/libfasta/header_mem_handler.c index a0b8e7c..3619c88 100644 --- a/libfasta/header_mem_handler.c +++ b/libfasta/header_mem_handler.c @@ -76,7 +76,7 @@ element_from_header** check_and_realloc_mem_in_header_table(element_from_header* { (*nbf)++; - if (*nbf == *memory_allocated) + if ((*nbf)+1 == *memory_allocated) { (*memory_allocated)++; *p_header = (element_from_header*) realloc(*p_header, (*memory_allocated) * sizeof(element_from_header)); @@ -87,7 +87,6 @@ element_from_header** check_and_realloc_mem_in_header_table(element_from_header* void end_header_table(element_from_header** p_header, int nbf) { - nbf = nbf - 1; - //fprintf(stderr, "nbf = %d", nbf); + *p_header = (element_from_header*) realloc(*p_header, nbf * sizeof(element_from_header)); sprintf((*p_header)->value, "%d", nbf); }