sumalibs first commit
This commit is contained in:
BIN
libfasta/.DS_Store
vendored
Normal file
BIN
libfasta/.DS_Store
vendored
Normal file
Binary file not shown.
33
libfasta/Makefile
Normal file
33
libfasta/Makefile
Normal file
@ -0,0 +1,33 @@
|
||||
|
||||
# Sources compiled into the static libfasta archive.
SOURCES = fasta_header_parser.c \
          fasta_seq_writer.c \
          fasta_header_handler.c \
          header_mem_handler.c \
          sequence.c

SRCS = $(SOURCES)

OBJECTS = $(patsubst %.c,%.o,$(SOURCES))

LIBFILE = libfasta.a
RANLIB  = ranlib

include ../global.mk

# 'all' and 'clean' name actions, not files: mark them phony so a stray file
# with the same name can never make them look up to date.
.PHONY: all clean

all: $(LIBFILE)

# Regenerate the header scanner from its flex grammar (symbols prefixed header_yy).
fasta_header_parser.c: fasta_header_parser.l
	flex -Pheader_yy -t $< > $@

dic_parser.c: dic_parser.l
	lex -Phashtable_yy -t $< > $@

clean:
	rm -rf $(OBJECTS) $(LIBFILE)
	rm -f *.a

# $? lists only the objects newer than the archive, so only those are replaced.
$(LIBFILE): $(OBJECTS)
	ar -cr $@ $?
	$(RANLIB) $@
|
126
libfasta/fasta_header_handler.c
Normal file
126
libfasta/fasta_header_handler.c
Normal file
@ -0,0 +1,126 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sequence.h"
|
||||
#include "fasta_header_parser.h"
|
||||
#include "fasta_header_handler.h"
|
||||
|
||||
|
||||
/*
 * Appends "; name=value" to a heap-allocated header string.
 *
 * header: NUL-terminated string obtained from malloc/realloc; it is resized
 *         in place, so the caller must use the returned pointer afterwards.
 * name, value: NUL-terminated strings copied into the header.
 *
 * If the header ends with '.', that dot is replaced by the ';' separator.
 *
 * Returns the (possibly moved) header, or NULL if the reallocation failed,
 * in which case the original header is left untouched.
 */
char* char_header_add_field(char* header, char* name, char* value)
{
	size_t lheader = strlen(header);
	size_t lname = strlen(name);
	size_t lvalue = strlen(value);
	char* grown;
	char* out;

	/* Room for "; " + name + "=" + value + terminating NUL. */
	grown = (char*) realloc(header, (lheader + lname + lvalue + 4) * sizeof(char));
	if (grown == NULL)
		return NULL;

	/* Only look at the last character when there is one: an empty header
	 * must not read the byte located before the buffer. */
	if (lheader > 0 && grown[lheader - 1] == '.')
		lheader--;

	out = grown + lheader;
	*out++ = ';';
	*out++ = ' ';
	memcpy(out, name, lname);
	out += lname;
	*out++ = '=';
	memcpy(out, value, lvalue + 1);	/* +1 copies the terminating NUL */

	return grown;
}
|
||||
|
||||
|
||||
/*
 * Builds a copy of seq->rawheader with "name=value;" inserted right after the
 * identifier (the text before the first space).
 *
 * The identifier is followed by one space, then "name=value;", then the rest
 * of the original header starting at its first space. seq->rawheader itself
 * is not modified.
 *
 * Returns a newly malloc'ed string owned by the caller, or NULL if the
 * allocation failed.
 */
char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value)
{
	const char* old = seq->rawheader;
	size_t lheader = strlen(old);
	size_t lname = strlen(name);
	size_t lvalue = strlen(value);
	size_t idlen = 0;
	char* rawheader;
	char* out;

	/* Length of the identifier: everything up to the first space. */
	while ((old[idlen] != ' ') && (old[idlen] != 0))
		idlen++;

	rawheader = (char*) malloc((lheader + lname + lvalue + 5) * sizeof(char));
	if (rawheader == NULL)
		return NULL;

	out = rawheader;
	memcpy(out, old, idlen);
	out += idlen;
	*out++ = ' ';
	memcpy(out, name, lname);
	out += lname;
	*out++ = '=';
	memcpy(out, value, lvalue);
	out += lvalue;
	*out++ = ';';

	/* Copy the tail straight from the source header (leading space included
	 * when there is one) together with its NUL. No intermediate buffer is
	 * needed, so a header starting with a space or an empty header cannot
	 * overrun anything. */
	memcpy(out, old + idlen, lheader - idlen + 1);

	return(rawheader);
}
|
||||
|
||||
|
||||
/*
 * Appends a (name, value) pair to a parsed header table.
 *
 * Entry 0 of the table stores the number of fields as decimal text; it is
 * read to find the new slot and rewritten with the incremented count.
 * Both strings are duplicated, so the caller keeps ownership of name/value.
 *
 * Returns the table, which may have been moved by realloc.
 */
element_from_header* table_header_add_field(element_from_header* header, char* name, char* value)
{
	int slot = atoi(header[0].value) + 1;
	size_t name_bytes = (1 + strlen(name)) * sizeof(char);
	size_t value_bytes = (1 + strlen(value)) * sizeof(char);

	header = (element_from_header*) realloc(header, (slot + 1) * sizeof(element_from_header));

	header[slot].name = (char*) malloc(name_bytes);
	memcpy(header[slot].name, name, name_bytes);

	header[slot].value = (char*) malloc(value_bytes);
	memcpy(header[slot].value, value, value_bytes);

	/* Publish the new field count in the bookkeeping entry. */
	sprintf(header[0].value, "%d", slot);

	return(header);
}
|
||||
|
||||
|
||||
/*
 * Releases a parsed header table: the name and value strings of every entry
 * (entry 0, which stores the field count, included) and then the table.
 */
void free_header_table(element_from_header* header)
{
	/* Read the count before entry 0's value is freed. */
	int last = atoi(header[0].value);
	int idx = 0;

	while (idx <= last)
	{
		free(header[idx].name);
		free(header[idx].value);
		idx++;
	}

	free(header);
}
|
||||
|
||||
|
||||
/*
 * Looks up a field by name in a parsed header table.
 *
 * Entries 1..count are searched (count is the decimal text stored in entry 0).
 * When several entries carry the same name the last one wins, hence the
 * backwards scan.
 *
 * Returns the stored value (not a copy), or 0 when no entry matches.
 */
char* getItemFromHeader(char* name, element_from_header* header)
{
	int idx;

	for (idx = atoi(header[0].value); idx >= 1; idx--)
	{
		if (strcmp(header[idx].name, name) == 0)
			return header[idx].value;
	}

	return 0;
}
|
||||
|
||||
|
||||
/*
 * Replaces the value of every field called `name` in a parsed header table
 * with a copy of newValue. Entries 1..count are examined; the value buffer of
 * each match is resized to fit. Nothing happens when no entry matches.
 */
void changeValue(element_from_header* header, char* name, char* newValue)
{
	int last = atoi(header[0].value);
	int idx = 1;

	while (idx <= last)
	{
		if (strcmp(header[idx].name, name) != 0)
		{
			idx++;
			continue;
		}

		header[idx].value = realloc(header[idx].value, (1 + strlen(newValue)) * sizeof(char));
		strcpy(header[idx].value, newValue);
		idx++;
	}
}
|
23
libfasta/fasta_header_handler.h
Normal file
23
libfasta/fasta_header_handler.h
Normal file
@ -0,0 +1,23 @@
|
||||
|
||||
#ifndef FASTA_HEADER_HANDLER_H_
|
||||
#define FASTA_HEADER_HANDLER_H_
|
||||
|
||||
|
||||
#include "sequence.h"
|
||||
|
||||
|
||||
char* char_header_add_field(char*,char*,char*);
|
||||
|
||||
char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value);
|
||||
|
||||
element_from_header* table_header_add_dic(element_from_header* header, char* name, struct hashtable *hashtab);
|
||||
|
||||
element_from_header* table_header_add_field(element_from_header* header, char* name, char* value);
|
||||
|
||||
void free_header_table(element_from_header*);
|
||||
|
||||
char* getItemFromHeader(char*, element_from_header*);
|
||||
|
||||
void changeValue(element_from_header* header, char* name, char* newValue);
|
||||
|
||||
#endif
|
1954
libfasta/fasta_header_parser.c
Normal file
1954
libfasta/fasta_header_parser.c
Normal file
File diff suppressed because it is too large
Load Diff
13
libfasta/fasta_header_parser.h
Normal file
13
libfasta/fasta_header_parser.h
Normal file
@ -0,0 +1,13 @@
|
||||
|
||||
#ifndef FASTA_HEADER_PARSER_H_
|
||||
#define FASTA_HEADER_PARSER_H_
|
||||
|
||||
/* One (name, value) pair of a parsed FASTA header. */
typedef struct {
	char *name;	/* field name */
	void *value;	/* field value; untyped pointer */
} element_from_header;
|
||||
|
||||
element_from_header* header_parser_main(char*);
|
||||
|
||||
|
||||
#endif
|
178
libfasta/fasta_header_parser.l
Normal file
178
libfasta/fasta_header_parser.l
Normal file
@ -0,0 +1,178 @@
|
||||
/*
|
||||
* Add -ll in Makefile if you modify this file to convert to .c
|
||||
*/
|
||||
|
||||
%x REGID
|
||||
%x REGNAME
|
||||
%x REGVAL
|
||||
|
||||
%{
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "header_mem_handler.h"
|
||||
#include "fasta_header_handler.h"
|
||||
|
||||
#define MEMALLOCATED 10
|
||||
#define BUFFER 5
|
||||
|
||||
#define YY_DECL int header_parser(int *nbf, int *memory_allocated, element_from_header **p_header)
|
||||
|
||||
|
||||
%}
|
||||
|
||||
WORD [[:alnum:]:\-.{},'_()\#\[\]\|\&\"\'\/\%\+]+
|
||||
WORDID [[:alnum:]:\-.{},'_()\#\[\]\|\&\"\'\/\%\+=;]+
|
||||
SUP >
|
||||
EOL \n
|
||||
SEP ;
|
||||
SPACE [[:blank:]]+
|
||||
EQUAL =
|
||||
|
||||
%%
|
||||
|
||||
int i;
|
||||
int size_needed;
|
||||
int free_size;
|
||||
char* field;
|
||||
|
||||
|
||||
<INITIAL>{SUP} {
|
||||
/*printf("\n<INITIAL>{SUP},%s",yytext);*/
|
||||
BEGIN(REGID);
|
||||
}
|
||||
|
||||
<INITIAL,REGID>{WORDID} {
|
||||
i=0;
|
||||
|
||||
field = malloc_field(&free_size);
|
||||
(*p_header)[*nbf].name = (char*) malloc(3*sizeof(char));
|
||||
strcpy(((*p_header)[*nbf]).name,"id");
|
||||
|
||||
size_needed = strlen(yytext)+1;
|
||||
(*p_header)[*nbf].value = (char*) malloc(sizeof(char)*size_needed);
|
||||
strcpy(((*p_header)[*nbf]).value,yytext);
|
||||
|
||||
(*nbf)++;
|
||||
}
|
||||
|
||||
|
||||
<INITIAL,REGID>{SPACE} {
|
||||
BEGIN(REGNAME);
|
||||
}
|
||||
|
||||
<REGNAME>{WORD} {
|
||||
/*fprintf(stderr,"\n<REGNAME>{WORD} **%s**",yytext);*/
|
||||
field = store_in_field(field,yytext,&free_size,&i);
|
||||
}
|
||||
|
||||
<REGNAME>{SPACE} {
|
||||
/*fprintf(stderr,"\n<REGNAME>{SPACE} **%s**",yytext);*/
|
||||
if (i != 0)
|
||||
field = store_in_field(field,yytext,&free_size,&i);
|
||||
}
|
||||
|
||||
<REGNAME>{EQUAL} {
|
||||
/*fprintf(stderr,"\n<REGNAME>{EQUAL},%s",yytext);*/
|
||||
field = store_in_header_table(field, &((*p_header)[*nbf].name), &free_size, &i);
|
||||
BEGIN(REGVAL);
|
||||
}
|
||||
|
||||
<REGNAME>{SEP} {
|
||||
/*fprintf(stderr,"\n<REGNAME>{SEP},%s",yytext);*/
|
||||
(*p_header)[*nbf].name = (char*) malloc(19*sizeof(char));
|
||||
strcpy((*p_header)[*nbf].name,"definition");
|
||||
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
|
||||
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
|
||||
BEGIN(REGNAME);
|
||||
}
|
||||
|
||||
<REGVAL>{WORD} {
|
||||
/*fprintf(stderr,"\n<REGVAL>{WORD} **%s**\n",yytext);*/
|
||||
field = store_in_field(field,yytext,&free_size,&i);
|
||||
}
|
||||
|
||||
<REGVAL>{SPACE} {
|
||||
/*fprintf(stderr,"\n<REGVAL>{SPACE} **%s**\n",yytext);*/
|
||||
field = store_in_field(field,yytext,&free_size,&i);
|
||||
}
|
||||
|
||||
<REGVAL>{SEP} {
|
||||
/*fprintf(stderr,"\n<REGVAL>{SEP},%s\n",yytext);*/
|
||||
|
||||
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
|
||||
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
|
||||
BEGIN(REGNAME);
|
||||
}
|
||||
|
||||
|
||||
<REGVAL>{EQUAL} {
|
||||
/*fprintf(stderr, "\nWarning : separator ';' probably missing in header after %s",(*p_header)[*nbf].name);*/
|
||||
}
|
||||
|
||||
<REGVAL><<EOF>> {
|
||||
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
|
||||
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
|
||||
end_header_table(p_header, *nbf);
|
||||
|
||||
free(field);
|
||||
BEGIN(INITIAL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
<REGNAME><<EOF>> {
|
||||
/*(*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19);
|
||||
strcpy((*p_header)[*nbf].name,"other_informations");
|
||||
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
|
||||
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
|
||||
*/
|
||||
end_header_table(p_header, *nbf);
|
||||
|
||||
free(field);
|
||||
BEGIN(INITIAL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
%%
|
||||
|
||||
/* End-of-input hook for the header scanner: always returns 1. */
int header_yywrap()
{
	const int no_more_input = 1;

	return no_more_input;
}
|
||||
|
||||
element_from_header* header_parser_main(char *h)
|
||||
{
|
||||
int nbfields,memory_allocated;
|
||||
element_from_header* header;
|
||||
char* nbfields_n;
|
||||
char* nbfields_v;
|
||||
|
||||
nbfields_n = (char*) malloc(9*sizeof(char));
|
||||
nbfields_v = (char*) malloc(5*sizeof(char));
|
||||
|
||||
memory_allocated=MEMALLOCATED;
|
||||
|
||||
nbfields=1;
|
||||
|
||||
strcpy(nbfields_n, "nbfields");
|
||||
strcpy(nbfields_v, "1");
|
||||
|
||||
header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header));
|
||||
|
||||
header[0].name = nbfields_n;
|
||||
header[0].value = nbfields_v;
|
||||
|
||||
YY_BUFFER_STATE state;
|
||||
|
||||
state=yy_scan_string(h);
|
||||
|
||||
header_parser(&nbfields, &memory_allocated, &header);
|
||||
|
||||
yy_delete_buffer(state);
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
76
libfasta/fasta_seq_writer.c
Normal file
76
libfasta/fasta_seq_writer.c
Normal file
@ -0,0 +1,76 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sequence.h"
|
||||
#include "fasta_header_parser.h"
|
||||
|
||||
|
||||
/*
 * Writes seq->sequence to output, wrapped at 60 characters per line.
 *
 * Every line, including the last, ends with '\n'; an empty sequence produces
 * a single empty line. The sequence is only read: lines are cut with a
 * precision in the format instead of temporarily writing NULs into the
 * caller's buffer.
 */
void printOnlySeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output)
{
	const char* cursor = seq->sequence;
	size_t remaining = strlen(cursor);

	/* Full 60-character lines, as long as more text follows them. */
	while (remaining > 60)
	{
		fprintf(output, "%.60s\n", cursor);
		cursor += 60;
		remaining -= 60;
	}

	/* Final line: at most 60 characters. */
	fprintf(output, "%s\n", cursor);
}
|
||||
|
||||
|
||||
/*
 * Writes the NUL-terminated string seq to output, wrapped at 60 characters
 * per line.
 *
 * Every line, including the last, ends with '\n'; an empty string produces a
 * single empty line. The string is only read, so it may live in read-only
 * memory: lines are cut with a precision in the format instead of
 * temporarily writing NULs into the caller's buffer.
 */
void printOnlySeqFromChar(char* seq, FILE* output)
{
	const char* cursor = seq;
	size_t remaining = strlen(cursor);

	/* Full 60-character lines, as long as more text follows them. */
	while (remaining > 60)
	{
		fprintf(output, "%.60s\n", cursor);
		cursor += 60;
		remaining -= 60;
	}

	/* Final line: at most 60 characters. */
	fprintf(output, "%s\n", cursor);
}
|
||||
|
||||
|
||||
/* Writes the FASTA title line of seq: '>' followed by its raw header and a newline. */
void printOnlyHeaderFromFastaSeqPtr(fastaSeqPtr seq, FILE* output)
{
	const char* title = seq->rawheader;

	fprintf(output, ">%s\n", title);
}
|
||||
|
||||
|
||||
/*
 * Writes a FASTA title line rebuilt from a parsed header table.
 *
 * Layout: '>' + value of entry 1 + ' ', then "name=value; " for each entry
 * 2..count whose name is not "definition", then, if the last entry is named
 * "definition", its value followed by "; ", and finally a newline.
 */
void printOnlyHeaderFromTable(element_from_header* header, FILE* output)
{
	int last = atoi(header[0].value);
	int idx;

	fprintf(output,">%s ",header[1].value);

	for (idx = 2; idx <= last; idx++)
	{
		/* The definition is not written as a key/value pair. */
		if (strcmp(header[idx].name, "definition") == 0)
			continue;

		fprintf(output,"%s",header[idx].name);
		fprintf(output,"=");
		fprintf(output,"%s; ",header[idx].value);
	}

	/* Only a definition stored in the last slot is printed. */
	if (strcmp(header[last].name, "definition") == 0)
		fprintf(output,"%s; ",header[last].value);

	fprintf(output,"\n");
}
|
||||
|
||||
|
||||
/*
 * Writes one complete FASTA record to output: the title line, then the
 * sequence as printed by printOnlySeqFromFastaSeqPtr.
 */
void printHeaderAndSeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output)
{
	/* Title line first ... */
	printOnlyHeaderFromFastaSeqPtr(seq, output);

	/* ... then the residues. */
	printOnlySeqFromFastaSeqPtr(seq, output);
}
|
19
libfasta/fasta_seq_writer.h
Normal file
19
libfasta/fasta_seq_writer.h
Normal file
@ -0,0 +1,19 @@
|
||||
|
||||
#ifndef FASTA_SEQ_WRITER_H_
|
||||
#define FASTA_SEQ_WRITER_H_
|
||||
|
||||
#include "sequence.h"
|
||||
|
||||
|
||||
void printOnlySeqFromFastaSeqPtr(fastaSeqPtr, FILE*);
|
||||
|
||||
void printOnlySeqFromChar(char*, FILE*);
|
||||
|
||||
void printOnlyHeaderFromFastaSeqPtr(fastaSeqPtr, FILE*);
|
||||
|
||||
void printOnlyHeaderFromTable(element_from_header*, FILE*);
|
||||
|
||||
void printHeaderAndSeqFromFastaSeqPtr(fastaSeqPtr, FILE*);
|
||||
|
||||
|
||||
#endif
|
93
libfasta/header_mem_handler.c
Normal file
93
libfasta/header_mem_handler.c
Normal file
@ -0,0 +1,93 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "header_mem_handler.h"
|
||||
#include <string.h>
|
||||
|
||||
#define FIELD_BUFFER 1024
|
||||
|
||||
|
||||
char* malloc_field(int *free_size)
|
||||
{
|
||||
char* field = (char*) malloc(sizeof(char) * FIELD_BUFFER);
|
||||
field[0] = 0;
|
||||
(*free_size) = FIELD_BUFFER;
|
||||
return field;
|
||||
}
|
||||
|
||||
int check_mem_field(int size_needed)
|
||||
{
|
||||
int number_of_chunks_to_alloc;
|
||||
number_of_chunks_to_alloc = size_needed / FIELD_BUFFER + 1;
|
||||
return number_of_chunks_to_alloc;
|
||||
}
|
||||
|
||||
char* realloc_field(int number_of_chunks_to_alloc, char* field)
|
||||
{
|
||||
int size_needed;
|
||||
size_needed = number_of_chunks_to_alloc * FIELD_BUFFER;
|
||||
field = realloc(field, (size_needed)*sizeof(char));
|
||||
return field;
|
||||
}
|
||||
|
||||
char* check_and_realloc_field(char* field, int size_needed, int* free_size)
|
||||
{
|
||||
size_needed = size_needed + strlen(field);
|
||||
int number_of_chunks_to_alloc = check_mem_field(size_needed);
|
||||
if (strlen(field)>0)
|
||||
field = realloc_field(number_of_chunks_to_alloc, field);
|
||||
else
|
||||
{
|
||||
free(field);
|
||||
field = malloc(number_of_chunks_to_alloc * FIELD_BUFFER);
|
||||
}
|
||||
(*free_size) = number_of_chunks_to_alloc*FIELD_BUFFER - size_needed + 1;
|
||||
return field;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Copies the token yytext into field at offset *i.
 *
 * The buffer is resized first when the token plus its terminator does not
 * fit in *free_size. Afterwards *i is advanced by the token length and
 * *free_size describes the space left.
 *
 * Returns the (possibly moved) buffer.
 */
char* store_in_field(char* field, char* yytext, int* free_size, int* i)
{
	int text_len = strlen(yytext);
	int with_nul = text_len + 1;

	if (with_nul > (*free_size))
		field = check_and_realloc_field(field, with_nul, free_size);
	else
		(*free_size) -= text_len;	/* the terminator slot is reused by the next token */

	strcpy(field + (*i), yytext);
	(*i) += text_len;

	return field;
}
|
||||
|
||||
|
||||
/*
 * Moves the accumulated field text into the header table.
 *
 * *storing_place receives a newly allocated, exact-size copy of field. The
 * field buffer is then released, *i is reset to 0, and a fresh empty buffer
 * from malloc_field (which also resets *free_size) is returned.
 */
char* store_in_header_table(char* field, char** storing_place, int* free_size, int* i)
{
	size_t bytes = strlen(field) + 1;

	*storing_place = (char*) malloc(bytes*sizeof(char));
	memcpy(*storing_place, field, bytes);

	free(field);
	(*i) = 0;

	return malloc_field(free_size);
}
|
||||
|
||||
|
||||
/*
 * Advances the field counter *nbf and, when it reaches *memory_allocated,
 * grows the table *p_header by one slot (incrementing *memory_allocated).
 *
 * Returns p_header.
 */
element_from_header** check_and_realloc_mem_in_header_table(element_from_header** p_header, int* nbf, int* memory_allocated)
{
	*nbf += 1;

	/* Still a free slot: nothing to resize. */
	if (*nbf != *memory_allocated)
		return p_header;

	*memory_allocated += 1;
	*p_header = (element_from_header*) realloc(*p_header, (*memory_allocated) * sizeof(element_from_header));

	return p_header;
}
|
||||
|
||||
/*
 * Writes nbf - 1, as decimal text, into the value of the table's first
 * entry (the bookkeeping slot).
 */
void end_header_table(element_from_header** p_header, int nbf)
{
	int stored_fields = nbf - 1;

	sprintf((*p_header)[0].value, "%d", stored_fields);
}
|
22
libfasta/header_mem_handler.h
Normal file
22
libfasta/header_mem_handler.h
Normal file
@ -0,0 +1,22 @@
|
||||
#ifndef HEADER_MEM_HANDLER_H_
|
||||
#define HEADER_MEM_HANDLER_H_
|
||||
|
||||
#include "fasta_header_parser.h"
|
||||
|
||||
char* malloc_field(int*);
|
||||
|
||||
int check_mem_field(int);
|
||||
|
||||
char* realloc_field(int, char*);
|
||||
|
||||
char* check_and_realloc_field(char*, int, int*);
|
||||
|
||||
char* store_in_field(char*, char*, int*, int*);
|
||||
|
||||
char* store_in_header_table(char*, char**, int*, int*);
|
||||
|
||||
element_from_header** check_and_realloc_mem_in_header_table(element_from_header**, int*, int*);
|
||||
|
||||
void end_header_table(element_from_header** p_header, int nbf);
|
||||
|
||||
#endif
|
450
libfasta/sequence.c
Normal file
450
libfasta/sequence.c
Normal file
@ -0,0 +1,450 @@
|
||||
/**
|
||||
* FileName: sequence.c
|
||||
* Authors: Tiayyba Riaz, Celine Mercier
|
||||
* Description: C file for sequence reading and parsing
|
||||
* **/
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "../libutils/utilities.h"
|
||||
#include "sequence.h"
|
||||
#include "../libfile/fileHandling.h"
|
||||
#include "fasta_header_handler.h"
|
||||
#include "fasta_header_parser.h"
|
||||
|
||||
|
||||
/*
|
||||
* Function Name: seq_getNext(FILE *fp, char *fieldDelim)
|
||||
* Description: Gets the next sequence from file by calling another function, passes the sequence
|
||||
* to other function to get the header elements and nucleotide sequence into a structure of
|
||||
* type fastaSeq and returns a pointer to this newly populated structure.
|
||||
*/
|
||||
|
||||
/*
 * Reads the next record from fp and returns it as a newly allocated fastaSeq.
 *
 * fp:            input stream.
 * fieldDelim:    forwarded to seq_fillHeader.
 * isStandardSeq: when false the body is read with seq_fillDigitSeq.
 * onlyATGC:      for standard sequences, selects seq_fillSeqOnlyATGC over
 *                seq_fillSeq.
 *
 * Returns NULL when no record could be read or when the record holds no
 * '\n' separating the header from the sequence. Otherwise the caller owns
 * the returned element. It is zero-filled before being populated, so fields
 * this function does not set are 0/NULL rather than indeterminate.
 */
fastaSeqPtr seq_getNext(FILE *fp, char *fieldDelim, BOOL isStandardSeq, BOOL onlyATGC)
{
	char *seq;
	char *header;
	char *strTemp;
	fastaSeqPtr seqElem;
	int seqLen;
	size_t headerLen;

	seq = seq_readNextFromFilebyLine(fp);
	if (seq == NULL)
		return NULL;

	/* Find header separator \n, if not found return NULL */
	strTemp = strchr(seq, '\n');
	if (strTemp == NULL)
	{
		/* The record text is ours to release on this path too. */
		util_free((void *)seq);
		return NULL;
	}

	headerLen = (size_t)(strTemp - seq);
	seqLen = strlen(strTemp);

	/* Separate header in header variable */
	header = (char*) util_malloc((headerLen + 1)*sizeof(char), __FILE__, __LINE__);
	memcpy(header, seq, headerLen);
	header[headerLen] = '\0';

	/* Get memory for new sequence structure element */
	seqElem = (fastaSeqPtr) util_malloc(sizeof(fastaSeq), __FILE__, __LINE__);
	memset(seqElem, 0, sizeof(fastaSeq));

	/* Parse header and assign values to structure fields. seq_fillHeader
	 * keeps its own copies, so the temporary header can be released. */
	seq_fillHeader(header, fieldDelim, seqElem);
	util_free((void *)header);

	/* Get clean sequence and assign to structure field */
	if (isStandardSeq)
	{
		if (onlyATGC)
			seq_fillSeqOnlyATGC(strTemp, seqElem, seqLen);
		else
			seq_fillSeq(strTemp, seqElem, seqLen);
	}
	else
	{
		seq_fillDigitSeq(strTemp, seqElem, seqLen);
	}

	/* The raw record text is no longer needed */
	util_free((void *)seq);

	/* Return new sequence structure element */
	return seqElem;
}
|
||||
|
||||
|
||||
char *seq_readNextFromFilebyLine(FILE* fp)
|
||||
{
|
||||
char newc = '\0';
|
||||
BOOL seqCompleted = FALSE;
|
||||
int length = 500;
|
||||
int32_t len;
|
||||
char tempstr[length];
|
||||
char* buffer;
|
||||
|
||||
if (feof(fp)) return NULL;
|
||||
newc = file_nextChar(fp);
|
||||
if (newc != '>') ungetc(newc, fp);
|
||||
|
||||
buffer = util_malloc(1*sizeof(char), __FILE__, __LINE__);
|
||||
buffer[0] = '\0';
|
||||
|
||||
while(!seqCompleted)
|
||||
{
|
||||
newc = file_nextChar(fp);
|
||||
if(newc == '>' || newc == '\0')
|
||||
{
|
||||
seqCompleted = TRUE;
|
||||
if (newc == '>')
|
||||
ungetc(newc, fp); // Make sure next time we start from sequence delimiter >
|
||||
}
|
||||
else
|
||||
{
|
||||
ungetc(newc, fp);
|
||||
if(file_nextLine( fp, tempstr, length) != NULL)
|
||||
{
|
||||
len = strlen(tempstr) + strlen(buffer) + 1;
|
||||
buffer = util_realloc(buffer, len, __FILE__, __LINE__);
|
||||
strcat(buffer, tempstr);
|
||||
}
|
||||
else
|
||||
{
|
||||
seqCompleted = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Function Name: seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem)
|
||||
*/
|
||||
/*
 * Fills the header fields of seqElem from a title line.
 *
 * rawheader receives a strdup'ed copy of the whole header; accession_id
 * receives a copy of the text before the first space (the whole header when
 * there is none). fieldDelim is not used.
 */
void seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem)
{
	char* firstSpace;
	int idLen;

	seqElem->rawheader = strdup(header);

	firstSpace = strchr(header, ' ');
	idLen = strlen(header);
	if (firstSpace != NULL)
		idLen -= strlen(firstSpace);

	seqElem->accession_id = (char*) util_malloc(1+idLen*sizeof(char), __FILE__, __LINE__);
	memcpy(seqElem->accession_id, header, idLen);
	(seqElem->accession_id)[idLen] = '\0';
}
|
||||
|
||||
|
||||
/*
|
||||
* Function Name: seq_fillSeq(char *seq, fastaSeqPtr seqElem)
|
||||
* Description: Parses the whole sequences for actual nucleotide sequences and stores that
|
||||
* sequence in the field of structure 'seqElem' .
|
||||
*/
|
||||
/*
 * Extracts the nucleotide sequence from raw record text.
 *
 * Only characters of "acgtACGT-nN" are kept, lower-cased; everything else
 * (newlines included) is dropped. seqElem->sequence receives a strdup'ed
 * copy of the result and seqElem->length its length.
 *
 * seqLen is kept for interface compatibility. The scratch buffer is sized
 * from the text itself so it always has room for the terminator, and it is
 * released once the result has been duplicated.
 */
void seq_fillSeq(char *seq, fastaSeqPtr seqElem, int seqLen)
{
	char* seqTemp;
	char c;
	int32_t index = 0, seqIndex = 0, len = strlen(seq);
	char* seqAlphabets = "acgtACGT-nN";

	(void) seqLen;

	/* At most len characters are kept, plus the terminating NUL. */
	seqTemp = (char*) util_malloc((len + 1)*sizeof(char), __FILE__, __LINE__);

	while (index < len)
	{
		c = seq[index++];
		if (strchr(seqAlphabets, c) != NULL)
			seqTemp[seqIndex++] = tolower((unsigned char) c);
	}
	seqTemp[seqIndex] = '\0';

	seqElem->length = seqIndex;
	seqElem->sequence = strdup(seqTemp);

	util_free((void *)seqTemp);
}
|
||||
|
||||
|
||||
/*
 * Extracts the nucleotide sequence from raw record text, accepting only
 * a/c/g/t (either case).
 *
 * Kept characters are lower-cased and newlines are skipped. If any other
 * character is met the record is rejected: seqElem->sequence becomes the
 * empty string and seqElem->length is set to 0. Otherwise sequence receives
 * a strdup'ed copy of the result and length its length.
 *
 * seqLen is kept for interface compatibility. The scratch buffer is sized
 * from the text itself so it always has room for the terminator, and it is
 * released once the result has been duplicated.
 */
void seq_fillSeqOnlyATGC(char *seq, fastaSeqPtr seqElem, int seqLen)
{
	char* seqTemp;
	char c;
	int32_t index = 0, seqIndex = 0, len = strlen(seq);
	char* seqAlphabets = "acgtACGT";
	int notAllATGC = 0;

	(void) seqLen;

	/* At most len characters are kept, plus the terminating NUL. */
	seqTemp = (char*) util_malloc((len + 1)*sizeof(char), __FILE__, __LINE__);

	while (index < len)
	{
		c = seq[index++];
		if (strchr(seqAlphabets, c) != NULL)
			seqTemp[seqIndex++] = tolower((unsigned char) c);
		else if (c != '\n')
			notAllATGC = 1;
	}

	/* A rejected record is reported as an empty sequence of length 0. */
	if (notAllATGC)
		seqIndex = 0;

	seqTemp[seqIndex] = '\0';
	seqElem->length = seqIndex;
	seqElem->sequence = strdup(seqTemp);

	util_free((void *)seqTemp);
}
|
||||
|
||||
|
||||
/*
 * Extracts a digit sequence from raw record text: only '0'..'9' and spaces
 * are kept, everything else is dropped. seqElem->sequence receives a
 * strdup'ed copy of the result.
 *
 * seqLen is kept for interface compatibility. The scratch buffer is sized
 * from the text itself so it always has room for the terminator, and it is
 * released once the result has been duplicated.
 */
void seq_fillDigitSeq(char *seq, fastaSeqPtr seqElem, int seqLen)
{
	char* seqTemp;
	char c;
	int32_t index = 0, seqIndex = 0, len = strlen(seq);

	(void) seqLen;

	/* At most len characters are kept, plus the terminating NUL. */
	seqTemp = (char*) util_malloc((len + 1)*sizeof(char), __FILE__, __LINE__);

	while (index < len)
	{
		c = seq[index++];
		if ((c >= '0' && c <= '9') || c == ' ')
			seqTemp[seqIndex++] = c;
	}
	seqTemp[seqIndex] = '\0';

	seqElem->sequence = strdup(seqTemp);

	util_free((void *)seqTemp);
}
|
||||
|
||||
|
||||
/*
 * Loads every record of a file into one contiguous array.
 *
 * fileName:      file to read; the program exits with status 1 if it
 *                cannot be opened.
 * isStandardSeq, onlyATGC: forwarded to seq_getNext.
 *
 * Records whose sequence is empty are dropped and counted; their strings are
 * released here since nothing else references them. The array grows by 1000
 * slots at a time and is trimmed to the number of kept records at the end,
 * unless that number is 0, in which case the initial allocation is returned
 * untouched instead of asking for a zero-byte reallocation.
 *
 * Returns the array together with the number of records it holds.
 */
fastaSeqCount seq_readAllSeq2(char *fileName, BOOL isStandardSeq, BOOL onlyATGC)
{
	FILE* fp;
	fastaSeqPtr seqPtr;
	fastaSeqPtr seqPtrAr;

	int32_t counter = 0;
	int32_t slots = 1000;
	fastaSeqCount allseqs;
	int32_t discarded=0;

	fp = file_open(fileName, TRUE);

	if (fp == NULL)
	{
		fprintf(stderr, "\nCould not open file.\n");
		exit(1);
	}

	exitIfEmptyFile(fp);

	seqPtrAr = (fastaSeqPtr) util_malloc(slots*sizeof(fastaSeq), __FILE__, __LINE__);

	seqPtr = seq_getNext(fp, " ", isStandardSeq, onlyATGC);

	while (seqPtr != NULL)
	{
		if (counter == slots)
		{
			slots += 1000;
			seqPtrAr = (fastaSeqPtr)util_realloc(seqPtrAr, slots*sizeof(fastaSeq), __FILE__, __LINE__);
		}

		if ((seqPtr->sequence)[0] != '\0')
		{
			/* Shallow copy: the array entry takes over the record's strings. */
			seqPtrAr[counter++] = *seqPtr;
		}
		else
		{
			/* Dropped record: free its strings with the allocator that made
			 * them (strdup for rawheader/sequence, util_malloc for the id). */
			free(seqPtr->rawheader);
			util_free((void *)seqPtr->accession_id);
			free(seqPtr->sequence);
			discarded++;
		}

		util_free((void *)seqPtr);
		seqPtr = seq_getNext(fp, " ", isStandardSeq, onlyATGC);
	}
	fclose(fp);

	if (counter != slots && counter > 0)
		seqPtrAr = (fastaSeqPtr)util_realloc(seqPtrAr, counter*sizeof(fastaSeq), __FILE__, __LINE__);

	allseqs.count = counter;
	allseqs.fastaSeqs = seqPtrAr;

	if (discarded)
		fprintf(stderr, "\nDiscarded %d sequences that did not contain only 'AaTtGgCc' characters.", discarded);

	return allseqs;
}
|
||||
|
||||
|
||||
/*
 * Linear search for a record by accession id.
 *
 * Returns the index of the first record of allseqs whose accession_id equals
 * accid, or -1 when there is none.
 */
int32_t seq_findSeqByAccId (char *accid, fastaSeqCountPtr allseqs)
{
	int32_t pos = 0;

	while (pos < allseqs->count)
	{
		if (strcmp (accid, allseqs->fastaSeqs[pos].accession_id) == 0)
			return pos;
		pos++;
	}

	return -1;
}
|
||||
|
||||
|
||||
/*
 * Prints every record of allseq to stdout as a two-line FASTA entry.
 *
 * Records without a sequence are skipped. The title line uses the raw header
 * when present and the accession id otherwise.
 */
void seq_printSeqs (fastaSeqCountPtr allseq)
{
	int32_t pos;
	fastaSeqPtr rec;
	char *title;

	for (pos = 0; pos < allseq->count; pos++)
	{
		rec = allseq->fastaSeqs + pos;

		if (rec->sequence == NULL)
			continue;

		title = rec->rawheader ? rec->rawheader : rec->accession_id;
		printf (">%s\n", title);
		printf ("%s\n", rec->sequence);
	}
}
|
||||
|
||||
|
||||
/*
 * Replaces, in place, every character of every sequence that is not one of
 * 'a', 'c', 'g', 't' with 'a'.
 *
 * Progress is reported on stderr, with the number of substituted nucleotides
 * and of affected sequences when anything changed.
 *
 * Returns db.count.
 */
int cleanDB(fastaSeqCount db) // replace not a/t/g/c with a's
{
	int32_t rec;
	char *cursor;
	BOOL touched;
	int32_t seqchanged=0;
	int32_t nucchanged=0;

	fprintf(stderr,"Cleaning dataset...");

	for (rec = 0; rec < db.count; rec++)
	{
		touched = FALSE;
		cursor = db.fastaSeqs[rec].sequence;

		while (*cursor != 0)
		{
			switch (*cursor)
			{
			case 'a':
			case 'c':
			case 'g':
			case 't':
				break;
			default:
				touched = TRUE;
				nucchanged++;
				*cursor = 'a';
				break;
			}
			cursor++;
		}

		if (touched)
			seqchanged++;
	}

	if (seqchanged)
		fprintf(stderr," : %d nucleotides substituted in %d sequences\n",nucchanged,seqchanged);
	else
		fprintf(stderr," : Done\n");

	return(db.count);
}
|
||||
|
||||
|
||||
/*
 * Parses the raw header of every record of db and sets its abundance.
 *
 * The parsed table is stored in the record's `header` field. When the header
 * has a "count" field its integer value becomes the record's count;
 * otherwise a "count" field with value "1" is appended to the table and the
 * count is set to 1.
 *
 * The default name/value strings live on the stack: table_header_add_field
 * copies them, so the heap copies previously made here were never used
 * again and never freed.
 */
void addCounts(fastaSeqCount* db)
{
	int s;
	char* count;
	element_from_header* header;
	char count_n[] = "count";
	char count_v[] = "1";

	for (s=0; s < db->count; s++)
	{
		header = header_parser_main(db->fastaSeqs[s].rawheader);
		count = getItemFromHeader("count", header);
		if (count == NULL)	// no count field
		{
			header = table_header_add_field(header, count_n, count_v);
			db->fastaSeqs[s].count = 1;
		}
		else
		{
			db->fastaSeqs[s].count = atoi(count);
		}
		db->fastaSeqs[s].header = header;
	}
}
|
||||
|
||||
|
||||
/*
 * Collapses runs of consecutive identical sequences.
 *
 * db is scanned in order. The first record of each run of equal sequences is
 * its "unique head": uniqHead is set to TRUE, a pointer to it is stored in
 * (*uniqSeqs)[k], and the counts of the following identical records (flagged
 * uniqHead = FALSE) are added to its count. Only neighbours are compared.
 *
 * *uniqSeqs must already point to an array with room for db->count pointers.
 *
 * Returns the number of unique heads, 0 for an empty database (nothing is
 * read or written in that case).
 */
int uniqSeqsVector(fastaSeqCount* db, fastaSeqPtr** uniqSeqs)
{
	int i, j, k;

	/* No record at all: fastaSeqs[0] does not exist. */
	if (db->count <= 0)
		return(0);

	(*uniqSeqs)[0] = db->fastaSeqs;
	db->fastaSeqs[0].uniqHead = TRUE;

	i = 0;	/* index of the current unique head */
	k = 1;	/* number of heads found so far */

	for (j=1; j < db->count; j++)
	{
		if (strcmp(db->fastaSeqs[i].sequence, db->fastaSeqs[j].sequence) == 0)
		{
			db->fastaSeqs[i].count += db->fastaSeqs[j].count;
			db->fastaSeqs[j].uniqHead = FALSE;
		}
		else
		{
			db->fastaSeqs[j].uniqHead = TRUE;
			(*uniqSeqs)[k] = (db->fastaSeqs)+j;
			k++;
			i = j;
		}
	}

	return(k);
}
|
||||
|
||||
|
||||
/*
 * Computes the longest and shortest `length` among the n records referenced
 * by the pointer array db.
 *
 * *lmax starts from 0, so it is 0 when n is 0; *lmin starts from the maximum
 * found and is lowered from there.
 */
void calculateMaxAndMinLen(fastaSeqPtr* db, int n, int* lmax, int* lmin)
{
	int idx;
	int longest = 0;
	int shortest;

	for (idx = 0; idx < n; idx++)
	{
		if (db[idx]->length > longest)
			longest = db[idx]->length;
	}

	shortest = longest;
	for (idx = 0; idx < n; idx++)
	{
		if (db[idx]->length < shortest)
			shortest = db[idx]->length;
	}

	*lmax = longest;
	*lmin = shortest;
}
|
||||
|
||||
|
||||
/*
 * Computes the longest and shortest `length` among all records of db.
 *
 * *lmax starts from 0, so it is 0 for an empty database; *lmin starts from
 * the maximum found and is lowered from there.
 */
void calculateMaxAndMinLenDB(fastaSeqCount db, int* lmax, int* lmin)
{
	int idx;
	int longest = 0;
	int shortest;

	for (idx = 0; idx < db.count; idx++)
	{
		if (db.fastaSeqs[idx].length > longest)
			longest = db.fastaSeqs[idx].length;
	}

	shortest = longest;
	for (idx = 0; idx < db.count; idx++)
	{
		if (db.fastaSeqs[idx].length < shortest)
			shortest = db.fastaSeqs[idx].length;
	}

	*lmax = longest;
	*lmin = shortest;
}
|
||||
|
||||
|
||||
int sortSeqsWithCounts(const void **s1, const void **s2)
|
||||
{
|
||||
return(((fastaSeqPtr) *s2)->count - ((fastaSeqPtr) *s1)->count);
|
||||
}
|
||||
|
||||
|
||||
int reverseSortSeqsWithCounts(const void **s1, const void **s2)
|
||||
{
|
||||
return(((fastaSeqPtr) *s1)->count - ((fastaSeqPtr) *s2)->count);
|
||||
}
|
64
libfasta/sequence.h
Normal file
64
libfasta/sequence.h
Normal file
@ -0,0 +1,64 @@
|
||||
/**
|
||||
* FileName: sequence.h
|
||||
* Authors: Tiayyba Riaz, Celine Mercier
|
||||
* Description: Prototypes and other declarations for sequences
|
||||
* **/
|
||||
#ifndef SEQUENCE_H_
|
||||
#define SEQUENCE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include "../libutils/utilities.h"
|
||||
#include "fasta_header_parser.h"
|
||||
|
||||
|
||||
typedef struct {
|
||||
char* accession_id; // identifier
|
||||
char *rawheader; // not parsed header
|
||||
element_from_header* header; // parsed header
|
||||
char *sequence; // DNA sequence itself
|
||||
int32_t length; // DNA sequence's length
|
||||
int32_t count; // abundance of the sequence
|
||||
unsigned char *table; // 4mer occurrence table build using function buildTable
|
||||
int32_t over; // count of 4mer with occurrences greater than 255 (overflow)
|
||||
struct fastaSeqPtr* next; // next unique sequence for example
|
||||
BOOL cluster_center; // whether the sequence is a cluster center or not
|
||||
int32_t cluster_weight; // cluster weight when sequence is cluster center
|
||||
int32_t cluster_weight_unique_ids; // cluster weight when sequence is cluster center, counting the number sequence records
|
||||
double score; // score with cluster center for example
|
||||
struct fastaSeqPtr* center; // pointer to the sequence's cluster center
|
||||
int32_t center_index; // index of the sequence's cluster center
|
||||
BOOL uniqHead; // whether the sequence is a unique head or not
|
||||
char* columns_BIOM; // to print in BIOM format
|
||||
int columns_BIOM_size; // size allocated for columns_BIOM
|
||||
char* line_OTU_table; // to print in OTU table format
|
||||
int line_OTU_table_size; // size allocated for line_OTU_table
|
||||
struct hashtable *sample_counts; // sample counts for sumaclean
|
||||
}fastaSeq,*fastaSeqPtr;
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
fastaSeqPtr fastaSeqs;
|
||||
}fastaSeqCount, *fastaSeqCountPtr;
|
||||
|
||||
|
||||
fastaSeqPtr seq_getNext(FILE *fp, char *fieldDelim, BOOL isStandardSeq, BOOL onlyATGC);
|
||||
char *seq_readNextFromFilebyLine(FILE* fp);
|
||||
void seq_fillSeq(char *seq, fastaSeqPtr seqElem, int seqLen);
|
||||
void seq_fillSeqOnlyATGC(char *seq, fastaSeqPtr seqElem, int seqLen);
|
||||
void seq_fillDigitSeq(char *seq, fastaSeqPtr seqElem, int seqLen);
|
||||
void seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem);
|
||||
fastaSeqCount seq_readAllSeq2(char *fileName, BOOL isStandardSeq, BOOL onlyATGC);
|
||||
int32_t seq_findSeqByAccId (char *accid, fastaSeqCountPtr allseqs);
|
||||
void seq_printSeqs (fastaSeqCountPtr allseq);
|
||||
int cleanDB(fastaSeqCount);
|
||||
void addCounts(fastaSeqCount* db);
|
||||
int uniqSeqsVector(fastaSeqCount* db, fastaSeqPtr** uniqSeqs);
|
||||
void calculateMaxAndMinLen(fastaSeqPtr* db, int n, int* lmax, int* lmin);
|
||||
void calculateMaxAndMinLenDB(fastaSeqCount db, int* lmax, int* lmin);
|
||||
int sortSeqsWithCounts(const void **s1, const void **s2);
|
||||
int reverseSortSeqsWithCounts(const void **s1, const void **s2);
|
||||
void readSampleCounts(fastaSeqCount* db, char* key_name);
|
||||
|
||||
#endif /*SEQUENCE_H_*/
|
Reference in New Issue
Block a user