65 lines
2.8 KiB
C
Executable File
65 lines
2.8 KiB
C
Executable File
/**
|
|
* FileName: sequence.h
|
|
* Authors: Tiayyba Riaz, Celine Mercier
|
|
* Description: Prototypes and other declarations for sequences
|
|
* **/
|
|
#ifndef SEQUENCE_H_
|
|
#define SEQUENCE_H_
|
|
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include "../libutils/utilities.h"
|
|
#include "fasta_header_parser.h"
|
|
|
|
|
|
typedef struct {
|
|
char* accession_id; // identifier
|
|
char *rawheader; // not parsed header
|
|
element_from_header* header; // parsed header
|
|
char *sequence; // DNA sequence itself
|
|
int32_t length; // DNA sequence's length
|
|
int32_t count; // abundance of the sequence
|
|
unsigned char *table; // 4mer occurrence table build using function buildTable
|
|
int32_t over; // count of 4mer with occurrences greater than 255 (overflow)
|
|
struct fastaSeqPtr* next; // next unique sequence for example
|
|
BOOL cluster_center; // whether the sequence is a cluster center or not
|
|
int32_t cluster_weight; // cluster weight when sequence is cluster center
|
|
int32_t cluster_weight_unique_ids; // cluster weight when sequence is cluster center, counting the number sequence records
|
|
double score; // score with cluster center for example
|
|
struct fastaSeqPtr* center; // pointer to the sequence's cluster center
|
|
int32_t center_index; // index of the sequence's cluster center
|
|
BOOL uniqHead; // whether the sequence is a unique head or not
|
|
char* columns_BIOM; // to print in BIOM format
|
|
int columns_BIOM_size; // size allocated for columns_BIOM
|
|
char* line_OTU_table; // to print in OTU table format
|
|
int line_OTU_table_size; // size allocated for line_OTU_table
|
|
struct hashtable *sample_counts; // sample counts for sumaclean
|
|
}fastaSeq,*fastaSeqPtr;
|
|
|
|
|
|
typedef struct {
|
|
int32_t count;
|
|
fastaSeqPtr fastaSeqs;
|
|
}fastaSeqCount, *fastaSeqCountPtr;
|
|
|
|
|
|
fastaSeqPtr seq_getNext(FILE *fp, char *fieldDelim, BOOL isStandardSeq, BOOL onlyATGC);
|
|
char *seq_readNextFromFilebyLine(FILE* fp);
|
|
void seq_fillSeq(char *seq, fastaSeqPtr seqElem, int seqLen);
|
|
void seq_fillSeqOnlyATGC(char *seq, fastaSeqPtr seqElem, int seqLen);
|
|
void seq_fillDigitSeq(char *seq, fastaSeqPtr seqElem, int seqLen);
|
|
void seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem);
|
|
fastaSeqCount seq_readAllSeq2(char *fileName, BOOL isStandardSeq, BOOL onlyATGC);
|
|
int32_t seq_findSeqByAccId (char *accid, fastaSeqCountPtr allseqs);
|
|
void seq_printSeqs (fastaSeqCountPtr allseq);
|
|
int cleanDB(fastaSeqCount);
|
|
void addCounts(fastaSeqCount* db);
|
|
int uniqSeqsVector(fastaSeqCount* db, fastaSeqPtr** uniqSeqs);
|
|
void calculateMaxAndMinLen(fastaSeqPtr* db, int n, int* lmax, int* lmin);
|
|
void calculateMaxAndMinLenDB(fastaSeqCount db, int* lmax, int* lmin);
|
|
int sortSeqsWithCounts(const void **s1, const void **s2);
|
|
int reverseSortSeqsWithCounts(const void **s1, const void **s2);
|
|
void readSampleCounts(fastaSeqCount* db, char* key_name);
|
|
|
|
#endif /*SEQUENCE_H_*/
|