From 2ba6d16147cd6f0816b4afe2aae2da581692e285 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sat, 28 Jul 2018 17:13:45 +0200 Subject: [PATCH] New command: obi ecopcr --- python/obitools3/commands/ecopcr.pyx | 188 ++++ python/obitools3/dms/capi/obiecopcr.pxd | 28 + python/obitools3/dms/dms.cfiles | 9 + python/obitools3/utils.cfiles | 9 + src/libecoPCR/ecoError.c | 26 + src/libecoPCR/ecoMalloc.c | 79 ++ src/libecoPCR/ecoPCR.h | 128 +++ src/libecoPCR/ecoapat.c | 205 ++++ src/libecoPCR/ecodna.c | 156 +++ src/libecoPCR/libapat/CODES/dft_code.h | 14 + src/libecoPCR/libapat/CODES/dna_code.h | 71 ++ src/libecoPCR/libapat/CODES/prot_code.h | 51 + src/libecoPCR/libapat/Gmach.h | 97 ++ src/libecoPCR/libapat/Gtypes.h | 70 ++ src/libecoPCR/libapat/apat.h | 176 ++++ src/libecoPCR/libapat/apat_parse.c | 369 +++++++ src/libecoPCR/libapat/apat_search.c | 339 +++++++ src/libecoPCR/libapat/libstki.c | 380 +++++++ src/libecoPCR/libapat/libstki.h | 89 ++ src/libecoPCR/libthermo/nnparams.c | 619 ++++++++++++ src/libecoPCR/libthermo/nnparams.h | 62 ++ src/obi_ecopcr.c | 1229 +++++++++++++++++++++++ src/obi_ecopcr.h | 127 +++ src/obierrno.h | 3 +- 24 files changed, 4523 insertions(+), 1 deletion(-) create mode 100644 python/obitools3/commands/ecopcr.pyx create mode 100644 python/obitools3/dms/capi/obiecopcr.pxd create mode 100644 src/libecoPCR/ecoError.c create mode 100644 src/libecoPCR/ecoMalloc.c create mode 100644 src/libecoPCR/ecoPCR.h create mode 100644 src/libecoPCR/ecoapat.c create mode 100644 src/libecoPCR/ecodna.c create mode 100644 src/libecoPCR/libapat/CODES/dft_code.h create mode 100644 src/libecoPCR/libapat/CODES/dna_code.h create mode 100644 src/libecoPCR/libapat/CODES/prot_code.h create mode 100644 src/libecoPCR/libapat/Gmach.h create mode 100644 src/libecoPCR/libapat/Gtypes.h create mode 100644 src/libecoPCR/libapat/apat.h create mode 100644 src/libecoPCR/libapat/apat_parse.c create mode 100644 src/libecoPCR/libapat/apat_search.c create mode 100644 src/libecoPCR/libapat/libstki.c create mode 100644 src/libecoPCR/libapat/libstki.h create mode 100644 src/libecoPCR/libthermo/nnparams.c create mode 100644 src/libecoPCR/libthermo/nnparams.h create mode 100644 src/obi_ecopcr.c create mode 100644 src/obi_ecopcr.h diff --git a/python/obitools3/commands/ecopcr.pyx b/python/obitools3/commands/ecopcr.pyx new file mode 100644 index 0000000..9329d4a --- /dev/null +++ b/python/obitools3/commands/ecopcr.pyx @@ -0,0 +1,188 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms.dms cimport DMS +from obitools3.dms.capi.obidms cimport OBIDMS_p +from obitools3.dms.view import RollbackException +from obitools3.dms.capi.obiecopcr cimport obi_ecopcr +from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption +from obitools3.uri.decode import open_uri +from obitools3.apps.config import logger +from obitools3.utils cimport tobytes +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS + +from libc.stdlib cimport malloc, free +from libc.stdint cimport int32_t + + +__title__="in silico PCR" + + +# TODO: add option to output unique ids +def addOptions(parser): + + addSequenceInputOption(parser) + addMinimalOutputOption(parser) + addTaxonomyInputOption(parser) + + + group = parser.add_argument_group('obi ecopcr specific options') + + group.add_argument('--primer1', '-F', + action="store", dest="ecopcr:primer1", + metavar='', + type=str, + help="Forward primer.") + + group.add_argument('--primer2', '-R', + action="store", dest="ecopcr:primer2", + metavar='', + type=str, + help="Reverse primer.") + + group.add_argument('--error', '-e', + action="store", dest="ecopcr:error", + metavar='', + default=0, + type=int, + help="Maximum number of errors (mismatches) allowed per primer. Default: 0.") + + group.add_argument('--min-length', '-l', + action="store", + dest="ecopcr:min-length", + metavar="", + type=int, + default=0, + help="Minimum length of the in silico amplified DNA fragment, excluding primers.") + + group.add_argument('--max-length', '-L', + action="store", + dest="ecopcr:max-length", + metavar="", + type=int, + default=0, + help="Maximum length of the in silico amplified DNA fragment, excluding primers.") + + group.add_argument('--restrict-to-taxid', '-r', + action="append", + dest="ecopcr:restrict-to-taxid", + metavar="", + type=int, + default=[], + help="Only the sequence records corresponding to the taxonomic group identified " + "by TAXID are considered for the in silico PCR. The TAXID is an integer " + "that can be found in the NCBI taxonomic database.") + + group.add_argument('--ignore-taxid', '-i', + action="append", + dest="ecopcr:ignore-taxid", + metavar="", + type=int, + default=[], + help="The sequences of the taxonomic group identified by TAXID are not considered for the in silico PCR.") + + group.add_argument('--circular', '-c', + action="store_true", + dest="ecopcr:circular", + default=False, + help="Considers that the input sequences are circular (e.g. mitochondrial or chloroplastic DNA).") + + group.add_argument('--salt-concentration', '-a', + action="store", + dest="ecopcr:salt-concentration", + metavar="", + type=float, + default=0.05, + help="Salt concentration used for estimating the Tm. Default: 0.05.") + + group.add_argument('--salt-correction-method', '-m', + action="store", + dest="ecopcr:salt-correction-method", + metavar="<1|2>", + type=int, + default=1, + help="Defines the method used for estimating the Tm (melting temperature) between the primers and their corresponding " + "target sequences. SANTALUCIA: 1, or OWCZARZY: 2. Default: 1.") + + group.add_argument('--keep-nucs', '-D', + action="store", + dest="ecopcr:keep-nucs", + metavar="", + type=int, + default=0, + help="Keeps the specified number of nucleotides on each side of the in silico amplified sequences, " + "(already including the amplified DNA fragment plus the two target sequences of the primers).") + + group.add_argument('--kingdom-mode', '-k', + action="store_true", + dest="ecopcr:kingdom-mode", + default=False, + help="Print in the output the kingdom of the in silico amplified sequences (default: print the superkingdom).") + + + +def run(config): + + cdef int32_t* restrict_to_taxids_p = NULL + cdef int32_t* ignore_taxids_p = NULL + + restrict_to_taxids_len = len(config['ecopcr']['restrict-to-taxid']) + restrict_to_taxids_p = malloc((restrict_to_taxids_len + 1) * sizeof(int32_t)) # +1 for the -1 flagging the end of the array + for i in range(restrict_to_taxids_len) : + restrict_to_taxids_p[i] = config['ecopcr']['restrict-to-taxid'][i] + restrict_to_taxids_p[restrict_to_taxids_len] = -1 + + ignore_taxids_len = len(config['ecopcr']['ignore-taxid']) + ignore_taxids_p = malloc((ignore_taxids_len + 1) * sizeof(int32_t)) # +1 for the -1 flagging the end of the array + for i in range(ignore_taxids_len) : + ignore_taxids_p[i] = config['ecopcr']['ignore-taxid'][i] + ignore_taxids_p[ignore_taxids_len] = -1 + + DMS.obi_atexit() + + logger("info", "obi ecopcr") + + # TODO Bad URI reading because current one is not adapted + + # Get input DMS path + i_dms_name = config['obi']['inputURI'].split('/')[0] + + # Read the name of the input view + i_uri = config['obi']['inputURI'].split('/') + i_view_name = i_uri[1] + + # Read the name of the output view + o_uri = config['obi']['outputURI'].split('/') + if len(o_uri)==2: + # Get output DMS path + o_dms_name = o_uri[0] + o_view_name = o_uri[1] + else: + o_dms_name = i_dms_name + o_view_name = o_uri[0] + + o_dms = open_uri(o_dms_name, input=False)[0] + + # Read taxonomy name + taxonomy_name = config['obi']['taxoURI'].split('/')[2] + + # TODO: input DMS, taxonomy and primers in comments + + if obi_ecopcr(tobytes(i_dms_name), tobytes(i_view_name), tobytes(taxonomy_name), \ + tobytes(o_dms_name), tobytes(o_view_name), b"ecopcr", \ + tobytes(config['ecopcr']['primer1']), tobytes(config['ecopcr']['primer2']), \ + config['ecopcr']['error'], \ + config['ecopcr']['min-length'], config['ecopcr']['max-length'], \ + restrict_to_taxids_p, ignore_taxids_p, \ + config['ecopcr']['circular'], config['ecopcr']['salt-concentration'], config['ecopcr']['salt-correction-method'], \ + config['ecopcr']['keep-nucs'], config['ecopcr']['kingdom-mode']) < 0: + raise Exception("Error running ecopcr") + + free(restrict_to_taxids_p) + free(ignore_taxids_p) + + print("\n") + print(repr(o_dms[o_view_name])) + + o_dms.close() + diff --git a/python/obitools3/dms/capi/obiecopcr.pxd b/python/obitools3/dms/capi/obiecopcr.pxd new file mode 100644 index 0000000..54ec4aa --- /dev/null +++ b/python/obitools3/dms/capi/obiecopcr.pxd @@ -0,0 +1,28 @@ +#cython: language_level=3 + +from obitools3.dms.capi.obidms cimport OBIDMS_p +from libc.stdint cimport int32_t + + +cdef extern from "obi_ecopcr.h" nogil: + + int obi_ecopcr(const char* input_dms_name, + const char* i_view_name, + const char* taxonomy_name, + const char* output_dms_name, + const char* o_view_name, + const char* o_view_comments, + const char* primer1, + const char* primer2, + int error_max, + int min_len, + int max_len, + int32_t* restrict_to_taxids, + int32_t* ignore_taxids, + int circular, + double salt_concentration, + int salt_correction_method, + int keep_nucleotides, + bint kingdom_mode) + + diff --git a/python/obitools3/dms/dms.cfiles b/python/obitools3/dms/dms.cfiles index 2d0b4d8..04a31d7 100644 --- a/python/obitools3/dms/dms.cfiles +++ b/python/obitools3/dms/dms.cfiles @@ -9,6 +9,15 @@ ../../../src/murmurhash2.c ../../../src/obi_align.c ../../../src/obi_clean.c +../../../src/obi_ecopcr.c +../../../src/libecoPCR/libthermo/nnparams.c +../../../src/libecoPCR/libapat/apat_parse.c +../../../src/libecoPCR/libapat/apat_search.c +../../../src/libecoPCR/libapat/libstki.c +../../../src/libecoPCR/ecoapat.c +../../../src/libecoPCR/ecodna.c +../../../src/libecoPCR/ecoError.c +../../../src/libecoPCR/ecoMalloc.c ../../../src/obiavl.c ../../../src/obiblob_indexer.c ../../../src/obiblob.c diff --git a/python/obitools3/utils.cfiles b/python/obitools3/utils.cfiles index 2a5e3b9..1e6c323 100644 --- a/python/obitools3/utils.cfiles +++ b/python/obitools3/utils.cfiles @@ -9,6 +9,15 @@ ../../src/murmurhash2.c ../../src/obi_align.c ../../src/obi_clean.c +../../src/obi_ecopcr.c +../../src/libecoPCR/libthermo/nnparams.c +../../src/libecoPCR/libapat/apat_parse.c +../../src/libecoPCR/libapat/apat_search.c +../../src/libecoPCR/libapat/libstki.c +../../src/libecoPCR/ecoapat.c +../../src/libecoPCR/ecodna.c +../../src/libecoPCR/ecoError.c +../../src/libecoPCR/ecoMalloc.c ../../src/obiavl.c ../../src/obiblob_indexer.c ../../src/obiblob.c diff --git a/src/libecoPCR/ecoError.c b/src/libecoPCR/ecoError.c new file mode 100644 index 0000000..00bbfa2 --- /dev/null +++ b/src/libecoPCR/ecoError.c @@ -0,0 +1,26 @@ +#include "ecoPCR.h" +#include +#include + +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ +void ecoError(int32_t error, + const char* message, + const char * filename, + int linenumber) +{ + fprintf(stderr,"Error %d in file %s line %d : %s\n", + error, + filename, + linenumber, + message); + + abort(); +} diff --git a/src/libecoPCR/ecoMalloc.c b/src/libecoPCR/ecoMalloc.c new file mode 100644 index 0000000..0ea8d3b --- /dev/null +++ b/src/libecoPCR/ecoMalloc.c @@ -0,0 +1,79 @@ +#include "ecoPCR.h" +#include + +static int eco_log_malloc = 0; + +void eco_trace_memory_allocation() +{ + eco_log_malloc=1; +} + +void eco_untrace_memory_allocation() +{ + eco_log_malloc=0; +} + + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line) +{ + void * chunk; + + chunk = calloc(1,chunksize); + + if (!chunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment located at %p of size %d is allocated (file : %s [%d])", + chunk, + chunksize, + filename, + line); + + return chunk; +} + +void *eco_realloc(void *chunk, + int32_t newsize, + const char *error_message, + const char *filename, + int32_t line) +{ + void *newchunk; + + newchunk = realloc(chunk,newsize); + + if (!newchunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + fprintf(stderr, + "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", + chunk, + newchunk, + newsize, + filename, + line); + + return newchunk; +} + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line) +{ + free(chunk); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment %p is released => %s (file : %s [%d])", + chunk, + error_message, + filename, + line); +} diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h new file mode 100644 index 0000000..d4cd399 --- /dev/null +++ b/src/libecoPCR/ecoPCR.h @@ -0,0 +1,128 @@ +#ifndef ECOPCR_H_ +#define ECOPCR_H_ + +#include +#include + +#include "../obidmscolumn.h" +#include "../obiview.h" +#include "../obitypes.h" + +#ifndef H_apat +#include "./libapat/apat.h" +#endif + +/***************************************************** + * + * Data type declarations + * + *****************************************************/ + +/* + * + * Sequence types + * + */ + +typedef struct { + + int32_t taxid; + char AC[20]; + int32_t DE_length; + int32_t SQ_length; + int32_t CSQ_length; + + char data[1]; + +} ecoseqformat_t; + +typedef struct { + int32_t taxid; + int32_t SQ_length; + char *AC; + char *DE; + char *SQ; +} ecoseq_t; + + +/***************************************************** + * + * Function declarations + * + *****************************************************/ + +/* + * + * Low level system functions + * + */ + +int32_t is_big_endian(); +int32_t swap_int32_t(int32_t); + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + + +void *eco_realloc(void *chunk, + int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line); + +void eco_trace_memory_allocation(); +void eco_untrace_memory_allocation(); + +#define ECOMALLOC(size,error_message) \ + eco_malloc((size),(error_message),__FILE__,__LINE__) + +#define ECOREALLOC(chunk,size,error_message) \ + eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__) + +#define ECOFREE(chunk,error_message) \ + eco_free((chunk),(error_message),__FILE__,__LINE__) + + +/* + * + * Error management + * + */ + + +void ecoError(int32_t,const char*,const char *,int); + +#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__) + +#define ECO_IO_ERROR (1) +#define ECO_MEM_ERROR (2) +#define ECO_ASSERT_ERROR (3) +#define ECO_NOTFOUND_ERROR (4) + + +/* + * + * Low level Disk access functions + * + */ + + +int32_t delete_apatseq(SeqPtr pseq); +PatternPtr buildPattern(const char *pat, int32_t error_max); +PatternPtr complementPattern(PatternPtr pat); + +SeqPtr ecoseq2apatseq(char* sequence, SeqPtr out, int32_t circular); + +char *ecoComplementPattern(char *nucAcSeq); +char *ecoComplementSequence(char *nucAcSeq); +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end); + + +#endif /*ECOPCR_H_*/ diff --git a/src/libecoPCR/ecoapat.c b/src/libecoPCR/ecoapat.c new file mode 100644 index 0000000..8f6c0aa --- /dev/null +++ b/src/libecoPCR/ecoapat.c @@ -0,0 +1,205 @@ +#include "./libapat/libstki.h" +#include "./libapat/apat.h" + +#include "ecoPCR.h" + +#include + +#include "../obidmscolumn.h" +#include "../obiview.h" +#include "../obitypes.h" + + +static void EncodeSequence(SeqPtr seq); +static void UpperSequence(char *seq); + +/* -------------------------------------------- */ +/* uppercase sequence */ +/* -------------------------------------------- */ + +#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z')) +#define TO_UPPER(c) ((c) - 'a' + 'A') + +void UpperSequence(char *seq) +{ + char *cseq; + + for (cseq = seq ; *cseq ; cseq++) + if (IS_LOWER(*cseq)) + *cseq = TO_UPPER(*cseq); +} + +#undef IS_LOWER +#undef TO_UPPER + + + +/* -------------------------------------------- */ +/* encode sequence */ +/* IS_UPPER is slightly faster than isupper */ +/* -------------------------------------------- */ + +#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) + + + +void EncodeSequence(SeqPtr seq) +{ + int i; + UInt8 *data; + char *cseq; + + data = seq->data; + cseq = seq->cseq; + + while (*cseq) { + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + data++; + cseq++; + } + + for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++) + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + + for (i = 0 ; i < MAX_PATTERN ; i++) + seq->hitpos[i]->top = seq->hiterr[i]->top = 0; +} + +#undef IS_UPPER + + +SeqPtr ecoseq2apatseq(char* sequence, SeqPtr out, int32_t circular) +{ + int i; + int32_t seq_len; + + if (!out) + { + out = ECOMALLOC(sizeof(Seq), + "Error in Allocation of a new Seq structure"); + + for (i = 0 ; i < MAX_PATTERN ; i++) + { + + if (! (out->hitpos[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation"); + + if (! (out->hiterr[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation"); + } + } + + seq_len = strlen(sequence); + + out->seqsiz = out->seqlen = seq_len; + out->circular = circular; + + if (!out->data) + { + out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8), + "Error in Allocation of a new Seq data member"); + out->datsiz= out->seqlen+circular; + } + else if ((out->seqlen +circular) >= out->datsiz) + { + out->data = ECOREALLOC(out->data,(out->seqlen+circular), + "Error during Seq data buffer realloc"); + out->datsiz= out->seqlen+circular; + } + + UpperSequence(sequence); // ecoPCR only works on uppercase + out->cseq = sequence; + + EncodeSequence(out); + + return out; +} + + +int32_t delete_apatseq(SeqPtr pseq) +{ + int i; + + if (pseq) { + + if (pseq->data) + ECOFREE(pseq->data,"Freeing sequence data buffer"); + + for (i = 0 ; i < MAX_PATTERN ; i++) { + if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]); + if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]); + } + + ECOFREE(pseq,"Freeing apat sequence structure"); + + return 0; + } + + return 1; +} + +PatternPtr buildPattern(const char *pat, int32_t error_max) +{ + PatternPtr pattern; + int32_t patlen; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= Faux; + pattern->maxerr = error_max; + patlen = strlen(pat); + + pattern->cpat = ECOMALLOC(sizeof(char)*patlen+1, + "Error in sequence pattern allocation"); + + strncpy(pattern->cpat,pat,patlen); + pattern->cpat[patlen]=0; + UpperSequence(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} + +PatternPtr complementPattern(PatternPtr pat) +{ + PatternPtr pattern; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= pat->hasIndel; + pattern->maxerr = pat->maxerr; + pattern->patlen = pat->patlen; + + pattern->cpat = ECOMALLOC(sizeof(char)*(strlen(pat->cpat)+1), + "Error in sequence pattern allocation"); + + + strcpy(pattern->cpat,pat->cpat); + + ecoComplementPattern(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} diff --git a/src/libecoPCR/ecodna.c b/src/libecoPCR/ecodna.c new file mode 100644 index 0000000..86d2012 --- /dev/null +++ b/src/libecoPCR/ecodna.c @@ -0,0 +1,156 @@ +#include +#include "ecoPCR.h" + +/* + * @doc: DNA alphabet (IUPAC) + */ +#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" + +/* + * @doc: complementary DNA alphabet (IUPAC) + */ +#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" + + +static char sNuc[] = LX_BIO_DNA_ALPHA; +static char sAnuc[] = LX_BIO_CDNA_ALPHA; + +static char LXBioBaseComplement(char nucAc); +static char *LXBioSeqComplement(char *nucAcSeq); +static char *reverseSequence(char *str,char isPattern); + + +/* ---------------------------- */ + +char LXBioBaseComplement(char nucAc) +{ + char *c; + + if ((c = strchr(sNuc, nucAc))) + return sAnuc[(c - sNuc)]; + else + return nucAc; +} + +/* ---------------------------- */ + +char *LXBioSeqComplement(char *nucAcSeq) +{ + char *s; + + for (s = nucAcSeq ; *s ; s++) + *s = LXBioBaseComplement(*s); + + return nucAcSeq; +} + + +char *reverseSequence(char *str,char isPattern) +{ + char *sb, *se, c; + + if (! str) + return str; + + sb = str; + se = str + strlen(str) - 1; + + while(sb <= se) { + c = *sb; + *sb++ = *se; + *se-- = c; + } + + sb = str; + se = str + strlen(str) - 1; + + if (isPattern) + for (;sb < se; sb++) + { + if (*sb=='#') + { + if (((se - sb) > 2) && (*(sb+2)=='!')) + { + *sb='!'; + sb+=2; + *sb='#'; + } + else + { + *sb=*(sb+1); + sb++; + *sb='#'; + } + } + else if (*sb=='!') + { + *sb=*(sb-1); + *(sb-1)='!'; + } + } + + return str; +} + +char *ecoComplementPattern(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),1); +} + +char *ecoComplementSequence(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),0); +} + + +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end) +/* + extract subsequence from nucAcSeq [begin,end[ +*/ +{ + static char *buffer = NULL; + static int32_t buffSize= 0; + int32_t length; + + if (begin < end) + { + length = end - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + + strncpy(buffer,nucAcSeq + begin,length); + buffer[length]=0; + } + else + { + length = end + strlen(nucAcSeq) - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + strncpy(buffer,nucAcSeq+begin,length - end); + strncpy(buffer+(length-end),nucAcSeq ,end); + buffer[length]=0; + } + + return buffer; +} + diff --git a/src/libecoPCR/libapat/CODES/dft_code.h b/src/libecoPCR/libapat/CODES/dft_code.h new file mode 100644 index 0000000..b9caf28 --- /dev/null +++ b/src/libecoPCR/libapat/CODES/dft_code.h @@ -0,0 +1,14 @@ +/* ----------------------------------------------- */ +/* dft_pat_seq_code.h */ +/* default alphabet encoding for alpha */ +/* ----------------------------------------------- */ + + 0x00000001 /* A */, 0x00000002 /* B */, 0x00000004 /* C */, + 0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */, + 0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */, + 0x00000200 /* J */, 0x00000400 /* K */, 0x00000800 /* L */, + 0x00001000 /* M */, 0x00002000 /* N */, 0x00004000 /* O */, + 0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */, + 0x00040000 /* S */, 0x00080000 /* T */, 0x00100000 /* U */, + 0x00200000 /* V */, 0x00400000 /* W */, 0x00800000 /* X */, + 0x01000000 /* Y */, 0x02000000 /* Z */ diff --git a/src/libecoPCR/libapat/CODES/dna_code.h b/src/libecoPCR/libapat/CODES/dna_code.h new file mode 100644 index 0000000..0febf41 --- /dev/null +++ b/src/libecoPCR/libapat/CODES/dna_code.h @@ -0,0 +1,71 @@ +/* ----------------------------------------------- */ +/* dna_code.h */ +/* alphabet encoding for dna/rna */ +/* ----------------------------------------- */ +/* IUPAC encoding */ +/* ----------------------------------------- */ +/* G/A/T/C */ +/* U=T */ +/* R=AG */ +/* Y=CT */ +/* M=AC */ +/* K=GT */ +/* S=CG */ +/* W=AT */ +/* H=ACT */ +/* B=CGT */ +/* V=ACG */ +/* D=AGT */ +/* N=ACGT */ +/* X=ACGT */ +/* EFIJLOPQZ not recognized */ +/* ----------------------------------------- */ +/* dual encoding */ +/* ----------------------------------------- */ +/* A=ADHMNRVW */ +/* B=BCDGHKMNRSTUVWY */ +/* C=BCHMNSVY */ +/* D=ABDGHKMNRSTUVWY */ +/* G=BDGKNRSV */ +/* H=ABCDHKMNRSTUVWY */ +/* K=BDGHKNRSTUVWY */ +/* M=ABCDHMNRSVWY */ +/* N=ABCDGHKMNRSTUVWY */ +/* R=ABDGHKMNRSVW */ +/* S=BCDGHKMNRSVY */ +/* T=BDHKNTUWY */ +/* U=BDHKNTUWY */ +/* V=ABCDGHKMNRSVWY */ +/* W=ABDHKMNRTUVWY */ +/* X=ABCDGHKMNRSTUVWY */ +/* Y=BCDHKMNSTUVWY */ +/* EFIJLOPQZ not recognized */ +/* ----------------------------------------------- */ + +#ifndef USE_DUAL + + /* IUPAC */ + + 0x00000001 /* A */, 0x00080044 /* B */, 0x00000004 /* C */, + 0x00080041 /* D */, 0x00000000 /* E */, 0x00000000 /* F */, + 0x00000040 /* G */, 0x00080005 /* H */, 0x00000000 /* I */, + 0x00000000 /* J */, 0x00080040 /* K */, 0x00000000 /* L */, + 0x00000005 /* M */, 0x00080045 /* N */, 0x00000000 /* O */, + 0x00000000 /* P */, 0x00000000 /* Q */, 0x00000041 /* R */, + 0x00000044 /* S */, 0x00080000 /* T */, 0x00080000 /* U */, + 0x00000045 /* V */, 0x00080001 /* W */, 0x00080045 /* X */, + 0x00080004 /* Y */, 0x00000000 /* Z */ + +#else + /* DUAL */ + + 0x00623089 /* A */, 0x017e34ce /* B */, 0x01243086 /* C */, + 0x017e34cb /* D */, 0x00000000 /* E */, 0x00000000 /* F */, + 0x0026244a /* G */, 0x017e348f /* H */, 0x00000000 /* I */, + 0x00000000 /* J */, 0x017e24ca /* K */, 0x00000000 /* L */, + 0x0166308f /* M */, 0x017e34cf /* N */, 0x00000000 /* O */, + 0x00000000 /* P */, 0x00000000 /* Q */, 0x006634cb /* R */, + 0x012634ce /* S */, 0x0158248a /* T */, 0x0158248a /* U */, + 0x016634cf /* V */, 0x017a348b /* W */, 0x017e34cf /* X */, + 0x017c348e /* Y */, 0x00000000 /* Z */ +#endif diff --git a/src/libecoPCR/libapat/CODES/prot_code.h b/src/libecoPCR/libapat/CODES/prot_code.h new file mode 100644 index 0000000..edcdfc1 --- /dev/null +++ b/src/libecoPCR/libapat/CODES/prot_code.h @@ -0,0 +1,51 @@ +/* ----------------------------------------------- */ +/* prot_code.h */ +/* alphabet encoding for proteins */ +/* ----------------------------------------- */ +/* IUPAC encoding */ +/* ----------------------------------------- */ +/* B=DN */ +/* Z=EQ */ +/* X=any - {X} */ +/* JOU not recognized */ +/* ----------------------------------------- */ +/* dual encoding */ +/* ----------------------------------------- */ +/* B=BDN */ +/* D=BD */ +/* E=EZ */ +/* N=BN */ +/* Q=QZ */ +/* X=any - {X} */ +/* Z=EQZ */ +/* JOU not recognized */ +/* ----------------------------------------------- */ + +#ifndef USE_DUAL + + /* IUPAC */ + + 0x00000001 /* A */, 0x00002008 /* B */, 0x00000004 /* C */, + 0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */, + 0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */, + 0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */, + 0x00001000 /* M */, 0x00002000 /* N */, 0x00000000 /* O */, + 0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */, + 0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */, + 0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */, + 0x01000000 /* Y */, 0x00010010 /* Z */ + +#else + /* DUAL */ + + 0x00000001 /* A */, 0x0000200a /* B */, 0x00000004 /* C */, + 0x0000000a /* D */, 0x02000010 /* E */, 0x00000020 /* F */, + 0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */, + 0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */, + 0x00001000 /* M */, 0x00002002 /* N */, 0x00000000 /* O */, + 0x00008000 /* P */, 0x02010000 /* Q */, 0x00020000 /* R */, + 0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */, + 0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */, + 0x01000000 /* Y */, 0x02010010 /* Z */ + +#endif diff --git a/src/libecoPCR/libapat/Gmach.h b/src/libecoPCR/libapat/Gmach.h new file mode 100644 index 0000000..8fb1c69 --- /dev/null +++ b/src/libecoPCR/libapat/Gmach.h @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------- */ +/* Copyright (c) Atelier de BioInformatique */ +/* @file: Gmach.h */ +/* @desc: machine dependant setup */ +/* @+ *should* be included in all ABI softs */ +/* */ +/* @history: */ +/* @+ : Jul 95 : MWC first draft */ +/* @+ : Jan 96 : adapted to Pwg */ +/* @+ : Nov 00 : adapted to Mac_OS_X */ +/* ---------------------------------------------------------------- */ + +#ifndef _H_Gmach + + /* OS names */ + +#define _H_Gmach + + /* Macintosh Classic */ + /* Think C environment */ +#ifdef THINK_C +#define MACINTOSH +#define MAC_OS_C +#endif + + + /* Macintosh Classic */ + /* Code-Warrior */ +#ifdef __MWERKS__ +#define MACINTOSH +#define MAC_OS_C +#endif + + /* Macintosh OS-X */ +#ifdef MAC_OS_X +#define MACINTOSH +#define UNIX +#define UNIX_BSD +#endif + + /* LINUX */ +#ifdef LINUX +#define UNIX +#define UNIX_BSD +#endif + + /* other Unix Boxes */ + /* SunOS / Solaris */ +#ifdef SUN +#define UNIX +#ifdef SOLARIS +#define UNIX_S7 +#else +#define UNIX_BSD +#endif +#endif + + /* SGI Irix */ +#ifdef SGI +#define UNIX +#define UNIX_S7 +#endif + +/* ansi setup */ +/* for unix machines see makefile */ + +#ifndef PROTO +#define PROTO 1 +#endif + +#ifndef ANSI_PROTO +#define ANSI_PROTO PROTO +#endif + +#ifndef ANSI_STR +#define ANSI_STR 1 +#endif + +/* unistd.h header file */ + +#ifdef UNIX +#define HAS_UNISTD_H +#endif + +/* getopt.h header file */ + +#ifdef MAC_OS_C +#define HAS_GETOPT_H "getopt.h" +#endif + +#ifdef SGI +#define HAS_GETOPT_H +#endif + + + +#endif diff --git a/src/libecoPCR/libapat/Gtypes.h b/src/libecoPCR/libapat/Gtypes.h new file mode 100644 index 0000000..7460014 --- /dev/null +++ b/src/libecoPCR/libapat/Gtypes.h @@ -0,0 +1,70 @@ +/* ---------------------------------------------------------------- */ +/* Copyright (c) Atelier de BioInformatique */ +/* @file: Gtypes.h */ +/* @desc: general & machine dependant types */ +/* @+ *should* be included in all ABI softs */ +/* */ +/* @history: */ +/* @+ : Jan 91 : MWC first draft */ +/* @+ : Jul 95 : Gmach addition */ +/* ---------------------------------------------------------------- */ + +#define _H_Gtypes + +#ifndef _H_Gmach +#include "Gmach.h" +#endif + +#ifndef NULL +#include /* is the official NULL here ? */ +#endif + +/* ==================================================== */ +/* constantes */ +/* ==================================================== */ + +#ifndef PROTO +#define PROTO 1 /* prototypes flag */ +#endif + + +#define Vrai 0x1 /* bool values = TRUE */ +#define Faux 0x0 /* = FALSE */ + +#define Nil NULL /* nil pointer */ + +#define kBigInt16 0x7fff /* plus grand 16 bits signe */ +#define kBigInt32 0x7fffffff /* plus grand 32 bits signe */ +#define kBigUInt16 0xffff /* plus grand 16 bits ~signe */ +#define kBigUInt32 0xffffffff /* plus grand 32 bits ~signe */ + + +/* ==================================================== */ +/* Types (for Sun & Iris - 32 bits machines) */ +/* ==================================================== */ + + /* --- specific sizes --------- */ +typedef int Int32; /* Int32 = 32 bits signe */ +typedef unsigned int UInt32; /* UInt32 = 32 bits ~signe */ +typedef short Int16; /* Int16 = 16 bits signe */ +typedef unsigned short UInt16; /* UInt32 = 16 bits ~signe */ +typedef char Int8; /* Int8 = 8 bits signe */ +typedef unsigned char UInt8; /* UInt8 = 8 bits ~signe */ + + /* --- default types ---------- */ + +typedef int Int; /* 'natural' int (>= 32 bits) */ + +typedef void *Ptr; /* pointeur */ + + + +/* ==================================================== */ +/* special macro for prototypes */ +/* ==================================================== */ + +#if PROTO +#define P(s) s +#else +#define P(s) () +#endif diff --git a/src/libecoPCR/libapat/apat.h b/src/libecoPCR/libapat/apat.h new file mode 100644 index 0000000..64ce679 --- /dev/null +++ b/src/libecoPCR/libapat/apat.h @@ -0,0 +1,176 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat.h */ +/* Purpose: pattern scan */ +/* History: */ +/* 28/12/94 : ascan first version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + + +#include + +#ifndef _H_Gtypes +#include "Gtypes.h" +#endif + +#ifndef _H_libstki +#include "libstki.h" +#endif + +#define H_apat + +/* ----------------------------------------------- */ +/* constantes */ +/* ----------------------------------------------- */ + +#ifndef BUFSIZ +#define BUFSIZ 1024 /* io buffer size */ +#endif + +#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */ + +#define ALPHA_LEN 26 /* alphabet length */ + /* *DO NOT* modify */ + +#define MAX_PATTERN 4 /* max # of patterns */ + /* *DO NOT* modify */ + +#define MAX_PAT_LEN 32 /* max pattern length */ + /* *DO NOT* modify */ + +#define MAX_PAT_ERR 32 /* max # of errors */ + /* *DO NOT* modify */ + +#define PATMASK 0x3ffffff /* mask for 26 symbols */ + /* *DO NOT* modify */ + +#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. pos */ + /* *DO NOT* modify */ + + /* mask for position */ +#define ONEMASK 0x80000000 /* mask for highest position */ + + /* masks for Levenhstein edit */ +#define OPER_IDT 0x00000000 /* identity */ +#define OPER_INS 0x40000000 /* insertion */ +#define OPER_DEL 0x80000000 /* deletion */ +#define OPER_SUB 0xc0000000 /* substitution */ + +#define OPER_SHFT 30 /* shift */ + + /* Levenhstein Opcodes */ +#define SOPER_IDT 0x0 /* identity */ +#define SOPER_INS 0x1 /* insertion */ +#define SOPER_DEL 0x2 /* deletion */ +#define SOPER_SUB 0x3 /* substitution */ + + /* Levenhstein Opcodes masks */ +#define OPERMASK 0xc0000000 /* mask for Opcodes */ +#define NOPERMASK 0x3fffffff /* negate of previous */ + + /* special chars in pattern */ +#define PATCHARS "[]!#" + + /* 26 letter alphabet */ + /* in alphabetical order */ + +//#define ORD_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + + /* protein alphabet */ + +//#define PROT_ALPHA "ACDEFGHIKLMNPQRSTVWY" + + /* dna/rna alphabet */ + +//#define DNA_ALPHA "ABCDGHKMNRSTUVWXY" + + +/* ----------------------------------------------- */ +/* data structures */ +/* ----------------------------------------------- */ + + /* -------------------- */ +typedef enum { /* data encoding */ + /* -------------------- */ + alpha = 0, /* [A-Z] */ + dna, /* IUPAC DNA */ + protein /* IUPAC proteins */ +} CodType; + + /* -------------------- */ +typedef struct { /* sequence */ + /* -------------------- */ + char *name; /* sequence name */ + Int32 seqlen; /* sequence length */ + Int32 seqsiz; /* sequence buffer size */ + Int32 datsiz; /* data buffer size */ + Int32 circular; + UInt8 *data; /* data buffer */ + char *cseq; /* sequence buffer */ + StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */ + StackiPtr hiterr[MAX_PATTERN]; /* stack of errors */ +} Seq, *SeqPtr; + + /* -------------------- */ +typedef struct { /* pattern */ + /* -------------------- */ + int patlen; /* pattern length */ + int maxerr; /* max # of errors */ + char *cpat; /* pattern string */ + Int32 *patcode; /* encoded pattern */ + UInt32 *smat; /* S matrix */ + UInt32 omask; /* oblig. bits mask */ + bool hasIndel; /* are indels allowed */ + bool ok; /* is pattern ok */ +} Pattern, *PatternPtr; + +/* ----------------------------------------------- */ +/* macros */ +/* ----------------------------------------------- */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ)) +#define FREE(ptr) free((void *) ptr) +#endif + +/* ----------------------------------------------- */ +/* prototypes */ +/* ----------------------------------------------- */ + + /* apat_seq.c */ + +SeqPtr FreeSequence (SeqPtr pseq); +SeqPtr NewSequence (void); +int ReadNextSequence (SeqPtr pseq); +int WriteSequence (FILE *filou , SeqPtr pseq); + + /* apat_parse.c */ + +Int32 *GetCode (CodType ctype); +int CheckPattern (Pattern *ppat); +int EncodePattern (Pattern *ppat, CodType ctype); +int ReadPattern (Pattern *ppat); +void PrintDebugPattern (Pattern *ppat); + + /* apat_search.c */ + +int CreateS (Pattern *ppat, Int32 lalpha); +Int32 ManberNoErr (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 ManberSub (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 ManberIndel (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 ManberAll (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 NwsPatAlign (Seq *pseq , Pattern *ppat, Int32 nerr , + Int32 *reslen , Int32 *reserr); + + /* apat_sys.c */ + +float UserCpuTime (int reset); +float SysCpuTime (int reset); +char *StrCpuTime (int reset); +void Erreur (char *msg , int stat); +int AccessFile (char *path, char *mode); + diff --git a/src/libecoPCR/libapat/apat_parse.c b/src/libecoPCR/libapat/apat_parse.c new file mode 100644 index 0000000..43cda48 --- /dev/null +++ b/src/libecoPCR/libapat/apat_parse.c @@ -0,0 +1,369 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: apat_parse.c */ +/* Purpose: Codage du pattern */ +/* History: */ +/* 00/07/94 : first version (stanford) */ +/* 00/11/94 : revised for DNA/PROTEIN */ +/* 30/12/94 : modified EncodePattern */ +/* for manber search */ +/* 14/05/99 : indels added */ +/* ==================================================== */ + +#include +#include +#include +#include + +#include "Gtypes.h" +#include "apat.h" + /* -------------------- */ + /* default char */ + /* encodings */ + /* -------------------- */ + +static Int32 sDftCode[] = { + +#include "CODES/dft_code.h" + +}; + /* -------------------- */ + /* char encodings */ + /* IUPAC */ + /* -------------------- */ + + /* IUPAC Proteins */ +static Int32 sProtCode[] = { + +#include "CODES/prot_code.h" + +}; + /* IUPAC Dna/Rna */ +static Int32 sDnaCode[] = { + +#include "CODES/dna_code.h" + +}; + + +/* -------------------------------------------- */ +/* internal replacement of gets */ +/* -------------------------------------------- */ +static char *sGets(char *buffer, int size) { + + char *ebuf; + + if (! fgets(buffer, size-1, stdin)) + return NULL; + + /* remove trailing line feed */ + + ebuf = buffer + strlen(buffer); + + while (--ebuf >= buffer) { + if ((*ebuf == '\n') || (*ebuf == '\r')) + *ebuf = '\000'; + else + break; + } + + return buffer; +} + +/* -------------------------------------------- */ +/* returns actual code associated to type */ +/* -------------------------------------------- */ + +Int32 *GetCode(CodType ctype) +{ + Int32 *code = sDftCode; + + switch (ctype) { + case dna : code = sDnaCode ; break; + case protein : code = sProtCode ; break; + default : code = sDftCode ; break; + } + + return code; +} + +/* -------------------------------------------- */ + +#define BAD_IF(tst) if (tst) return 0 + +int CheckPattern(Pattern *ppat) +{ + int lev; + char *pat; + + pat = ppat->cpat; + + BAD_IF (*pat == '#'); + + for (lev = 0; *pat ; pat++) + + switch (*pat) { + + case '[' : + BAD_IF (lev); + BAD_IF (*(pat+1) == ']'); + lev++; + break; + + case ']' : + lev--; + BAD_IF (lev); + break; + + case '!' : + BAD_IF (lev); + BAD_IF (! *(pat+1)); + BAD_IF (*(pat+1) == ']'); + break; + + case '#' : + BAD_IF (lev); + BAD_IF (*(pat-1) == '['); + break; + + default : + if (! isupper(*pat)) + return 0; + break; + } + + return (lev ? 0 : 1); +} + +#undef BAD_IF + + +/* -------------------------------------------- */ +static char *skipOblig(char *pat) +{ + return (*(pat+1) == '#' ? pat+1 : pat); +} + +/* -------------------------------------------- */ +static char *splitPattern(char *pat) +{ + switch (*pat) { + + case '[' : + for (; *pat; pat++) + if (*pat == ']') + return skipOblig(pat); + return NULL; + break; + + case '!' : + return splitPattern(pat+1); + break; + + } + + return skipOblig(pat); +} + +/* -------------------------------------------- */ +static Int32 valPattern(char *pat, Int32 *code) +{ + Int32 val; + + switch (*pat) { + + case '[' : + return valPattern(pat+1, code); + break; + + case '!' : + val = valPattern(pat+1, code); + return (~val & PATMASK); + break; + + default : + val = 0x0; + while (isupper(*pat)) { + val |= code[*pat - 'A']; + pat++; + } + return val; + } + + return 0x0; +} + +/* -------------------------------------------- */ +static Int32 obliBitPattern(char *pat) +{ + return (*(pat + strlen(pat) - 1) == '#' ? OBLIBIT : 0x0); +} + + +/* -------------------------------------------- */ +static int lenPattern(char *pat) +{ + int lpat; + + lpat = 0; + + while (*pat) { + + if (! (pat = splitPattern(pat))) + return 0; + + pat++; + + lpat++; + } + + return lpat; +} + +/* -------------------------------------------- */ +/* Interface */ +/* -------------------------------------------- */ + +/* -------------------------------------------- */ +/* encode un pattern */ +/* -------------------------------------------- */ +int EncodePattern(Pattern *ppat, CodType ctype) +{ + int pos, lpat; + Int32 *code; + char *pp, *pa, c; + + ppat->ok = Faux; + + code = GetCode(ctype); + + ppat->patlen = lpat = lenPattern(ppat->cpat); + + if (lpat <= 0) + return 0; + + if (! (ppat->patcode = NEWN(Int32, lpat))) + return 0; + + pa = pp = ppat->cpat; + + pos = 0; + + while (*pa) { + + pp = splitPattern(pa); + + c = *++pp; + + *pp = '\000'; + + ppat->patcode[pos++] = valPattern(pa, code) | obliBitPattern(pa); + + *pp = c; + + pa = pp; + } + + ppat->ok = Vrai; + + return lpat; +} + +/* -------------------------------------------- */ +/* remove blanks */ +/* -------------------------------------------- */ +static char *RemBlanks(char *s) +{ + char *sb, *sc; + + for (sb = sc = s ; *sb ; sb++) + if (! isspace(*sb)) + *sc++ = *sb; + + return s; +} + +/* -------------------------------------------- */ +/* count non blanks */ +/* -------------------------------------------- */ +static Int32 CountAlpha(char *s) +{ + Int32 n; + + for (n = 0 ; *s ; s++) + if (! isspace(*s)) + n++; + + return n; +} + + +/* -------------------------------------------- */ +/* lit un pattern */ +/* #mis */ +/* ligne starting with '/' are comments */ +/* -------------------------------------------- */ +int ReadPattern(Pattern *ppat) +{ + int val; + char *spac; + char buffer[BUFSIZ]; + + ppat->ok = Vrai; + + if (! sGets(buffer, sizeof(buffer))) + return 0; + + if (*buffer == '/') + return ReadPattern(ppat); + + if (! CountAlpha(buffer)) + return ReadPattern(ppat); + + for (spac = buffer ; *spac ; spac++) + if ((*spac == ' ') || (*spac == '\t')) + break; + + ppat->ok = Faux; + + if (! *spac) + return 0; + + if (sscanf(spac, "%d", &val) != 1) + return 0; + + ppat->hasIndel = (val < 0); + + ppat->maxerr = ((val >= 0) ? val : -val); + + *spac = '\000'; + + (void) RemBlanks(buffer); + + if ((ppat->cpat = NEWN(char, strlen(buffer)+1))) + strcpy(ppat->cpat, buffer); + + ppat->ok = (ppat->cpat != NULL); + + return (ppat->ok ? 1 : 0); +} + +/* -------------------------------------------- */ +/* ecrit un pattern - Debug - */ +/* -------------------------------------------- */ +void PrintDebugPattern(Pattern *ppat) +{ + int i; + + printf("Pattern : %s\n", ppat->cpat); + printf("Encoding : \n\t"); + + for (i = 0 ; i < ppat->patlen ; i++) { + printf("0x%8.8x ", ppat->patcode[i]); + if (i%4 == 3) + printf("\n\t"); + } + printf("\n"); +} + diff --git a/src/libecoPCR/libapat/apat_search.c b/src/libecoPCR/libapat/apat_search.c new file mode 100644 index 0000000..f0dd394 --- /dev/null +++ b/src/libecoPCR/libapat/apat_search.c @@ -0,0 +1,339 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat_search.c */ +/* Purpose: recherche du pattern */ +/* algorithme de Baeza-Yates/Gonnet */ +/* Manber (agrep) */ +/* History: */ +/* 07/12/94 : first version */ +/* 28/12/94 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#if 0 +#ifndef THINK_C +#include +#endif +#endif + +#include +#include +#include + +#include "Gtypes.h" +#include "libstki.h" +#include "apat.h" + +#define POP PopiOut +#define PUSH PushiIn +#define TOPCURS CursiToTop +#define DOWNREAD ReadiDown + +#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + +/* -------------------------------------------- */ +/* Construction de la matrice S */ +/* -------------------------------------------- */ + +int CreateS(Pattern *ppat, Int32 lalpha) +{ + Int32 i, j, indx; + UInt32 pindx, amask, omask, *smat; + + ppat->ok = Faux; + + omask = 0x0L; + + if (! (smat = NEWN(UInt32, lalpha))) + return 0; + + for (i = 0 ; i < lalpha ; i++) + smat[i] = 0x0; + + for (i = ppat->patlen - 1, amask = 0x1L ; i >= 0 ; i--, amask <<= 1) { + + indx = ppat->patcode[i]; + + if (ppat->patcode[i] & OBLIBIT) + omask |= amask; + + for (j = 0, pindx = 0x1L ; j < lalpha ; j++, pindx <<= 1) + if (indx & pindx) + smat[j] |= amask; + } + + ppat->smat = smat; + + ppat->omask = omask; + + ppat->ok = Vrai; + + return 1; + +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* NoError */ +/* -------------------------------------------- */ +Int32 ManberNoErr(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + UInt32 pos; + UInt32 smask, r; + UInt8 *data; + StackiPtr *stkpos, *stkerr; + UInt32 end; + + end = begin + length; + end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular); + + + /* create local masks */ + + smask = r = 0x1L << ppat->patlen; + + /* init. scan */ + data = pseq->data + begin; + stkpos = pseq->hitpos + patnum; + stkerr = pseq->hiterr + patnum; + + /* loop on text data */ + + for (pos = begin ; pos < end ; pos++) { + + r = (r >> 1) & ppat->smat[*data++]; + + if (r & 0x1L) { + PUSH(stkpos, pos - ppat->patlen + 1); + PUSH(stkerr, 0); + } + + r |= smask; + } + + return (*stkpos)->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* Substitution only */ +/* */ +/* Note : r array is stored as : */ +/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */ +/* */ +/* -------------------------------------------- */ +Int32 ManberSub(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + int e, emax, found; + UInt32 pos; + UInt32 smask, cmask, sindx; + UInt32 *pr, r[2 * MAX_PAT_ERR + 2]; + UInt8 *data; + StackiPtr *stkpos, *stkerr; + UInt32 end; + + end = begin + length; + end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular); + + /* create local masks */ + emax = ppat->maxerr; + + r[0] = r[1] = 0x0; + + cmask = smask = 0x1L << ppat->patlen; + + for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) + *pr = cmask; + + cmask = ~ ppat->omask; + + /* init. scan */ + data = pseq->data + begin; + stkpos = pseq->hitpos + patnum; + stkerr = pseq->hiterr + patnum; + + /* loop on text data */ + + for (pos = begin ; pos < end ; pos++) { + + sindx = ppat->smat[*data++]; + + for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) { + + pr[2] = pr[3] | smask; + + pr[3] = ((pr[0] >> 1) & cmask) /* sub */ + | ((pr[2] >> 1) & sindx); /* ident */ + + if (pr[3] & 0x1L) { /* found */ + if (! found) { + PUSH(stkpos, pos - ppat->patlen + 1); + PUSH(stkerr, e); + } + found++; + } + } + } + + return (*stkpos)->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* Substitution + Indels */ +/* */ +/* Note : r array is stored as : */ +/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */ +/* */ +/* Warning: may return shifted pos. */ +/* */ +/* -------------------------------------------- */ +Int32 ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + int e, emax, found; + UInt32 pos; + UInt32 smask, cmask, sindx; + UInt32 *pr, r[2 * MAX_PAT_ERR + 2]; + UInt8 *data; + StackiPtr *stkpos, *stkerr; + UInt32 end; + + end = begin + length; + end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular); + + /* create local masks */ + emax = ppat->maxerr; + + r[0] = r[1] = 0x0; + + cmask = smask = 0x1L << ppat->patlen; + + for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) { + *pr = cmask; + cmask = (cmask >> 1) | smask; + } + + cmask = ~ ppat->omask; + + /* init. scan */ + data = pseq->data + begin; + stkpos = pseq->hitpos + patnum; + stkerr = pseq->hiterr + patnum; + + /* loop on text data */ + + for (pos = begin ; pos < end ; pos++) { + + sindx = ppat->smat[*data++]; + + for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) { + + pr[2] = pr[3] | smask; + + pr[3] = (( pr[0] /* ins */ + | (pr[0] >> 1) /* sub */ + | (pr[1] >> 1)) /* del */ + & cmask) + | ((pr[2] >> 1) & sindx); /* ident */ + + if (pr[3] & 0x1L) { /* found */ + if (! found) { + PUSH(stkpos, pos - ppat->patlen + 1); + PUSH(stkerr, e); + } + found++; + } + + } + } + + return (*stkpos)->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* API call to previous functions */ +/* -------------------------------------------- */ +Int32 ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + if (ppat->maxerr == 0) + return ManberNoErr(pseq, ppat, patnum, begin, length); + else if (ppat->hasIndel) + return ManberIndel(pseq, ppat, patnum, begin, length); + else + return ManberSub(pseq, ppat, patnum, begin, length); +} + + +/* -------------------------------------------- */ +/* Alignement NWS */ +/* pour edition des hits */ +/* (avec substitution obligatoire aux bords) */ +/* -------------------------------------------- */ + +Int32 NwsPatAlign(pseq, ppat, nerr, reslen, reserr) + Seq *pseq; + Pattern *ppat; + Int32 nerr, *reslen, *reserr; +{ + UInt8 *sseq, *px; + Int32 i, j, lseq, lpat, npos, dindel, dsub, + *pc, *pi, *pd, *ps; + UInt32 amask; + + static Int32 sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)]; + + lseq = pseq->seqlen; + + pc = sTab; /* |----|----| --> i */ + pi = pc - 1; /* | ps | pd | | */ + pd = pi - lseq; /* |----|----| | */ + ps = pd - 1; /* | pi | pc | v j */ + /* |---------| */ + + lseq = pseq->seqlen; + lpat = ppat->patlen; + + sseq = pseq->data - 1; + + amask = ONEMASK >> lpat; + + for (j = 0 ; j <= lpat ; j++) { + + for (i = 0 , px = sseq ; i <= lseq ; i++, px++) { + + if (i && j) { + dindel = MIN(*pi, *pd) + 1; + dsub = *ps + KRONECK(ppat->smat[*px], amask); + *pc = MIN(dindel, dsub); + } + else if (i) /* j == 0 */ + *pc = *pi + 1; + else if (j) /* i == 0 */ + *pc = *pd + 1; + else /* root */ + *pc = 0; + + pc++; + pi++; + pd++; + ps++; + } + + amask <<= 1; + } + + pc--; + + for (i = lseq, npos = 0 ; i >= 0 ; i--, pc--) { + if (*pc <= nerr) { + *reslen++ = i; + *reserr++ = *pc; + npos++; + } + } + + return npos; +} diff --git a/src/libecoPCR/libapat/libstki.c b/src/libecoPCR/libapat/libstki.c new file mode 100644 index 0000000..22dbac2 --- /dev/null +++ b/src/libecoPCR/libapat/libstki.c @@ -0,0 +1,380 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.c */ +/* Purpose: A library to deal with 'stacks' of */ +/* integers */ +/* Note: 'stacks' are dynamic (i.e. size is */ +/* automatically readjusted when needed) */ +/* History: */ +/* 00/03/92 : first draft */ +/* 15/08/93 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#include +#include +#include +#include + +#include "Gtypes.h" +#include "libstki.h" + + +/* ============================ */ +/* Constantes et Macros locales */ +/* ============================ */ + +#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1) + +#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1) + + +static Int16 sStkiLastError = kStkiNoErr; + +/* -------------------------------------------- */ +/* gestion des erreurs */ +/* get/reset erreur flag */ +/* */ +/* @function: StkiError */ +/* -------------------------------------------- */ + +Int16 StkiError(bool reset) +{ + Int16 err; + + err = sStkiLastError; + + if (reset) + sStkiLastError = kStkiNoErr; + + return err; + +} /* end of StkiError */ + +/* -------------------------------------------- */ +/* creation d'un stack */ +/* */ +/* @function: NewStacki */ +/* -------------------------------------------- */ + +StackiPtr NewStacki(Int32 size) +{ + StackiPtr stki; + + if (! (stki = NEW(Stacki))) + return NULL; + + stki->size = size; + stki->top = 0; + stki->cursor = 0; + + if ( ! (stki->val = NEWN(Int32, size))) { + sStkiLastError = kStkiMemErr; + return FreeStacki(stki); + } + + return stki; + +} /* end of NewStacki */ + + +/* -------------------------------------------- */ +/* liberation d'un stack */ +/* */ +/* @function: FreeStacki */ +/* -------------------------------------------- */ + +StackiPtr FreeStacki(StackiPtr stki) +{ + if (stki) { + if (stki->val) + FREE(stki->val); + FREE(stki); + } + + return NULL; + +} /* end of FreeStacki */ + +/* -------------------------------------------- */ +/* creation d'un vecteur de stacks */ +/* */ +/* @function: NewStackiVector */ +/* -------------------------------------------- */ + +StackiHdle NewStackiVector(Int32 vectSize, Int32 stackSize) +{ + Int32 i; + StackiHdle stkh; + + if (! (stkh = NEWN(StackiPtr, vectSize))) { + sStkiLastError = kStkiMemErr; + return NULL; + } + + for (i = 0 ; i < vectSize ; i++) + if (! (stkh[i] = NewStacki(stackSize))) + return FreeStackiVector(stkh, i); + + return stkh; + +} /* end of NewStackiVector */ + + +/* -------------------------------------------- */ +/* liberation d'un vecteur de stacks */ +/* */ +/* @function: FreeStackiVector */ +/* -------------------------------------------- */ + +StackiHdle FreeStackiVector(StackiHdle stkh, Int32 vectSize) +{ + Int32 i; + + if (stkh) { + for (i = 0 ; i < vectSize ; i++) + (void) FreeStacki(stkh[i]); + FREE(stkh); + } + + return NULL; + +} /* end of FreeStackiVector */ + +/* -------------------------------------------- */ +/* resize d'un stack */ +/* */ +/* @function: ResizeStacki */ +/* -------------------------------------------- */ + +Int32 ResizeStacki(StackiHdle stkh, Int32 size) +{ + Int32 resize = 0; /* assume error */ + Int32 *val; + + if ((val = REALLOC(Int32, (*stkh)->val, size))) { + (*stkh)->size = resize = size; + (*stkh)->val = val; + } + + if (! resize) + sStkiLastError = kStkiMemErr; + + return resize; + +} /* end of ResizeStacki */ + +/* -------------------------------------------- */ +/* empilage(/lement) */ +/* */ +/* @function: PushiIn */ +/* -------------------------------------------- */ + +bool PushiIn(StackiHdle stkh, Int32 val) +{ + if (((*stkh)->top >= (*stkh)->size) && (! ExpandStack(stkh))) + return Faux; + + (*stkh)->val[((*stkh)->top)++] = val; + + return Vrai; + +} /* end of PushiIn */ + +/* -------------------------------------------- */ +/* depilage(/lement) */ +/* */ +/* @function: PopiOut */ +/* -------------------------------------------- */ + +bool PopiOut(StackiHdle stkh, Int32 *val) +{ + if ((*stkh)->top <= 0) + return Faux; + + *val = (*stkh)->val[--((*stkh)->top)]; + + if ( ((*stkh)->top < ((*stkh)->size >> 1)) + && ((*stkh)->top > kMinStackiSize)) + + (void) ShrinkStack(stkh); + + return Vrai; + +} /* end of PopiOut */ + +/* -------------------------------------------- */ +/* lecture descendante */ +/* */ +/* @function: ReadiDown */ +/* -------------------------------------------- */ + +bool ReadiDown(StackiPtr stki, Int32 *val) +{ + if (stki->cursor <= 0) + return Faux; + + *val = stki->val[--(stki->cursor)]; + + return Vrai; + +} /* end of ReadiDown */ + +/* -------------------------------------------- */ +/* lecture ascendante */ +/* */ +/* @function: ReadiUp */ +/* -------------------------------------------- */ + +bool ReadiUp(StackiPtr stki, Int32 *val) +{ + if (stki->cursor >= stki->top) + return Faux; + + *val = stki->val[(stki->cursor)++]; + + return Vrai; + +} /* end of ReadiUp */ + +/* -------------------------------------------- */ +/* remontee/descente du curseur */ +/* */ +/* @function: CursiToTop */ +/* @function: CursiToBottom */ +/* -------------------------------------------- */ + +void CursiToTop(StackiPtr stki) +{ + stki->cursor = stki->top; + +} /* end of CursiToTop */ + +void CursiToBottom(stki) + StackiPtr stki; +{ + stki->cursor = 0; + +} /* end of CursiToBottom */ + +/* -------------------------------------------- */ +/* echange des valeurs cursor <-> (top - 1) */ +/* */ +/* @function: CursiSwap */ +/* -------------------------------------------- */ + +void CursiSwap(StackiPtr stki) +{ + Int32 tmp; + + if ((stki->top <= 0) || (stki->cursor < 0)) + return; + + tmp = stki->val[stki->cursor]; + stki->val[stki->cursor] = stki->val[stki->top - 1]; + stki->val[stki->top - 1] = tmp; + +} /* end of CursiSwap */ + +/* -------------------------------------------- */ +/* Recherche d'une valeur en stack a partir du */ +/* curseur courant en descendant. */ +/* on laisse le curseur a l'endroit trouve */ +/* */ +/* @function: SearchDownStacki */ +/* -------------------------------------------- */ + +bool SearchDownStacki(StackiPtr stki, Int32 sval) +{ + Int32 val; + bool more; + + while ((more = ReadiDown(stki, &val))) + if (val == sval) + break; + + return more; + +} /* end of SearchDownStacki */ + +/* -------------------------------------------- */ +/* Recherche dichotomique d'une valeur en stack */ +/* le stack est suppose trie par valeurs */ +/* croissantes. */ +/* on place le curseur a l'endroit trouve */ +/* */ +/* @function: BinSearchStacki */ +/* -------------------------------------------- */ + +bool BinSearchStacki(StackiPtr stki, Int32 sval) +{ + Int32 midd, low, high, span; + + low = 0; + high = stki->top - 1; + + while (high >= low) { + + midd = (high + low) / 2; + + span = stki->val[midd] - sval; + + if (span == 0) { + stki->cursor = midd; + return Vrai; + } + + if (span > 0) + high = midd - 1; + else + low = midd + 1; + } + + return Faux; + +} /* end of BinSearchStacki */ + +/* -------------------------------------------- */ +/* teste l'egalite *physique* de deux stacks */ +/* */ +/* @function: SameStacki */ +/* -------------------------------------------- */ + +bool SameStacki(StackiPtr stki1, StackiPtr stki2) +{ + if (stki1->top != stki2->top) + return Faux; + + return ((memcmp(stki1->val, stki2->val, + stki1->top * sizeof(Int32)) == 0) ? Vrai : Faux); + +} /* end of SameStacki */ + + +/* -------------------------------------------- */ +/* inverse l'ordre des elements dans un stack */ +/* */ +/* @function: ReverseStacki */ +/* -------------------------------------------- */ + +bool ReverseStacki(StackiPtr stki) +{ + Int32 *t, *b, swp; + + if (stki->top <= 0) + return Faux; + + b = stki->val; + t = b + stki->top - 1; + + while (t > b) { + swp = *t; + *t-- = *b; + *b++ = swp; + } + + return Vrai; + +} /* end of ReverseStacki */ + diff --git a/src/libecoPCR/libapat/libstki.h b/src/libecoPCR/libapat/libstki.h new file mode 100644 index 0000000..14ef447 --- /dev/null +++ b/src/libecoPCR/libapat/libstki.h @@ -0,0 +1,89 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.h */ +/* Purpose: library of dynamic stacks holding */ +/* integer values */ +/* History: */ +/* 00/03/92 : first draft */ +/* 07/07/93 : complete revision */ +/* 10/03/94 : added xxxVector funcs */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#include + +#ifndef _H_Gtypes +#include "Gtypes.h" +#endif + +#define _H_libstki + +/* ==================================================== */ +/* Constantes de dimensionnement */ +/* ==================================================== */ + +#ifndef kMinStackiSize +#define kMinStackiSize 2 /* taille mini stack */ +#endif + + +#define kStkiNoErr 0 /* ok */ +#define kStkiMemErr 1 /* not enough memory */ + +#define kStkiReset Vrai +#define kStkiGet Faux + +/* ==================================================== */ +/* Macros standards */ +/* ==================================================== */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ)) +#define FREE(ptr) free((Ptr) ptr) +#endif + + +/* ==================================================== */ +/* Types & Structures de donnees */ +/* ==================================================== */ + + /* -------------------- */ + /* structure : pile */ + /* -------------------- */ +typedef struct Stacki { + /* ---------------------*/ + Int32 size; /* stack size */ + Int32 top; /* current free pos. */ + Int32 cursor; /* current cursor */ + Int32 *val; /* values */ + /* ---------------------*/ +} Stacki, *StackiPtr, **StackiHdle; + + + +/* ==================================================== */ +/* Prototypes (generated by mproto) */ +/* ==================================================== */ + + /* libstki.c */ + +Int16 StkiError (bool reset ); +StackiPtr NewStacki (Int32 size ); +StackiPtr FreeStacki (StackiPtr stki ); +StackiHdle NewStackiVector (Int32 vectSize, Int32 stackSize ); +StackiHdle FreeStackiVector (StackiHdle stkh, Int32 vectSize ); +Int32 ResizeStacki (StackiHdle stkh , Int32 size ); +bool PushiIn (StackiHdle stkh , Int32 val ); +bool PopiOut (StackiHdle stkh , Int32 *val ); +bool ReadiDown (StackiPtr stki , Int32 *val ); +bool ReadiUp (StackiPtr stki , Int32 *val ); +void CursiToTop (StackiPtr stki ); +void CursiToBottom (StackiPtr stki ); +void CursiSwap (StackiPtr stki ); +bool SearchDownStacki (StackiPtr stki , Int32 sval ); +bool BinSearchStacki (StackiPtr stki , Int32 sval ); +bool SameStacki (StackiPtr stki1 , StackiPtr stki2 ); +bool ReverseStacki (StackiPtr stki ); diff --git a/src/libecoPCR/libthermo/nnparams.c b/src/libecoPCR/libthermo/nnparams.c new file mode 100644 index 0000000..6775614 --- /dev/null +++ b/src/libecoPCR/libthermo/nnparams.c @@ -0,0 +1,619 @@ +/* + * nnparams.cpp + * PHunterLib + * + * Nearest Neighbor Model / Parameters + * + * Created by Tiayyba Riaz on 7/2/09. + * + */ + +#include +#include +#include +#include +#include "nnparams.h" + +static char bpencoder[] = { 1, // A + 0, // b + 2, // C + 0,0,0, // d, e, f + 3, // G + 0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s + 4,0, // T,U + 0,0,0,0,0}; // v,w,x,y,z + + +double forbidden_entropy; + + +double nparam_GetInitialEntropy(PNNParams nparm) +{ + return -5.9f+nparm->rlogc; +} + + +//Retrieve Enthalpy for given NN-Pair from parameter table +double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1) +{ + return ndH(x0,x1,y0,y1); //xx, yx are already numbers +} + + +//Retrieve Entropy for given NN-Pair from parameter table +double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1) +{ + //xx and yx are already numbers + char nx0=x0;//nparam_convertNum(x0); + char nx1=x1;//nparam_convertNum(x1); + char ny0=y0;//nparam_convertNum(y0); + char ny1=y1;//nparam_convertNum(y1); + double answer = ndS(nx0,nx1,ny0,ny1); + /*Salt correction Santalucia*/ + if (nparm->saltMethod == SALT_METHOD_SANTALUCIA) { + if(nx0!=5 && 1<= nx1 && nx1<=4) { + answer += 0.5*nparm->kfac; + } + if(ny1!=5 && 1<= ny0 && ny0<=4) { + answer += 0.5*nparm->kfac; + } + } + /*Salt correction Owczarzy*/ + if (nparm->saltMethod == SALT_METHOD_OWCZARZY) { + double logk = log(nparm->kplus); + answer += ndH(nx0,nx1,ny0,ny1)*((4.29 * nparm->gcContent-3.95)*0.00001*logk+ 0.0000094*logk*logk); + } + return answer; +} + +/* PURPOSE: Return melting temperature TM for given entropy and enthalpy +* Assuming a one-state transition and using the formula +* TM = dH / (dS + R ln(Ct/4)) +* entropy = dS + R ln Ct/4 (must already be included!) +* enthaklpy = dH +* where +* dH = enthalpy +* dS = entropy +* R = Boltzmann factor +* Ct = Strand Concentration +* +* PARAMETERS: +* entrypy and enthalpy +* +* RETURN VALUE: +* temperature +*/ + +double nparam_CalcTM(double entropy,double enthalpy) +{ + double tm = 0; // absolute zero - return if model fails! + if (enthalpy>=forbidden_enthalpy) //||(entropy==-cfact)) + return 0; + if (entropy<0) // avoid division by zero and model errors! + { + tm = enthalpy/entropy;// - kfac; //LKFEB + if (tm<0) + return 0; + } + return tm; +} + + +void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm) +{ + nparm->Ct1 = c1; + nparm->Ct2 = c2; + nparm->kplus = kp; + int maxCT = 1; + if(nparm->Ct2 > nparm->Ct1) + { + maxCT = 2; + } + double ctFactor; + if(nparm->Ct1 == nparm->Ct2) + { + ctFactor = nparm->Ct1/2; + } + else if (maxCT == 1) + { + ctFactor = nparm->Ct1-nparm->Ct2/2; + } + else + { + ctFactor = nparm->Ct2-nparm->Ct1/2; + } + nparm->rlogc = R * log(ctFactor); + forbidden_entropy = nparm->rlogc; + nparm->kfac = 0.368 * log (nparm->kplus); + nparm->saltMethod = sm; + int x,y,a,b; // variables used as counters... + + // Set all parameters to zero! + memset(nparm->dH,0,sizeof(nparm->dH)); + memset(nparm->dS,0,sizeof(nparm->dS)); + + // Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small! + for (x=1;x<=4;x++) + { + for (y=1;y<=4;y++) + { + ndH(0,x,y,0)=forbidden_enthalpy; + ndS(0,x,y,0)=forbidden_entropy; + ndH(x,0,0,y)=forbidden_enthalpy; + ndS(x,0,0,y)=forbidden_entropy; + ndH(x,0,y,0)=forbidden_enthalpy; + ndS(x,0,y,0)=forbidden_entropy; + // forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap! + ndH(x,5,y,0)=forbidden_enthalpy; + ndS(x,5,y,0)=forbidden_entropy; + ndH(x,0,y,5)=forbidden_enthalpy; + ndS(x,0,y,5)=forbidden_entropy; + ndH(5,x,0,y)=forbidden_enthalpy; + ndS(5,x,0,y)=forbidden_entropy; + ndH(0,x,5,y)=forbidden_enthalpy; + ndS(0,x,5,y)=forbidden_entropy; + // forbid X$/-Y etc. + ndH(x,5,0,y)=forbidden_enthalpy; + ndS(x,5,0,y)=forbidden_entropy; + ndH(x,0,5,y)=forbidden_enthalpy; + ndS(x,0,5,y)=forbidden_entropy; + ndH(5,x,y,0)=forbidden_enthalpy; + ndS(5,x,y,0)=forbidden_entropy; + ndH(0,x,y,5)=forbidden_enthalpy; + ndS(0,x,y,5)=forbidden_entropy; + + } + // also, forbid x-/-- and --/x-, i.e. no two inner gaps paired + ndH(x,0,0,0)=forbidden_enthalpy; + ndS(x,0,0,0)=forbidden_entropy; + ndH(0,0,x,0)=forbidden_enthalpy; + ndS(0,0,x,0)=forbidden_entropy; + // x-/-$ + ndH(x,0,0,5)=forbidden_enthalpy; + ndS(x,0,0,5)=forbidden_entropy; + ndH(5,0,0,x)=forbidden_enthalpy; + ndS(5,0,0,x)=forbidden_entropy; + ndH(0,5,x,0)=forbidden_enthalpy; + ndS(x,0,0,5)=forbidden_entropy; + ndH(0,x,5,0)=forbidden_enthalpy; + ndS(0,x,5,0)=forbidden_entropy; + } + // forbid --/-- + ndH(0,0,0,0)=forbidden_enthalpy; + ndS(0,0,0,0)=forbidden_entropy; + + ndH(5,0,0,0)=forbidden_enthalpy; + ndS(5,0,0,0)=forbidden_entropy; + ndH(0,0,5,0)=forbidden_enthalpy; + ndS(0,0,5,0)=forbidden_entropy; + ndH(0,5,5,0)=forbidden_enthalpy; + ndS(0,5,5,0)=forbidden_entropy; + + // Interior loops (double Mismatches) + #define iloop_entropy -0.97f + #define iloop_enthalpy 0.0f + for (x=1; x<=4; x++) + for (y=1; y<=4; y++) + for (a=1; a<=4; a++) + for (b=1; b<=4; b++) + // AT and CG pair, and as A=1, C=2, G=3, T=4 this means + // we have Watson-Crick pairs if (x+a==5) and (y+b)==5. + if (!((x+a==5)||(y+b==5))) + { + // No watson-crick-pair, i.e. double mismatch! + // set enthalpy/entropy to loop expansion! + ndH(x,y,a,b) = iloop_enthalpy; + ndS(x,y,a,b) = iloop_entropy; + } + + // xy/-- and --/xy (Bulge Loops of size > 1) + #define bloop_entropy -1.3f + #define bloop_enthalpy 0.0f + for (x=1; x<=4; x++) + for (y=1; y<=4; y++) + { + ndH(x,y,0,0) = bloop_enthalpy; + ndS(x,y,0,0) = bloop_entropy; + ndH(0,0,x,y) = bloop_enthalpy; + ndS(0,0,x,y) = bloop_entropy; + } + + // x-/ya abd xa/y- as well as -x/ay and ax/-y + // bulge opening and closing parameters with + // adjacent matches / mismatches + // obulge_mism and cbulge_mism chosen so high to avoid + // AAAAAAAAA + // T--G----T + // being better than + // AAAAAAAAA + // TG------T + #define obulge_match_H (-2.66f * 1000) + #define obulge_match_S -14.22f + #define cbulge_match_H (-2.66f * 1000) + #define cbulge_match_S -14.22f + #define obulge_mism_H (0.0f * 1000) + #define obulge_mism_S -6.45f + #define cbulge_mism_H 0.0f + #define cbulge_mism_S -6.45f + for (x=1; x<=4; x++) + for (y=1; y<=4; y++) + for (a=1; a<=4; a++) + { + if (x+y==5) // other base pair matches! + { + ndH(x,0,y,a)=obulge_match_H; // bulge opening + ndS(x,0,y,a)=obulge_match_S; + ndH(x,a,y,0)=obulge_match_H; + ndS(x,a,y,0)=obulge_match_S; + ndH(0,x,a,y)=cbulge_match_H; // bulge closing + ndS(0,x,a,y)=cbulge_match_S; + ndH(a,x,0,y)=cbulge_match_H; + ndS(a,x,0,y)=cbulge_match_S; + } + else + { // mismatch in other base pair! + ndH(x,0,y,a)=obulge_mism_H; // bulge opening + ndS(x,0,y,a)=obulge_mism_S; + ndH(x,a,y,0)=obulge_mism_H; + ndS(x,a,y,0)=obulge_mism_S; + ndH(0,x,a,y)=cbulge_mism_H; // bulge closing + ndS(0,x,a,y)=cbulge_mism_S; + ndH(a,x,0,y)=cbulge_mism_H; + ndS(a,x,0,y)=cbulge_mism_S; + } + } + + // Watson-Crick pairs (note that only ten are unique, as obviously + // 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc. + ndH(1,1,4,4)=-7.6f*1000; ndS(1,1,4,4)=-21.3f; // AA/TT 04 + ndH(1,2,4,3)=-8.4f*1000; ndS(1,2,4,3)=-22.4f; // AC/TG adapted GT/CA + ndH(1,3,4,2)=-7.8f*1000; ndS(1,3,4,2)=-21.0f; // AG/TC adapted CT/GA + ndH(1,4,4,1)=-7.2f*1000; ndS(1,4,4,1)=-20.4f; // AT/TA 04 + ndH(2,1,3,4)=-8.5f*1000; ndS(2,1,3,4)=-22.7f; // CA/GT 04 + ndH(2,2,3,3)=-8.0f*1000; ndS(2,2,3,3)=-19.9f; // CC/GG adapted GG/CC + ndH(2,3,3,2)=-10.6f*1000; ndS(2,3,3,2)=-27.2f; // CG/GC 04 + ndH(2,4,3,1)=-7.8f*1000; ndS(2,4,3,1)=-21.0f; // CT/GA 04 + ndH(3,1,2,4)=-8.2f*1000; ndS(3,1,2,4)=-22.2f; // GA/CT 04 + ndH(3,2,2,3)=-9.8f*1000; ndS(3,2,2,3)=-24.4f; // GC/CG 04 + ndH(3,3,2,2)=-8.0f*1000; ndS(3,3,2,2)=-19.9f; // GG/CC 04 + ndH(3,4,2,1)=-8.4f*1000; ndS(3,4,2,1)=-22.4f; // GT/CA 04 + ndH(4,1,1,4)=-7.2f*1000; ndS(4,1,1,4)=-21.3f; // TA/AT 04 + ndH(4,2,1,3)=-8.2f*1000; ndS(4,2,1,3)=-22.2f; // TC/AG adapted GA/CT + ndH(4,3,1,2)=-8.5f*1000; ndS(4,3,1,2)=-22.7f; // TG/AC adapted CA/GT + ndH(4,4,1,1)=-7.6f*1000; ndS(4,4,1,1)=-21.3f; // TT/AA adapted AA/TT + + // A-C Mismatches (Values for pH 7.0) + ndH(1,1,2,4)=7.6f*1000; ndS(1,1,2,4)=20.2f; // AA/CT + ndH(1,1,4,2)=2.3f*1000; ndS(1,1,4,2)=4.6f; // AA/TC + ndH(1,2,2,3)=-0.7f*1000; ndS(1,2,2,3)=-3.8f; // AC/CG + ndH(1,2,4,1)=5.3f*1000; ndS(1,2,4,1)=14.6f; // AC/TA + ndH(1,3,2,2)=0.6f*1000; ndS(1,3,2,2)=-0.6f; // AG/CC + ndH(1,4,2,1)=5.3f*1000; ndS(1,4,2,1)=14.6f; // AT/CA + ndH(2,1,1,4)=3.4f*1000; ndS(2,1,1,4)=8.0f; // CA/AT + ndH(2,1,3,2)=1.9f*1000; ndS(2,1,3,2)=3.7f; // CA/GC + ndH(2,2,1,3)=5.2f*1000; ndS(2,2,1,3)=14.2f; // CC/AG + ndH(2,2,3,1)=0.6f*1000; ndS(2,2,3,1)=-0.6f; // CC/GA + ndH(2,3,1,2)=1.9f*1000; ndS(2,3,1,2)=3.7f; // CG/AC + ndH(2,4,1,1)=2.3f*1000; ndS(2,4,1,1)=4.6f; // CT/AA + ndH(3,1,2,2)=5.2f*1000; ndS(3,1,2,2)=14.2f; // GA/CC + ndH(3,2,2,1)=-0.7f*1000; ndS(3,2,2,1)=-3.8f; // GC/CA + ndH(4,1,1,2)=3.4f*1000; ndS(4,1,1,2)=8.0f; // TA/AC + ndH(4,2,1,1)=7.6f*1000; ndS(4,2,1,1)=20.2f; // TC/AA + + // C-T Mismatches + ndH(1,2,4,4)=0.7f*1000; ndS(1,2,4,4)=0.2f; // AC/TT + ndH(1,4,4,2)=-1.2f*1000; ndS(1,4,4,2)=-6.2f; // AT/TC + ndH(2,1,4,4)=1.0f*1000; ndS(2,1,4,4)=0.7f; // CA/TT + ndH(2,2,3,4)=-0.8f*1000; ndS(2,2,3,4)=-4.5f; // CC/GT + ndH(2,2,4,3)=5.2f*1000; ndS(2,2,4,3)=13.5f; // CC/TG + ndH(2,3,4,2)=-1.5f*1000; ndS(2,3,4,2)=-6.1f; // CG/TC + ndH(2,4,3,2)=-1.5f*1000; ndS(2,4,3,2)=-6.1f; // CT/GC + ndH(2,4,4,1)=-1.2f*1000; ndS(2,4,4,1)=-6.2f; // CT/TA + ndH(3,2,2,4)=2.3f*1000; ndS(3,2,2,4)=5.4f; // GC/CT + ndH(3,4,2,2)=5.2f*1000; ndS(3,4,2,2)=13.5f; // GT/CC + ndH(4,1,2,4)=1.2f*1000; ndS(4,1,2,4)=0.7f; // TA/CT + ndH(4,2,2,3)=2.3f*1000; ndS(4,2,2,3)=5.4f; // TC/CG + ndH(4,2,1,4)=1.2f*1000; ndS(4,2,1,4)=0.7f; // TC/AT + ndH(4,3,2,2)=-0.8f*1000; ndS(4,3,2,2)=-4.5f; // TG/CC + ndH(4,4,2,1)=0.7f*1000; ndS(4,4,2,1)=0.2f; // TT/CA + ndH(4,4,1,2)=1.0f*1000; ndS(4,4,1,2)=0.7f; // TT/AC + + // G-A Mismatches + ndH(1,1,3,4)=3.0f*1000; ndS(1,1,3,4)=7.4f; // AA/GT + ndH(1,1,4,3)=-0.6f*1000; ndS(1,1,4,3)=-2.3f; // AA/TG + ndH(1,2,3,3)=0.5f*1000; ndS(1,2,3,3)=3.2f; // AC/GG + ndH(1,3,3,2)=-4.0f*1000; ndS(1,3,3,2)=-13.2f; // AG/GC + ndH(1,3,4,1)=-0.7f*1000; ndS(1,3,4,1)=-2.3f; // AG/TA + ndH(1,4,3,1)=-0.7f*1000; ndS(1,4,3,1)=-2.3f; // AT/GA + ndH(2,1,3,3)=-0.7f*1000; ndS(2,1,3,3)=-2.3f; // CA/GG + ndH(2,3,3,1)=-4.0f*1000; ndS(2,3,3,1)=-13.2f; // CG/GA + ndH(3,1,1,4)=0.7f*1000; ndS(3,1,1,4)=0.7f; // GA/AT + ndH(3,1,2,3)=-0.6f*1000; ndS(3,1,2,3)=-1.0f; // GA/CG + ndH(3,2,1,3)=-0.6f*1000; ndS(3,2,1,3)=-1.0f; // GC/AG + ndH(3,3,1,2)=-0.7f*1000; ndS(3,3,1,2)=-2.3f; // GG/AC + ndH(3,3,2,1)=0.5f*1000; ndS(3,3,2,1)=3.2f; // GG/CA + ndH(3,4,1,1)=-0.6f*1000; ndS(3,4,1,1)=-2.3f; // GT/AA + ndH(4,1,1,3)=0.7f*1000; ndS(4,1,1,3)=0.7f; // TA/AG + ndH(4,3,1,1)=3.0f*1000; ndS(4,3,1,1)=7.4f; // TG/AA + + // G-T Mismatches + ndH(1,3,4,4)=1.0f*1000; ndS(1,3,4,4)=0.9f; // AG/TT + ndH(1,4,4,3)=-2.5f*1000; ndS(1,4,4,3)=-8.3f; // AT/TG + ndH(2,3,3,4)=-4.1f*1000; ndS(2,3,3,4)=-11.7f; // CG/GT + ndH(2,4,3,3)=-2.8f*1000; ndS(2,4,3,3)=-8.0f; // CT/GG + ndH(3,1,4,4)=-1.3f*1000; ndS(3,1,4,4)=-5.3f; // GA/TT + ndH(3,2,4,3)=-4.4f*1000; ndS(3,2,4,3)=-12.3f; // GC/TG + ndH(3,3,2,4)=3.3f*1000; ndS(3,3,2,4)=10.4f; // GG/CT + ndH(3,3,4,2)=-2.8f*1000; ndS(3,3,4,2)=-8.0f; // GG/TC +// ndH(3,3,4,4)=5.8f*1000; ndS(3,3,4,4)=16.3f; // GG/TT + ndH(3,4,2,3)=-4.4f*1000; ndS(3,4,2,3)=-12.3f; // GT/CG + ndH(3,4,4,1)=-2.5f*1000; ndS(3,4,4,1)=-8.3f; // GT/TA +// ndH(3,4,4,3)=4.1f*1000; ndS(3,4,4,3)=9.5f; // GT/TG + ndH(4,1,3,4)=-0.1f*1000; ndS(4,1,3,4)=-1.7f; // TA/GT + ndH(4,2,3,3)=3.3f*1000; ndS(4,2,3,3)=10.4f; // TC/GG + ndH(4,3,1,4)=-0.1f*1000; ndS(4,3,1,4)=-1.7f; // TG/AT + ndH(4,3,3,2)=-4.1f*1000; ndS(4,3,3,2)=-11.7f; // TG/GC +// ndH(4,3,3,4)=-1.4f*1000; ndS(4,3,3,4)=-6.2f; // TG/GT + ndH(4,4,1,3)=-1.3f*1000; ndS(4,4,1,3)=-5.3f; // TT/AG + ndH(4,4,3,1)=1.0f*1000; ndS(4,4,3,1)=0.9f; // TT/GA +// ndH(4,4,3,3)=5.8f*1000; ndS(4,4,3,3)=16.3f; // TT/GG + + // A-A Mismatches + ndH(1,1,1,4)=4.7f*1000; ndS(1,1,1,4)=12.9f; // AA/AT + ndH(1,1,4,1)=1.2f*1000; ndS(1,1,4,1)=1.7f; // AA/TA + ndH(1,2,1,3)=-2.9f*1000; ndS(1,2,1,3)=-9.8f; // AC/AG + ndH(1,3,1,2)=-0.9f*1000; ndS(1,3,1,2)=-4.2f; // AG/AC + ndH(1,4,1,1)=1.2f*1000; ndS(1,4,1,1)=1.7f; // AT/AA + ndH(2,1,3,1)=-0.9f*1000; ndS(2,1,3,1)=-4.2f; // CA/GA + ndH(3,1,2,1)=-2.9f*1000; ndS(3,1,2,1)=-9.8f; // GA/CA + ndH(4,1,1,1)=4.7f*1000; ndS(4,1,1,1)=12.9f; // TA/AA + + // C-C Mismatches + ndH(1,2,4,2)=0.0f*1000; ndS(1,2,4,2)=-4.4f; // AC/TC + ndH(2,1,2,4)=6.1f*1000; ndS(2,1,2,4)=16.4f; // CA/CT + ndH(2,2,2,3)=3.6f*1000; ndS(2,2,2,3)=8.9f; // CC/CG + ndH(2,2,3,2)=-1.5f*1000; ndS(2,2,3,2)=-7.2f; // CC/GC + ndH(2,3,2,2)=-1.5f*1000; ndS(2,3,2,2)=-7.2f; // CG/CC + ndH(2,4,2,1)=0.0f*1000; ndS(2,4,2,1)=-4.4f; // CT/CA + ndH(3,2,2,2)=3.6f*1000; ndS(3,2,2,2)=8.9f; // GC/CC + ndH(4,2,1,2)=6.1f*1000; ndS(4,2,1,2)=16.4f; // TC/AC + + // G-G Mismatches + ndH(1,3,4,3)=-3.1f*1000; ndS(1,3,4,3)=-9.5f; // AG/TG + ndH(2,3,3,3)=-4.9f*1000; ndS(2,3,3,3)=-15.3f; // CG/GG + ndH(3,1,3,4)=1.6f*1000; ndS(3,1,3,4)=3.6f; // GA/GT + ndH(3,2,3,3)=-6.0f*1000; ndS(3,2,3,3)=-15.8f; // GC/GG + ndH(3,3,2,3)=-6.0f*1000; ndS(3,3,2,3)=-15.8f; // GG/CG + ndH(3,3,3,2)=-4.9f*1000; ndS(3,3,3,2)=-15.3f; // GG/GC + ndH(3,4,3,1)=-3.1f*1000; ndS(3,4,3,1)=-9.5f; // GT/GA + ndH(4,3,1,3)=1.6f*1000; ndS(4,3,1,3)=3.6f; // TG/AG + + // T-T Mismatches + ndH(1,4,4,4)=-2.7f*1000; ndS(1,4,4,4)=-10.8f; // AT/TT + ndH(2,4,3,4)=-5.0f*1000; ndS(2,4,3,4)=-15.8f; // CT/GT + ndH(3,4,2,4)=-2.2f*1000; ndS(3,4,2,4)=-8.4f; // GT/CT + ndH(4,1,4,4)=0.2f*1000; ndS(4,1,4,4)=-1.5f; // TA/TT + ndH(4,2,4,3)=-2.2f*1000; ndS(4,2,4,3)=-8.4f; // TC/TG + ndH(4,3,4,2)=-5.0f*1000; ndS(4,3,4,2)=-15.8f; // TG/TC + ndH(4,4,1,4)=0.2f*1000; ndS(4,4,1,4)=-1.5f; // TT/AT + ndH(4,4,4,1)=-2.7f*1000; ndS(4,4,4,1)=-10.8f; // TT/TA + + // Dangling Ends + ndH(5,1,1,4)=-0.7f*1000; ndS(5,1,1,4)=-0.8f; // $A/AT + ndH(5,1,2,4)=4.4f*1000; ndS(5,1,2,4)=14.9f; // $A/CT + ndH(5,1,3,4)=-1.6f*1000; ndS(5,1,3,4)=-3.6f; // $A/GT + ndH(5,1,4,4)=2.9f*1000; ndS(5,1,4,4)=10.4f; // $A/TT + ndH(5,2,1,3)=-2.1f*1000; ndS(5,2,1,3)=-3.9f; // $C/AG + ndH(5,2,2,3)=-0.2f*1000; ndS(5,2,2,3)=-0.1f; // $C/CG + ndH(5,2,3,3)=-3.9f*1000; ndS(5,2,3,3)=-11.2f; // $C/GG + ndH(5,2,4,3)=-4.4f*1000; ndS(5,2,4,3)=-13.1f; // $C/TG + ndH(5,3,1,2)=-5.9f*1000; ndS(5,3,1,2)=-16.5f; // $G/AC + ndH(5,3,2,2)=-2.6f*1000; ndS(5,3,2,2)=-7.4f; // $G/CC + ndH(5,3,3,2)=-3.2f*1000; ndS(5,3,3,2)=-10.4f; // $G/GC + ndH(5,3,4,2)=-5.2f*1000; ndS(5,3,4,2)=-15.0f; // $G/TC + ndH(5,4,1,1)=-0.5f*1000; ndS(5,4,1,1)=-1.1f; // $T/AA + ndH(5,4,2,1)=4.7f*1000; ndS(5,4,2,1)=14.2f; // $T/CA + ndH(5,4,3,1)=-4.1f*1000; ndS(5,4,3,1)=-13.1f; // $T/GA + ndH(5,4,4,1)=-3.8f*1000; ndS(5,4,4,1)=-12.6f; // $T/TA + ndH(1,5,4,1)=-2.9f*1000; ndS(1,5,4,1)=-7.6f; // A$/TA + ndH(1,5,4,2)=-4.1f*1000; ndS(1,5,4,2)=-13.0f; // A$/TC + ndH(1,5,4,3)=-4.2f*1000; ndS(1,5,4,3)=-15.0f; // A$/TG + ndH(1,5,4,4)=-0.2f*1000; ndS(1,5,4,4)=-0.5f; // A$/TT + ndH(1,1,5,4)=0.2f*1000; ndS(1,1,5,4)=2.3f; // AA/$T + ndH(1,1,4,5)=-0.5f*1000; ndS(1,1,4,5)=-1.1f; // AA/T$ + ndH(1,2,5,3)=-6.3f*1000; ndS(1,2,5,3)=-17.1f; // AC/$G + ndH(1,2,4,5)=4.7f*1000; ndS(1,2,4,5)=14.2f; // AC/T$ + ndH(1,3,5,2)=-3.7f*1000; ndS(1,3,5,2)=-10.0f; // AG/$C + ndH(1,3,4,5)=-4.1f*1000; ndS(1,3,4,5)=-13.1f; // AG/T$ + ndH(1,4,5,1)=-2.9f*1000; ndS(1,4,5,1)=-7.6f; // AT/$A + ndH(1,4,4,5)=-3.8f*1000; ndS(1,4,4,5)=-12.6f; // AT/T$ + ndH(2,5,3,1)=-3.7f*1000; ndS(2,5,3,1)=-10.0f; // C$/GA + ndH(2,5,3,2)=-4.0f*1000; ndS(2,5,3,2)=-11.9f; // C$/GC + ndH(2,5,3,3)=-3.9f*1000; ndS(2,5,3,3)=-10.9f; // C$/GG + ndH(2,5,3,4)=-4.9f*1000; ndS(2,5,3,4)=-13.8f; // C$/GT + ndH(2,1,5,4)=0.6f*1000; ndS(2,1,5,4)=3.3f; // CA/$T + ndH(2,1,3,5)=-5.9f*1000; ndS(2,1,3,5)=-16.5f; // CA/G$ + ndH(2,2,5,3)=-4.4f*1000; ndS(2,2,5,3)=-12.6f; // CC/$G + ndH(2,2,3,5)=-2.6f*1000; ndS(2,2,3,5)=-7.4f; // CC/G$ + ndH(2,3,5,2)=-4.0f*1000; ndS(2,3,5,2)=-11.9f; // CG/$C + ndH(2,3,3,5)=-3.2f*1000; ndS(2,3,3,5)=-10.4f; // CG/G$ + ndH(2,4,5,1)=-4.1f*1000; ndS(2,4,5,1)=-13.0f; // CT/$A + ndH(2,4,3,5)=-5.2f*1000; ndS(2,4,3,5)=-15.0f; // CT/G$ + ndH(3,5,2,1)=-6.3f*1000; ndS(3,5,2,1)=-17.1f; // G$/CA + ndH(3,5,2,2)=-4.4f*1000; ndS(3,5,2,2)=-12.6f; // G$/CC + ndH(3,5,2,3)=-5.1f*1000; ndS(3,5,2,3)=-14.0f; // G$/CG + ndH(3,5,2,4)=-4.0f*1000; ndS(3,5,2,4)=-10.9f; // G$/CT + ndH(3,1,5,4)=-1.1f*1000; ndS(3,1,5,4)=-1.6f; // GA/$T + ndH(3,1,2,5)=-2.1f*1000; ndS(3,1,2,5)=-3.9f; // GA/C$ + ndH(3,2,5,3)=-5.1f*1000; ndS(3,2,5,3)=-14.0f; // GC/$G + ndH(3,2,2,5)=-0.2f*1000; ndS(3,2,2,5)=-0.1f; // GC/C$ + ndH(3,3,5,2)=-3.9f*1000; ndS(3,3,5,2)=-10.9f; // GG/$C + ndH(3,3,2,5)=-3.9f*1000; ndS(3,3,2,5)=-11.2f; // GG/C$ + ndH(3,4,5,1)=-4.2f*1000; ndS(3,4,5,1)=-15.0f; // GT/$A + ndH(3,4,2,5)=-4.4f*1000; ndS(3,4,2,5)=-13.1f; // GT/C$ + ndH(4,5,1,1)=0.2f*1000; ndS(4,5,1,1)=2.3f; // T$/AA + ndH(4,5,1,2)=0.6f*1000; ndS(4,5,1,2)=3.3f; // T$/AC + ndH(4,5,1,3)=-1.1f*1000; ndS(4,5,1,3)=-1.6f; // T$/AG + ndH(4,5,1,4)=-6.9f*1000; ndS(4,5,1,4)=-20.0f; // T$/AT + ndH(4,1,5,4)=-6.9f*1000; ndS(4,1,5,4)=-20.0f; // TA/$T + ndH(4,1,1,5)=-0.7f*1000; ndS(4,1,1,5)=-0.7f; // TA/A$ + ndH(4,2,5,3)=-4.0f*1000; ndS(4,2,5,3)=-10.9f; // TC/$G + ndH(4,2,1,5)=4.4f*1000; ndS(4,2,1,5)=14.9f; // TC/A$ + ndH(4,3,5,2)=-4.9f*1000; ndS(4,3,5,2)=-13.8f; // TG/$C + ndH(4,3,1,5)=-1.6f*1000; ndS(4,3,1,5)=-3.6f; // TG/A$ + ndH(4,4,5,1)=-0.2f*1000; ndS(4,4,5,1)=-0.5f; // TT/$A + ndH(4,4,1,5)=2.9f*1000; ndS(4,4,1,5)=10.4f; // TT/A$ + + return; +} + +int nparam_CountGCContent(char * seq ) { + int lseq = strlen(seq); + int k; + double count = 0; + for( k=0;krlogc; + double mtemp; + char c1; + char c2; + char c3; + char c4; + unsigned int i; + char nseq[50]; + char *useq = seq; + + nparam_CleanSeq (seq, nseq, len); + if (!nseq[0]) + return NaN; + useq = nseq; + + for ( i=1;idH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); + thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); + } + //printf("------------------\n"); + mtemp = nparam_CalcTM(thedS,thedH); + //fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s rloc=%f\n", thedH, thedS, useq, nparm->rlogc); + //exit (0); + return mtemp; +} + +double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len) +{ + const unsigned long long minus1 = 0xFFFFFFFFFFFFFFFFLLU; + const double NaN = *((double*)&minus1); + double thedH = 0; + //double thedS = nparam_GetInitialEntropy(nparm); + double thedS = -5.9f+nparm->rlogc; + double mtemp; + char c1; + char c2; + char c3; + char c4; + unsigned int i; + char nseq1[50]; + char nseq2[50]; + char *useq1; + char *useq2; + + nparam_CleanSeq (seq1, nseq1, len); + if (!nseq1[0]) + return NaN; + useq1 = nseq1; + + nparam_CleanSeq (seq2, nseq2, len); + if (!nseq2[0]) + return NaN; + useq2 = nseq2; + + //fprintf (stderr,"Primer : %s\n",useq); + for ( i=1;idH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); + thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); + } + //fprintf(stderr,"------------------\n"); + mtemp = nparam_CalcTM(thedS,thedH); + //if (mtemp == 0) + //{ + // fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s\n", thedH, thedS, useq); + //exit (0); + //} + return mtemp; +} + +double calculateMeltingTemperatureBasic (char * seq) { + int gccount; + double temp; + int seqlen; + + seqlen = strlen (seq); + gccount = nparam_CountGCContent (seq); + temp = 64.9 + 41*(gccount - 16.4)/seqlen; + return temp; +} diff --git a/src/libecoPCR/libthermo/nnparams.h b/src/libecoPCR/libthermo/nnparams.h new file mode 100644 index 0000000..1ed168f --- /dev/null +++ b/src/libecoPCR/libthermo/nnparams.h @@ -0,0 +1,62 @@ +/* + * nnparams.h + * PHunterLib + * + * Nearest Neighbor Model Parameters + * + * Created by Tiayyba Riaz on 02/07/09. + * + */ + +#ifndef NNPARAMS_H_ +#define NNPARAMS_H_ + +#include +#include + +// following defines to simplify coding... +#define ndH(a,b,c,d) nparm->dH[(int)a][(int)b][(int)c][(int)d] +#define ndS(a,b,c,d) nparm->dS[(int)a][(int)b][(int)c][(int)d] +#define forbidden_enthalpy 1000000000000000000.0f +#define R 1.987f +#define SALT_METHOD_SANTALUCIA 1 +#define SALT_METHOD_OWCZARZY 2 + +#define DEF_CONC_PRIMERS 0.0000008 +#define DEF_CONC_SEQUENCES 0 +#define DEF_SALT 0.05 + +#define GETNUMCODE(a) bpencoder[a - 'A'] +#define GETREVCODE(a) 5-bpencoder[a - 'A'] + + +extern double forbidden_entropy; + + +typedef struct CNNParams_st +{ + double Ct1; + double Ct2; + double rlogc; + double kplus; + double kfac; + int saltMethod; + double gcContent; + double new_TM; + double dH[6][6][6][6]; // A-C-G-T + gap + initiation (dangling end, $ sign) + double dS[6][6][6][6]; +}CNNParams, * PNNParams; + +void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm); +int nparam_CountGCContent(char * seq ); +double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1); +double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1); +double nparam_CalcTM(double entropy,double enthalpy); +double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len); +double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len); + +double nparam_GetInitialEntropy(PNNParams nparm) ; +double calculateMeltingTemperatureBasic (char * seq); +//void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options); + +#endif diff --git a/src/obi_ecopcr.c b/src/obi_ecopcr.c new file mode 100644 index 0000000..1e4cbc4 --- /dev/null +++ b/src/obi_ecopcr.c @@ -0,0 +1,1229 @@ +/**************************************************************************** + * In silico PCR * + ****************************************************************************/ + +/** + * @file obi_ecopcr.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) from Eric Coissac's original code (eric.coissac@metabarcoding.org) + * @date June 5th 2018 + * @brief ecoPCR works as an in silico PCR that preserves the taxonomic information of the selected sequences, and allows various specified conditions for the in silico amplification. + */ + +#include +#include +#include +#include +#include + +#include "libecoPCR/ecoPCR.h" +#include "libecoPCR/libthermo/nnparams.h" + +#include "obi_ecopcr.h" +#include "obidms.h" +#include "obidms_taxonomy.h" +#include "obiview.h" +#include "obidebug.h" +#include "obierrno.h" +#include "obitypes.h" +#include "obiview.h" + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + +/** + * Internal function creating the output columns for the ecopcr algorithm. + * + * @param o_view A pointer on the output view. + * @param kingdom_mode Whether the kingdom or the superkingdom informations should be printed to the output. + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since July 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int create_output_columns(Obiview_p o_view, bool kingdom_mode); + + +/** + * Internal function printing a found amplicon with all its associated informations. + * + * @param i_view A pointer on the input view. + * @param o_view A pointer on the output view. + * @param i_idx The line index of the sequence in the input view. + * @param o_idx The line index where the amplicon should be written in the output view. + * @param taxonomy A pointer on a taxonomy structure. + * @param sequence The original full length sequence. + * @param taxid The taxid associated with the amplicon. + * @param seq_len The length of the original full length sequence. + * @param amplicon_len The length of the amplicon. + * @param primer1 The first primer. + * @param primer2 The second primer. + * @param tparm The structure with the PCR temperature informations. + * @param o1 The structure with the first primer's data. + * @param o2 The structure with the second primer's data. + * @param pos1 The start position of the amplicon. + * @param pos2 The end position of the amplicon. + * @param err1 The number of errors in the first primer. + * @param err2 The number of errors in the second primer. + * @param strand The DNA strand direction of the amplicon (R(everse) or D(irect)). + * @param kingdom_mode Whether the kingdom or the superkingdom informations should be printed to the output. + * @param keep_nucleotides Number of nucleotides kept on each side of the amplicon. + * @param i_id_column A pointer on the input sequence identifier column. + * @param o_id_column A pointer on the output sequence identifier column. + * @param o_ori_seq_len_column A pointer on the original sequence length column. + * @param o_amplicon_column A pointer on the output sequence column. + * @param o_amplicon_length_column A pointer on the output amplicon length column. + * @param o_taxid_column A pointer on the output taxid column. + * @param o_rank_column A pointer on the output taxonomic rank column. + * @param o_name_column A pointer on the output scientific name column. + * @param o_species_taxid_column A pointer on the output species taxid column. + * @param o_species_name_column A pointer on the output species name column. + * @param o_genus_taxid_column A pointer on the output genus taxid column. + * @param o_genus_name_column A pointer on the output genus name column. + * @param o_family_taxid_column A pointer on the output family taxid column. + * @param o_family_name_column A pointer on the output family name column. + * @param o_kingdom_taxid_column A pointer on the output kingdom taxid column. + * @param o_kingdom_name_column A pointer on the output kingdom name column. + * @param o_superkingdom_taxid_column A pointer on the output superkingdom taxid column. + * @param o_superkingdom_name_column A pointer on the output superkingdom name column. + * @param o_strand_column A pointer on the output strand direction column. + * @param o_primer1_column A pointer on the output first primer column. + * @param o_primer2_column A pointer on the output second primer column. + * @param o_error1_column A pointer on the output column for the error count in the first primer. + * @param o_error2_column A pointer on the output column for the error count in the second primer. + * @param o_temp1_column A pointer on the output column for the temperature for the first primer. + * @param o_temp2_column A pointer on the output column for the temperature for the second primer. + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since July 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int print_seq(Obiview_p i_view, Obiview_p o_view, + index_t i_idx, index_t o_idx, + OBIDMS_taxonomy_p taxonomy, + char* sequence, + obiint_t taxid, + int seq_len, + int32_t amplicon_len, + const char* primer1, const char* primer2, + PNNParams tparm, + PatternPtr o1, PatternPtr o2, + int32_t pos1, int32_t pos2, + int32_t err1, int32_t err2, + char strand, bool kingdom_mode, + int keep_nucleotides, + OBIDMS_column_p i_id_column, OBIDMS_column_p o_id_column, OBIDMS_column_p o_ori_seq_len_column, + OBIDMS_column_p o_amplicon_column, OBIDMS_column_p o_amplicon_length_column, + OBIDMS_column_p o_taxid_column, OBIDMS_column_p o_rank_column, OBIDMS_column_p o_name_column, + OBIDMS_column_p o_species_taxid_column, OBIDMS_column_p o_species_name_column, + OBIDMS_column_p o_genus_taxid_column, OBIDMS_column_p o_genus_name_column, + OBIDMS_column_p o_family_taxid_column, OBIDMS_column_p o_family_name_column, + OBIDMS_column_p o_kingdom_taxid_column, OBIDMS_column_p o_kingdom_name_column, + OBIDMS_column_p o_superkingdom_taxid_column, OBIDMS_column_p o_superkingdom_name_column, + OBIDMS_column_p o_strand_column, + OBIDMS_column_p o_primer1_column, OBIDMS_column_p o_primer2_column, + OBIDMS_column_p o_error1_column, OBIDMS_column_p o_error2_column, + OBIDMS_column_p o_temp1_column, OBIDMS_column_p o_temp2_column); + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + +static int create_output_columns(Obiview_p o_view, bool kingdom_mode) +{ + // Original length column + if (obi_view_add_column(o_view, ECOPCR_SEQLEN_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_SEQLEN_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_SEQLEN_COLUMN_NAME); + return -1; + } + + // Amplicon length column + if (obi_view_add_column(o_view, ECOPCR_AMPLICONLEN_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_AMPLICONLEN_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_AMPLICONLEN_COLUMN_NAME); + return -1; + } + + // Taxid column + if (obi_view_add_column(o_view, TAXID_COLUMN, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, TAXID_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", TAXID_COLUMN); + return -1; + } + + // Taxonomic rank column + if (obi_view_add_column(o_view, ECOPCR_RANK_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_RANK_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_RANK_COLUMN_NAME); + return -1; + } + + // Species taxid column + if (obi_view_add_column(o_view, ECOPCR_SPECIES_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_SPECIES_TAXID_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_SPECIES_TAXID_COLUMN_NAME); + return -1; + } + + // Genus taxid column + if (obi_view_add_column(o_view, ECOPCR_GENUS_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_GENUS_TAXID_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_GENUS_TAXID_COLUMN_NAME); + return -1; + } + + // Family taxid column + if (obi_view_add_column(o_view, ECOPCR_FAMILY_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_FAMILY_TAXID_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_FAMILY_TAXID_COLUMN_NAME); + return -1; + } + + if (kingdom_mode) + { + // Kingdom taxid column + if (obi_view_add_column(o_view, ECOPCR_KINGDOM_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_KINGDOM_TAXID_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_KINGDOM_TAXID_COLUMN_NAME); + return -1; + } + } + else + { + // Superkingdom taxid column + if (obi_view_add_column(o_view, ECOPCR_SUPERKINGDOM_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_SUPERKINGDOM_TAXID_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_SUPERKINGDOM_TAXID_COLUMN_NAME); + return -1; + } + } + + // Scientific name column + if (obi_view_add_column(o_view, ECOPCR_SCIENTIFIC_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_SCIENTIFIC_NAME_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_SCIENTIFIC_NAME_COLUMN_NAME); + return -1; + } + + // Species name column + if (obi_view_add_column(o_view, ECOPCR_SPECIES_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_SPECIES_NAME_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_SPECIES_NAME_COLUMN_NAME); + return -1; + } + + // Genus name column + if (obi_view_add_column(o_view, ECOPCR_GENUS_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_GENUS_NAME_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_GENUS_NAME_COLUMN_NAME); + return -1; + } + + // Family name column + if (obi_view_add_column(o_view, ECOPCR_FAMILY_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_FAMILY_NAME_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_FAMILY_NAME_COLUMN_NAME); + return -1; + } + + if (kingdom_mode) + { + // Kingdom name column + if (obi_view_add_column(o_view, ECOPCR_KINGDOM_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_KINGDOM_NAME_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_KINGDOM_NAME_COLUMN_NAME); + return -1; + } + } + else + { + // Superkingdom name column + if (obi_view_add_column(o_view, ECOPCR_SUPERKINGDOM_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_SUPERKINGDOM_NAME_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_SUPERKINGDOM_NAME_COLUMN_NAME); + return -1; + } + } + + // Strand column + if (obi_view_add_column(o_view, ECOPCR_STRAND_COLUMN_NAME, -1, NULL, OBI_CHAR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_STRAND_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_STRAND_COLUMN_NAME); + return -1; + } + + // Primer 1 column + if (obi_view_add_column(o_view, ECOPCR_PRIMER1_COLUMN_NAME, -1, NULL, OBI_SEQ, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_PRIMER1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_PRIMER1_COLUMN_NAME); + return -1; + } + + // Primer 2 column + if (obi_view_add_column(o_view, ECOPCR_PRIMER2_COLUMN_NAME, -1, NULL, OBI_SEQ, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_PRIMER2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_PRIMER2_COLUMN_NAME); + return -1; + } + + // Error 1 column + if (obi_view_add_column(o_view, ECOPCR_ERROR1_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_ERROR1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_ERROR1_COLUMN_NAME); + return -1; + } + + // Error 2 column + if (obi_view_add_column(o_view, ECOPCR_ERROR2_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_ERROR2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_ERROR2_COLUMN_NAME); + return -1; + } + + // Temperature 1 column + if (obi_view_add_column(o_view, ECOPCR_TEMP1_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_TEMP1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_TEMP1_COLUMN_NAME); + return -1; + } + + // Temperature 2 column + if (obi_view_add_column(o_view, ECOPCR_TEMP2_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOPCR_TEMP2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", ECOPCR_TEMP2_COLUMN_NAME); + return -1; + } + + return 0; +} + + +static int print_seq(Obiview_p i_view, Obiview_p o_view, + index_t i_idx, index_t o_idx, + OBIDMS_taxonomy_p taxonomy, + char* sequence, + obiint_t taxid, + int seq_len, + int32_t amplicon_len, + const char* primer1, const char* primer2, + PNNParams tparm, + PatternPtr o1, PatternPtr o2, + int32_t pos1, int32_t pos2, + int32_t err1, int32_t err2, + char strand, bool kingdom_mode, + int keep_nucleotides, + OBIDMS_column_p i_id_column, OBIDMS_column_p o_id_column, OBIDMS_column_p o_ori_seq_len_column, + OBIDMS_column_p o_amplicon_column, OBIDMS_column_p o_amplicon_length_column, + OBIDMS_column_p o_taxid_column, OBIDMS_column_p o_rank_column, OBIDMS_column_p o_name_column, + OBIDMS_column_p o_species_taxid_column, OBIDMS_column_p o_species_name_column, + OBIDMS_column_p o_genus_taxid_column, OBIDMS_column_p o_genus_name_column, + OBIDMS_column_p o_family_taxid_column, OBIDMS_column_p o_family_name_column, + OBIDMS_column_p o_kingdom_taxid_column, OBIDMS_column_p o_kingdom_name_column, + OBIDMS_column_p o_superkingdom_taxid_column, OBIDMS_column_p o_superkingdom_name_column, + OBIDMS_column_p o_strand_column, + OBIDMS_column_p o_primer1_column, OBIDMS_column_p o_primer2_column, + OBIDMS_column_p o_error1_column, OBIDMS_column_p o_error2_column, + OBIDMS_column_p o_temp1_column, OBIDMS_column_p o_temp2_column) +{ + const char* seq_id; + ecotx_t* main_taxon; + ecotx_t* taxon; + int32_t rank_idx; + const char* rank_label; + + char oligo1[MAX_PAT_LEN+1] = {0}; + char oligo2[MAX_PAT_LEN+1] = {0}; + + int32_t error1; + int32_t error2; + int32_t ldelta, rdelta; + int32_t len_with_primers; + + char* amplicon = NULL; + double tm1,tm2; + double tm=0; + + int32_t i; + + ldelta = (pos1 <= keep_nucleotides)?pos1:keep_nucleotides; + rdelta = ((pos2+keep_nucleotides)>=seq_len)?seq_len-pos2:keep_nucleotides; + + amplicon = getSubSequence(sequence, pos1-ldelta, pos2+rdelta); + len_with_primers = amplicon_len + o1->patlen + o2->patlen; + + if (strand == 'R') + { + ecoComplementSequence(amplicon); + + strncpy(oligo1, amplicon + rdelta, o2->patlen); + + oligo1[o2->patlen] = 0; + error1 = err2; + + strncpy(oligo2, amplicon + rdelta + len_with_primers - o1->patlen, o1->patlen); + oligo2[o1->patlen] = 0; + error2 = err1; + + if (keep_nucleotides == 0) + amplicon+=o2->patlen; + else + { + keep_nucleotides = ldelta; + ldelta = rdelta+o2->patlen; + rdelta = keep_nucleotides+o1->patlen; + } + } + else /* strand == 'D' */ + { + strncpy(oligo1, amplicon+ldelta, o1->patlen); + oligo1[o1->patlen] = 0; + error1 = err1; + + strncpy(oligo2, amplicon + ldelta + len_with_primers - o2->patlen, o2->patlen); + oligo2[o2->patlen] = 0; + error2 = err2; + + if (keep_nucleotides==0) + amplicon+=o1->patlen; + else + { + ldelta+=o1->patlen; + rdelta+=o2->patlen; + } + } + + ecoComplementSequence(oligo2); + if (keep_nucleotides == 0) + amplicon[amplicon_len]=0; + else + { + amplicon_len = ldelta+rdelta+amplicon_len; + for (i=0; ipatlen) - 273.15; + tm2=nparam_CalcTwoTM(tparm,oligo2,primer2,o2->patlen) - 273.15; + tm = (tm1 < tm2) ? tm1:tm2; + + // Get the taxon structure + main_taxon = obi_taxo_get_taxon_with_taxid(taxonomy, taxid); + if (main_taxon == NULL) + { + obidebug(1, "\nError reading the taxonomic information of a sequence"); + return -1; + } + + // Write sequence id + seq_id = obi_get_str_with_elt_idx_and_col_p_in_view(i_view, i_id_column, i_idx, 0); + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_id_column, o_idx, 0, seq_id) < 0) + { + obidebug(1, "\nError writing the sequence id"); + return -1; + } + + // Write original sequence length + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_ori_seq_len_column, o_idx, 0, seq_len) < 0) + { + obidebug(1, "\nError writing the original sequence length"); + return -1; + } + + // Write the amplicon itself + if (obi_set_seq_with_elt_idx_and_col_p_in_view(o_view, o_amplicon_column, o_idx, 0, amplicon) < 0) + { + obidebug(1, "\nError writing the amplicon"); + return -1; + } + + // Write the amplicon length + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_amplicon_length_column, o_idx, 0, amplicon_len) < 0) + { + obidebug(1, "\nError writing the amplicon length"); + return -1; + } + + // Write taxonomic rank + rank_idx = main_taxon->rank; + rank_label = obi_taxo_rank_index_to_label(rank_idx, taxonomy->ranks); + if (rank_label == NULL) + { + obidebug(1, "\nError reading the taxonomic rank"); + return -1; + } + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_rank_column, o_idx, 0, rank_label) < 0) + { + obidebug(1, "\nError writing the taxonomic rank"); + return -1; + } + + // Write taxid + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_taxid_column, o_idx, 0, taxid) < 0) + { + obidebug(1, "\nError writing the taxid"); + return -1; + } + + // Write scientific name + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_name_column, o_idx, 0, main_taxon->name) < 0) + { + obidebug(1, "\nError writing the scientific name"); + return -1; + } + + // Write species informations if available + taxon = obi_taxo_get_species(main_taxon, taxonomy); + if (taxon) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_species_taxid_column, o_idx, 0, taxon->taxid) < 0) + { + obidebug(1, "\nError writing the species taxid"); + return -1; + } + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_species_name_column, o_idx, 0, taxon->name) < 0) + { + obidebug(1, "\nError writing the species name"); + return -1; + } + } + + // Write genus informations if available + taxon = obi_taxo_get_genus(main_taxon, taxonomy); + if (taxon) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_genus_taxid_column, o_idx, 0, taxon->taxid) < 0) + { + obidebug(1, "\nError writing the genus taxid"); + return -1; + } + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_genus_name_column, o_idx, 0, taxon->name) < 0) + { + obidebug(1, "\nError writing the genus name"); + return -1; + } + } + + // Write family informations if available + taxon = obi_taxo_get_family(main_taxon, taxonomy); + if (taxon) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_family_taxid_column, o_idx, 0, taxon->taxid) < 0) + { + obidebug(1, "\nError writing the family taxid"); + return -1; + } + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_family_name_column, o_idx, 0, taxon->name) < 0) + { + obidebug(1, "\nError writing the family name"); + return -1; + } + } + + // Write kingdom or superkingdom informations if available + if (kingdom_mode) + { + taxon = obi_taxo_get_kingdom(main_taxon, taxonomy); + if (taxon) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_kingdom_taxid_column, o_idx, 0, taxon->taxid) < 0) + { + obidebug(1, "\nError writing the kingdom taxid"); + return -1; + } + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_kingdom_name_column, o_idx, 0, taxon->name) < 0) + { + obidebug(1, "\nError writing the kingdom name"); + return -1; + } + } + } + else + { + taxon = obi_taxo_get_superkingdom(main_taxon, taxonomy); + if (taxon) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_superkingdom_taxid_column, o_idx, 0, taxon->taxid) < 0) + { + obidebug(1, "\nError writing the superkingdom taxid"); + return -1; + } + if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_superkingdom_name_column, o_idx, 0, taxon->name) < 0) + { + obidebug(1, "\nError writing the superkingdom name"); + return -1; + } + } + } + + // Write strand + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, o_strand_column, o_idx, 0, strand) < 0) + { + obidebug(1, "\nError writing the strand"); + return -1; + } + + // Write primer1 + if (obi_set_seq_with_elt_idx_and_col_p_in_view(o_view, o_primer1_column, o_idx, 0, oligo1) < 0) + { + obidebug(1, "\nError writing the first primer: >%s<", oligo1); + return -1; + } + + // Write primer2 + if (obi_set_seq_with_elt_idx_and_col_p_in_view(o_view, o_primer2_column, o_idx, 0, oligo2) < 0) + { + obidebug(1, "\nError writing the second primer"); + return -1; + } + + // Write error1 + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_error1_column, o_idx, 0, error1) < 0) + { + obidebug(1, "\nError writing the first error count"); + return -1; + } + + // Write error2 + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, o_error2_column, o_idx, 0, error2) < 0) + { + obidebug(1, "\nError writing the second error count"); + return -1; + } + + // Write temp1 + if (obi_set_float_with_elt_idx_and_col_p_in_view(o_view, o_temp1_column, o_idx, 0, tm1) < 0) + { + obidebug(1, "\nError writing the first temperature"); + return -1; + } + + // Write temp2 + if (obi_set_float_with_elt_idx_and_col_p_in_view(o_view, o_temp2_column, o_idx, 0, tm2) < 0) + { + obidebug(1, "\nError writing the second temperature"); + return -1; + } + + return 0; +} + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + +int obi_ecopcr(const char* i_dms_name, + const char* i_view_name, + const char* taxonomy_name, // TODO discuss that input dms assumed + const char* o_dms_name, + const char* o_view_name, + const char* o_view_comments, + const char* primer1, + const char* primer2, + int error_max, + int min_len, + int max_len, + int32_t* restrict_to_taxids, + int32_t* ignore_taxids, + bool circular, + double salt, + int saltmethod, + int keep_nucleotides, + bool kingdom_mode) +{ + + index_t i_idx; + index_t o_idx; + int r,g; + float p; + + CNNParams tparm; + + PatternPtr o1; + PatternPtr o2; + PatternPtr o1c; + PatternPtr o2c; + + double tm,tm1,tm2; + + OBIDMS_p i_dms; + OBIDMS_p o_dms; + OBIDMS_taxonomy_p taxonomy; + Obiview_p i_view = NULL; + Obiview_p o_view = NULL; + OBIDMS_column_p i_seq_column, i_taxid_column, \ + i_id_column, o_id_column, \ + o_ori_seq_len_column, o_amplicon_column, o_amplicon_length_column, \ + o_taxid_column, o_rank_column, o_name_column, \ + o_species_taxid_column, o_species_name_column, \ + o_genus_taxid_column, o_genus_name_column, \ + o_family_taxid_column, o_family_name_column, \ + o_kingdom_taxid_column, o_kingdom_name_column, \ + o_superkingdom_taxid_column, o_superkingdom_name_column, \ + o_strand_column, \ + o_primer1_column, o_primer2_column, \ + o_error1_column, o_error2_column, \ + o_temp1_column, o_temp2_column = NULL; + + index_t seq_count; + + index_t checkedSequence = 0; + index_t positiveSequence = 0; + index_t ampliconCount = 0; + + obiint_t taxid; + char* sequence; + + SeqPtr apatseq=NULL; + int32_t o1Hits; + int32_t o2Hits; + int32_t o1cHits; + int32_t o2cHits; + int32_t length; + int32_t begin; + StackiPtr stktmp; + int32_t i; + int32_t j; + int32_t posi; + int32_t posj; + int32_t erri; + int32_t errj; + + if (circular) + { + circular = strlen(primer1); + if (strlen(primer2)>(size_t)circular) + circular = strlen(primer2); + } + + nparam_InitParams(&tparm, + DEF_CONC_PRIMERS, + DEF_CONC_PRIMERS, + salt, + saltmethod); + + if (!primer1 || !primer2) + { + obi_set_errno(OBI_ECOPCR_ERROR); + obidebug(1, "\nError: first and/or second primer(s) not specified (%s, %s)", primer1, primer2); + return -1; + } + + o1 = buildPattern(primer1, error_max); + o2 = buildPattern(primer2, error_max); + + o1c = complementPattern(o1); + o2c = complementPattern(o2); + + tm1 = nparam_CalcSelfTM(&tparm,o1->cpat,o1->patlen) - 273.15; + tm2 = nparam_CalcSelfTM(&tparm,o2->cpat,o2->patlen) - 273.15; + tm = (tm1 < tm2) ? tm1:tm2; + + // Open input DMS + i_dms = obi_open_dms(i_dms_name); + if (i_dms == NULL) + { + obidebug(1, "\nError opening the input DMS"); + return -1; + } + + // Open input sequence view + i_view = obi_open_view(i_dms, i_view_name); + if (i_view == NULL) + { + obidebug(1, "\nError opening the input view"); + return -1; + } + + // Open output DMS + o_dms = obi_open_dms(o_dms_name); + if (o_dms == NULL) + { + obidebug(1, "\nError opening the output DMS"); + return -1; + } + + // Create output view + o_view = obi_new_view_nuc_seqs(o_dms, o_view_name, NULL, NULL, o_view_comments, false); + if (o_view == NULL) + { + obidebug(1, "\nError creating the output view"); + return -1; + } + + // Create output columns + if (create_output_columns(o_view, kingdom_mode) < 0) + { + obidebug(1, "\nError creating the output columns"); + return -1; + } + + // Open all the input and output columns needed and keep pointers for efficiency + if (strcmp((i_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_seq_column = obi_view_get_column(i_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ECOPCR_ERROR); + obidebug(1, "\nError: no sequence column"); + return -1; + } + if (i_seq_column == NULL) + { + obidebug(1, "\nError getting the sequence column"); + return -1; + } + i_id_column = obi_view_get_column(i_view, ID_COLUMN); + if (i_id_column == NULL) + { + obidebug(1, "\nError getting the input id column"); + return -1; + } + i_taxid_column = obi_view_get_column(i_view, TAXID_COLUMN); + if (i_taxid_column == NULL) + { + obidebug(1, "\nError getting the taxid column"); + return -1; + } + o_id_column = obi_view_get_column(o_view, ID_COLUMN); + if (o_id_column == NULL) + { + obidebug(1, "\nError getting the output id column"); + return -1; + } + o_ori_seq_len_column = obi_view_get_column(o_view, ECOPCR_SEQLEN_COLUMN_NAME); + if (o_ori_seq_len_column == NULL) + { + obidebug(1, "\nError getting the output original sequence length column"); + return -1; + } + o_amplicon_column = obi_view_get_column(o_view, NUC_SEQUENCE_COLUMN); + if (o_amplicon_column == NULL) + { + obidebug(1, "\nError getting the output sequence column"); + return -1; + } + o_amplicon_length_column = obi_view_get_column(o_view, ECOPCR_AMPLICONLEN_COLUMN_NAME); + if (o_amplicon_length_column == NULL) + { + obidebug(1, "\nError getting the output amplicon length column"); + return -1; + } + o_taxid_column = obi_view_get_column(o_view, TAXID_COLUMN); + if (o_taxid_column == NULL) + { + obidebug(1, "\nError getting the output taxid column"); + return -1; + } + o_rank_column = obi_view_get_column(o_view, ECOPCR_RANK_COLUMN_NAME); + if (o_rank_column == NULL) + { + obidebug(1, "\nError getting the output rank column"); + return -1; + } + o_name_column = obi_view_get_column(o_view, ECOPCR_SCIENTIFIC_NAME_COLUMN_NAME); + if (o_name_column == NULL) + { + obidebug(1, "\nError getting the output scientific name column"); + return -1; + } + o_species_taxid_column = obi_view_get_column(o_view, ECOPCR_SPECIES_TAXID_COLUMN_NAME); + if (o_species_taxid_column == NULL) + { + obidebug(1, "\nError getting the output species taxid column"); + return -1; + } + o_species_name_column = obi_view_get_column(o_view, ECOPCR_SPECIES_NAME_COLUMN_NAME); + if (o_species_name_column == NULL) + { + obidebug(1, "\nError getting the output species name column"); + return -1; + } + o_genus_taxid_column = obi_view_get_column(o_view, ECOPCR_GENUS_TAXID_COLUMN_NAME); + if (o_genus_taxid_column == NULL) + { + obidebug(1, "\nError getting the output genus taxid column"); + return -1; + } + o_genus_name_column = obi_view_get_column(o_view, ECOPCR_GENUS_NAME_COLUMN_NAME); + if (o_genus_name_column == NULL) + { + obidebug(1, "\nError getting the output genus name column"); + return -1; + } + o_family_taxid_column = obi_view_get_column(o_view, ECOPCR_FAMILY_TAXID_COLUMN_NAME); + if (o_family_taxid_column == NULL) + { + obidebug(1, "\nError getting the output family taxid column"); + return -1; + } + o_family_name_column = obi_view_get_column(o_view, ECOPCR_FAMILY_NAME_COLUMN_NAME); + if (o_family_name_column == NULL) + { + obidebug(1, "\nError getting the output family name column"); + return -1; + } + if (kingdom_mode) + { + o_kingdom_taxid_column = obi_view_get_column(o_view, ECOPCR_KINGDOM_TAXID_COLUMN_NAME); + if (o_kingdom_taxid_column == NULL) + { + obidebug(1, "\nError getting the output kingdom taxid column"); + return -1; + } + o_kingdom_name_column = obi_view_get_column(o_view, ECOPCR_KINGDOM_NAME_COLUMN_NAME); + if (o_kingdom_name_column == NULL) + { + obidebug(1, "\nError getting the output kingdom name column"); + return -1; + } + } + else + { + o_superkingdom_taxid_column = obi_view_get_column(o_view, ECOPCR_SUPERKINGDOM_TAXID_COLUMN_NAME); + if (o_superkingdom_taxid_column == NULL) + { + obidebug(1, "\nError getting the output superkingdom taxid column"); + return -1; + } + o_superkingdom_name_column = obi_view_get_column(o_view, ECOPCR_SUPERKINGDOM_NAME_COLUMN_NAME); + if (o_superkingdom_name_column == NULL) + { + obidebug(1, "\nError getting the output superkingdom name column"); + return -1; + } + } + o_strand_column = obi_view_get_column(o_view, ECOPCR_STRAND_COLUMN_NAME); + if (o_strand_column == NULL) + { + obidebug(1, "\nError getting the output strand column"); + return -1; + } + o_primer1_column = obi_view_get_column(o_view, ECOPCR_PRIMER1_COLUMN_NAME); + if (o_primer1_column == NULL) + { + obidebug(1, "\nError getting the output primer1 column"); + return -1; + } + o_primer2_column = obi_view_get_column(o_view, ECOPCR_PRIMER2_COLUMN_NAME); + if (o_primer2_column == NULL) + { + obidebug(1, "\nError getting the output primer2 column"); + return -1; + } + o_error1_column = obi_view_get_column(o_view, ECOPCR_ERROR1_COLUMN_NAME); + if (o_error1_column == NULL) + { + obidebug(1, "\nError getting the output error1 column"); + return -1; + } + o_error2_column = obi_view_get_column(o_view, ECOPCR_ERROR2_COLUMN_NAME); + if (o_error2_column == NULL) + { + obidebug(1, "\nError getting the output error2 column"); + return -1; + } + o_temp1_column = obi_view_get_column(o_view, ECOPCR_TEMP1_COLUMN_NAME); + if (o_temp1_column == NULL) + { + obidebug(1, "\nError getting the output temp1 column"); + return -1; + } + o_temp2_column = obi_view_get_column(o_view, ECOPCR_TEMP2_COLUMN_NAME); + if (o_temp2_column == NULL) + { + obidebug(1, "\nError getting the output temp2 column"); + return -1; + } + + // Open the taxonomy + taxonomy = obi_read_taxonomy(i_dms, taxonomy_name, false); + if (taxonomy == NULL) + { + obidebug(1, "\nError opening the taxonomy"); + return -1; + } + + checkedSequence = 0; + positiveSequence= 0; + ampliconCount = 0; + + seq_count = (i_view->infos)->line_count; + + // Count number of restricted taxids + i=0; + while (restrict_to_taxids[i] != -1) + i++; + r=i; + + // Count number of ignored taxids + i=0; + while (ignore_taxids[i] != -1) + i++; + g=i; + + o_idx = 0; + for (i_idx=0; i_idxseqlen+apatseq->circular); + o2cHits= 0; + + if (o1Hits) + { + stktmp = apatseq->hitpos[0]; + begin = stktmp->val[0] + o1->patlen; + + if (max_len) + length = stktmp->val[stktmp->top-1] + o1->patlen - begin + max_len + o2->patlen; + else + length = apatseq->seqlen - begin; + + if (circular) + { + begin = 0; + length = apatseq->seqlen+circular; + } + + o2cHits = ManberAll(apatseq, o2c, 1, begin, length); + + if (o2cHits) + { + for (i=0; i < o1Hits; i++) + { + posi = apatseq->hitpos[0]->val[i]; + + if (posi < apatseq->seqlen) + { + erri = apatseq->hiterr[0]->val[i]; + for (j=0; j < o2cHits; j++) + { + posj = apatseq->hitpos[1]->val[j]; + + if (posj < apatseq->seqlen) + { + posj+=o2c->patlen; + errj = apatseq->hiterr[1]->val[j]; + length = 0; + if (posj > posi) + length = posj - posi - o1->patlen - o2->patlen; + if (posj < posi) + //length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen; // TODO hein???? + length = posi - posj - o1->patlen - o2->patlen; + if ((length>0) && // For when primers touch or overlap + (!min_len || (length >= min_len)) && + (!max_len || (length <= max_len))) + { + // Print the found amplicon + if (print_seq(i_view, o_view, + i_idx, o_idx, + taxonomy, + sequence, + taxid, + apatseq->seqlen, + length, + primer1, primer2, + &tparm, + o1, o2c, + posi, posj, + erri, errj, + 'D', kingdom_mode, + keep_nucleotides, + i_id_column, o_id_column, o_ori_seq_len_column, + o_amplicon_column, o_amplicon_length_column, + o_taxid_column, o_rank_column, o_name_column, + o_species_taxid_column, o_species_name_column, + o_genus_taxid_column, o_genus_name_column, + o_family_taxid_column, o_family_name_column, + o_kingdom_taxid_column, o_kingdom_name_column, + o_superkingdom_taxid_column, o_superkingdom_name_column, + o_strand_column, + o_primer1_column, o_primer2_column, + o_error1_column, o_error2_column, + o_temp1_column, o_temp2_column) < 0) + { + obidebug(1, "\nError writing the ecopcr result"); + return -1; + } + o_idx++; + } + } + } + } + } + } + } + + o2Hits = ManberAll(apatseq, o2, 2, 0, apatseq->seqlen); + o1cHits= 0; + + if (o2Hits) + { + stktmp = apatseq->hitpos[2]; + begin = stktmp->val[0] + o2->patlen; + + if (max_len) + length = stktmp->val[stktmp->top-1] + o2->patlen - begin + max_len + o1->patlen; + else + length = apatseq->seqlen - begin; + + if (circular) + { + begin = 0; + length = apatseq->seqlen+circular; + } + + o1cHits = ManberAll(apatseq, o1c, 3, begin, length); + + if (o1cHits) + { + for (i=0; i < o2Hits; i++) + { + posi = apatseq->hitpos[2]->val[i]; + + if (posi < apatseq->seqlen) + { + erri = apatseq->hiterr[2]->val[i]; + for (j=0; j < o1cHits; j++) + { + posj = apatseq->hitpos[3]->val[j]; + if (posj < apatseq->seqlen) + { + posj+=o1c->patlen; + errj=apatseq->hiterr[3]->val[j]; + + length = 0; + if (posj > posi) + //length = posj - posi + 1 - o2->patlen - o1->patlen; /* - o1->patlen : deleted by (prior to the OBITools3) */ TODO ???? + length = posj - posi - o2->patlen - o1->patlen; + if (posj < posi) + //length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen; TODO ???? + length = posi - posj - o2->patlen - o1->patlen; + if ((length>0) && // For when primers touch or overlap + (!min_len || (length >= min_len)) && + (!max_len || (length <= max_len))) + { + // Print the found amplicon + if (print_seq(i_view, o_view, + i_idx, o_idx, + taxonomy, + sequence, + taxid, + apatseq->seqlen, + length, + primer1, primer2, + &tparm, + o2, o1c, + posi, posj, + erri, errj, + 'R', kingdom_mode, + keep_nucleotides, + i_id_column, o_id_column, o_ori_seq_len_column, + o_amplicon_column, o_amplicon_length_column, + o_taxid_column, o_rank_column, o_name_column, + o_species_taxid_column, o_species_name_column, + o_genus_taxid_column, o_genus_name_column, + o_family_taxid_column, o_family_name_column, + o_kingdom_taxid_column, o_kingdom_name_column, + o_superkingdom_taxid_column, o_superkingdom_name_column, + o_strand_column, + o_primer1_column, o_primer2_column, + o_error1_column, o_error2_column, + o_temp1_column, o_temp2_column) < 0) + { + obidebug(1, "\nError writing the ecopcr result"); + return -1; + } + o_idx++; + } + } + } + } + } + } + } + } + } + } + + free(sequence); + } + + if (delete_apatseq((apatseq)) < 0) + { + obidebug(1, "\nError freeing a sequence structure"); + return -1; + } + + // Close views + if (obi_save_and_close_view(i_view) < 0) + { + obidebug(1, "\nError closing the input view"); + return -1; + } + + if (obi_save_and_close_view(o_view) < 0) + { + obidebug(1, "\nError closing the output view"); + return -1; + } + + // Close the taxonomy + if (obi_close_taxonomy(taxonomy) < 0) + { + obidebug(1, "\nError closing the taxonomy"); + return -1; + } + + fprintf(stderr,"\rDone : 100 %% "); + return 0; + + return 0; +} + + + diff --git a/src/obi_ecopcr.h b/src/obi_ecopcr.h new file mode 100644 index 0000000..62a7b1b --- /dev/null +++ b/src/obi_ecopcr.h @@ -0,0 +1,127 @@ +/************************************************************************************************* + * Header file for in silico PCR * + *************************************************************************************************/ + +/** + * @file obi_ecopcr.h + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date June 5th 2018 + * @brief Header file for the functions performing an in silico PCR. + */ + + +#ifndef OBI_ECOPCR_H_ +#define OBI_ECOPCR_H_ + + +#include +#include +#include + +#include "obidms.h" +#include "obiview.h" +#include "obidmscolumn.h" +#include "obitypes.h" + +// TODO discuss names +#define ECOPCR_SEQLEN_COLUMN_NAME "seq_length_ori" +#define ECOPCR_SEQLEN_COLUMN_COMMENTS "" +#define ECOPCR_AMPLICONLEN_COLUMN_NAME "seq_length" +#define ECOPCR_AMPLICONLEN_COLUMN_COMMENTS "" +#define TAXID_COLUMN_COMMENTS "" +#define ECOPCR_RANK_COLUMN_NAME "rank" +#define ECOPCR_RANK_COLUMN_COMMENTS "" +#define ECOPCR_SCIENTIFIC_NAME_COLUMN_NAME "scientific_name" +#define ECOPCR_SCIENTIFIC_NAME_COLUMN_COMMENTS "" +#define ECOPCR_SPECIES_TAXID_COLUMN_NAME "species" +#define ECOPCR_SPECIES_TAXID_COLUMN_COMMENTS "" +#define ECOPCR_GENUS_TAXID_COLUMN_NAME "genus" +#define ECOPCR_GENUS_TAXID_COLUMN_COMMENTS "" +#define ECOPCR_FAMILY_TAXID_COLUMN_NAME "family" +#define ECOPCR_FAMILY_TAXID_COLUMN_COMMENTS "" +#define ECOPCR_KINGDOM_TAXID_COLUMN_NAME "kingdom" +#define ECOPCR_KINGDOM_TAXID_COLUMN_COMMENTS "" +#define ECOPCR_SUPERKINGDOM_TAXID_COLUMN_NAME "superkingdom" +#define ECOPCR_SUPERKINGDOM_TAXID_COLUMN_COMMENTS "" +#define ECOPCR_SPECIES_NAME_COLUMN_NAME "species_name" +#define ECOPCR_SPECIES_NAME_COLUMN_COMMENTS "" +#define ECOPCR_GENUS_NAME_COLUMN_NAME "genus_name" +#define ECOPCR_GENUS_NAME_COLUMN_COMMENTS "" +#define ECOPCR_FAMILY_NAME_COLUMN_NAME "family_name" +#define ECOPCR_FAMILY_NAME_COLUMN_COMMENTS "" +#define ECOPCR_KINGDOM_NAME_COLUMN_NAME "kindgom_name" +#define ECOPCR_KINGDOM_NAME_COLUMN_COMMENTS "" +#define ECOPCR_SUPERKINGDOM_NAME_COLUMN_NAME "superkingdom_name" +#define ECOPCR_SUPERKINGDOM_NAME_COLUMN_COMMENTS "" +#define ECOPCR_STRAND_COLUMN_NAME "strand" +#define ECOPCR_STRAND_COLUMN_COMMENTS "" +#define ECOPCR_PRIMER1_COLUMN_NAME "forward_match" +#define ECOPCR_PRIMER1_COLUMN_COMMENTS "" +#define ECOPCR_PRIMER2_COLUMN_NAME "reverse_match" +#define ECOPCR_PRIMER2_COLUMN_COMMENTS "" +#define ECOPCR_ERROR1_COLUMN_NAME "forward_error" +#define ECOPCR_ERROR1_COLUMN_COMMENTS "" +#define ECOPCR_ERROR2_COLUMN_NAME "reverse_error" +#define ECOPCR_ERROR2_COLUMN_COMMENTS "" +#define ECOPCR_TEMP1_COLUMN_NAME "forward_tm" +#define ECOPCR_TEMP1_COLUMN_COMMENTS "" +#define ECOPCR_TEMP2_COLUMN_NAME "reverse_tm" +#define ECOPCR_TEMP2_COLUMN_COMMENTS "" + + + +/** + * @brief ecoPCR works as an in silico PCR that preserves the taxonomic information of the selected sequences, and allows various specified conditions for the in silico amplification. + * + * Note: The columns where the results are written are automatically named and created. + * + * @param i_dms_name The path to the input DMS. + * @param i_view_name The name of the input view. + * @param taxonomy_name The name of the taxonomy in the input DMS. + * @param o_dms_name The path to the output DMS. + * @param o_view_name The name of the output view. + * @param o_view_comments The comments to associate with the output view. + * @param primer1 The first primer. + * @param primer2 The second primer. + * @param error_max The maximum number of errors allowed per primer for amplification. + * @param min_len The minimum length of an amplicon. + * @param max_len The maximum length of an amplicon. + * @param restrict_to_taxids A pointer on an array of taxids. A sequence must belong to at least one of the groups formed by the taxids to be kept + * (example: be a genus under a family in the list). + * @param ignore_taxids A pointer on an array of taxids. A sequence must NOT belong to any of the groups formed by the taxids to be kept. + * @param circular Whether the input sequences are circular (e.g. mitochondrial or chloroplastic DNA). + * @param salt_concentration The salt concentration used for estimating the Tm. + * @param salt_correction_method The method used for estimating the Tm (melting temperature) between the primers and their corresponding + * target sequences. SANTALUCIA: 1, or OWCZARZY: 2. + * @param keep_nucleotides The number of nucleotides to keep on each side of the in silico amplified sequences + * (already including the amplified DNA fragment plus the two target sequences of the primers). + * @param kingdom_mode Whether the kingdom or the superkingdom informations should be printed to the output. + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since June 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_ecopcr(const char* i_dms_name, + const char* i_view_name, + const char* taxonomy_name, + const char* o_dms_name, + const char* o_view_name, + const char* o_view_comments, + const char* primer1, + const char* primer2, + int error_max, + int min_len, + int max_len, + int32_t* restrict_to_taxids, + int32_t* ignore_taxids, + bool circular, + double salt_concentration, + int salt_correction_method, + int keep_nucleotides, + bool kingdom_mode); + +#endif /* OBI_ECOPCR_H_ */ + diff --git a/src/obierrno.h b/src/obierrno.h index 61fe10b..53f8e3a 100644 --- a/src/obierrno.h +++ b/src/obierrno.h @@ -124,7 +124,8 @@ extern int obi_errno; */ #define OBI_CLEAN_ERROR (32) /** Error while cleaning sequences */ - +#define OBI_ECOPCR_ERROR (33) /** Error while performing an in silico PCR + */ /**@}*/ #endif /* OBIERRNO_H_ */