Update to consider evolution of the language

This commit is contained in:
2023-06-29 12:12:46 +02:00
parent 73236c72a8
commit 92826de147
18 changed files with 146 additions and 199 deletions

14
.gitignore vendored
View File

@ -1,16 +1,14 @@
*.o
# /src/
/src/*.P /src/*.P
/src/*.a /src/*.a
/src/ecoPrimer /src/ecoPrimer
# /src/libecoPCR/
/src/libecoPCR/*.P /src/libecoPCR/*.P
/src/libecoPCR/*.a /src/libecoPCR/*.a
/src/libecoPCR/*.o
# /src/libecoprimer/
/src/libecoprimer/*.P /src/libecoprimer/*.P
/src/libecoprimer/*.a /src/libecoprimer/*.a
/src/libecoprimer/*.o
# /src/libthermo/
/src/libthermo/*.P /src/libthermo/*.P
/src/libthermo/*.a
/src/libthermo/*.o
phylonorway.*

View File

@ -171,13 +171,9 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
bool_t good2=pair->p2->good; bool_t good2=pair->p2->good;
bool_t goodtmp; bool_t goodtmp;
bool_t strand; bool_t strand;
uint32_t i, j; uint32_t i;
float temp;
CNNParams nnparams;
//nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT,SALT_METHOD_SANTALUCIA); const char *c;
char *c;
char p1[32]; char p1[32];
char p2[32]; char p2[32];
@ -376,21 +372,20 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb) void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb)
{ {
ppair_t* sortedpairs; ppair_t* sortedpairs;
ppair_t* index; // ppair_t* index;
ppairlist_t pl; ppairlist_t pl;
size_t i,j; size_t i,j;
size_t count; size_t count;
char *taxon[]={"taxon","taxa"}; char *taxon[]={"taxon","taxa"};
ecotx_t *current_taxon; ecotx_t *current_taxon;
//pairset pair_sets; //pairset pair_sets;
pairset *pset = NULL;
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
fprintf(stderr,"Total pair count : %d\n",pairs->count); fprintf(stderr,"Total pair count : %d\n",pairs->count);
sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs"); sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
index=sortedpairs; // index=sortedpairs;
pl=pairs->first; pl=pairs->first;
j=0; j=0;
while(pl->next) while(pl->next)
@ -475,10 +470,10 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
if (options->filter_on_links) if (options->filter_on_links)
{ {
fprintf (stderr, "Old size: %d, ", count); fprintf (stderr, "Old size: %ld, ", count);
count = primers_changeSortedArray (&sortedpairs, count, options); count = primers_changeSortedArray (&sortedpairs, count, options);
//count = primers_filterWithGivenLinks (&sortedpairs, count, options); //count = primers_filterWithGivenLinks (&sortedpairs, count, options);
fprintf (stderr, "New size: %d\n", count); fprintf (stderr, "New size: %ld\n", count);
if (count == 0) if (count == 0)
{ {
@ -803,9 +798,9 @@ int main(int argc, char **argv)
if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY
options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA
if (options.salt < 0.01 || options.salt > 0.3) //if salt value out of literature values if (options.salt < 0.01 || options.salt > 0.3) {//if salt value out of literature values
options.salt = DEF_SALT; //set to default options.salt = DEF_SALT; //set to default
}
nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod); nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod);
fprintf(stderr,"Reading taxonomy database ..."); fprintf(stderr,"Reading taxonomy database ...");
@ -851,7 +846,7 @@ int main(int argc, char **argv)
fprintf(stderr,"\nIndexing words in sequences\n"); fprintf(stderr,"\nIndexing words in sequences\n");
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
fprintf(stderr,"\n Strict primer count : %d\n",words->size); fprintf(stderr,"\n Strict primer count : %lld\n",words->size);
/*/TR Testing /*/TR Testing
fprintf(stderr,"\nReducing for debugging\n"); fprintf(stderr,"\nReducing for debugging\n");
@ -871,7 +866,7 @@ int main(int argc, char **argv)
if (options.no_multi_match) if (options.no_multi_match)
{ {
(void)filterMultiStrictPrimer(words); (void)filterMultiStrictPrimer(words);
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size); fprintf(stderr,"\n Strict primer with single match count : %lld\n",words->size);
} }
@ -921,7 +916,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
{ {
uint32_t i, k; uint32_t i, k;
pwordcount_t new_words; pwordcount_t new_words;
char *rwrd; const char *rwrd;
char dwrd[20]; char dwrd[20];
/*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA", /*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA",
"TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT", "TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT",
@ -981,7 +976,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options) void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options)
{ {
char *wrd; const char *wrd;
uint32_t i, j; uint32_t i, j;
char *twrd = "GCCTGTTTACCAAAAACA"; char *twrd = "GCCTGTTTACCAAAAACA";

View File

@ -1,5 +1,5 @@
MACHINE=MAC_OS_X MACHINE=MAC_OS_X
LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo -L/usr/local/lib
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc CC=gcc

View File

@ -2,7 +2,7 @@
#include <stdlib.h> #include <stdlib.h>
static int eco_log_malloc = 0; static int eco_log_malloc = 0;
static size_t eco_amount_malloc=0; //static size_t eco_amount_malloc=0;
static size_t eco_chunk_malloc=0; static size_t eco_chunk_malloc=0;
void eco_trace_memory_allocation() void eco_trace_memory_allocation()
@ -37,7 +37,7 @@ void *eco_malloc(int64_t chunksize,
if (eco_log_malloc) if (eco_log_malloc)
fprintf(stderr, fprintf(stderr,
"Memory segment located at %p of size %d is allocated (file : %s [%d])", "Memory segment located at %p of size %lld is allocated (file : %s [%d])",
chunk, chunk,
chunksize, chunksize,
filename, filename,
@ -65,7 +65,7 @@ void *eco_realloc(void *chunk,
if (!newchunk) if (!newchunk)
{ {
fprintf(stderr,"Requested memory : %d\n",newsize); fprintf(stderr,"Requested memory : %lld\n",newsize);
ecoError(ECO_MEM_ERROR,error_message,filename,line); ecoError(ECO_MEM_ERROR,error_message,filename,line);
} }
if (!chunk) if (!chunk)
@ -73,7 +73,7 @@ void *eco_realloc(void *chunk,
if (eco_log_malloc) if (eco_log_malloc)
fprintf(stderr, fprintf(stderr,
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", "Old memory segment %p is reallocated at %p with a size of %lld (file : %s [%d])",
chunk, chunk,
newchunk, newchunk,
newsize, newsize,

View File

@ -21,8 +21,8 @@ typedef struct {
int32_t taxid; int32_t taxid;
char AC[20]; char AC[20];
int32_t DE_length; int32_t DE_length;
int32_t SQ_length; u_int32_t SQ_length;
int32_t CSQ_length; /*what is this CSQ_length ? */ u_int32_t CSQ_length; /*what is this CSQ_length ? */
char data[1]; char data[1];
@ -30,7 +30,7 @@ typedef struct {
typedef struct { typedef struct {
int32_t taxid; int32_t taxid;
int32_t SQ_length; u_int32_t SQ_length;
int32_t isexample; int32_t isexample;
char *AC; char *AC;
char *DE; char *DE;

View File

@ -2,6 +2,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <zlib.h> #include <zlib.h>
#include <ctype.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>

View File

@ -1139,7 +1139,7 @@ void sets_by_SimulatedAnealing (pairset *pair_set,
{ {
pair_set = extend_set_randomly (NULL, &params, 3); pair_set = extend_set_randomly (NULL, &params, 3);
printf("\nStart Random seed set for Simulated :\n"); printf("\nStart Random seed set for Simulated :\n");
print_set_info (&pair_set, &params); print_set_info (pair_set, &params);
} }
min_spc = max_spc = pair_set->set_specificity; min_spc = max_spc = pair_set->set_specificity;
min_cov = max_cov = pair_set->set_coverage; min_cov = max_cov = pair_set->set_coverage;
@ -1566,7 +1566,8 @@ int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t *
size_t primers_changeSortedArray (ppair_t ** pairs, size_t primers_changeSortedArray (ppair_t ** pairs,
size_t sorted_count, poptions_t options) size_t sorted_count, poptions_t options)
{ {
int32_t i, j, k, l, total_links; int32_t i, k, total_links;
u_int32_t j;
int *owi; int *owi;
int *iwi; int *iwi;
int allowedtaxa; int allowedtaxa;
@ -1579,14 +1580,14 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
idx_set = ECOMALLOC(slots*sizeof (int32_t), idx_set = ECOMALLOC(slots*sizeof (int32_t),
"Could not allocate memory for index set."); "Could not allocate memory for index set.");
for (i=0; i<sorted_count; i++) for (i=0; ((u_int32_t)i)<sorted_count; i++)
{ {
owi = sortedpairs[i]->wellIdentifiedSeqs; owi = sortedpairs[i]->wellIdentifiedSeqs;
passed = FALSE; passed = FALSE;
for (j=0; j<sorted_count; j++) for (j=0; j<sorted_count; j++)
{ {
if (i == j) continue; if ((u_int32_t)i == j) continue;
iwi = sortedpairs[j]->wellIdentifiedSeqs; iwi = sortedpairs[j]->wellIdentifiedSeqs;
total_links = 0; total_links = 0;
@ -1610,9 +1611,10 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
if (options->max_links_percent > 0) if (options->max_links_percent > 0)
{ {
allowedtaxa = options->max_links_percent; allowedtaxa = options->max_links_percent;
if (total_links > allowedtaxa) if (total_links > allowedtaxa){
passed = TRUE; passed = TRUE;
break; }
break;
} }
else else
if (!(total_links > 5 && total_links <= allowedtaxa)) if (!(total_links > 5 && total_links <= allowedtaxa))
@ -1630,7 +1632,7 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
for (j=0; j<sorted_count; j++) for (j=0; j<sorted_count; j++)
{ {
for (k=0; k<index; k++) for (k=0; k<index; k++)
if (j == idx_set[k]) break; if (j == (u_int32_t)(idx_set[k])) break;
//need to remove this element //need to remove this element
if (k == index) if (k == index)
{ {
@ -1711,9 +1713,9 @@ int32_t *addinset_withLinks (int32_t *set, int32_t i, int32_t* slots, int32_t *i
size_t primers_filterWithGivenLinks (ppair_t ** pairs, size_t primers_filterWithGivenLinks (ppair_t ** pairs,
size_t sorted_count, poptions_t options) size_t sorted_count, poptions_t options)
{ {
int32_t i, j, k; int32_t i, k;
u_int32_t j;
ppair_t *sortedpairs = *pairs; ppair_t *sortedpairs = *pairs;
bool_t passed;
int32_t *idx_set = NULL; int32_t *idx_set = NULL;
int32_t slots=50, index=0; int32_t slots=50, index=0;
@ -1732,7 +1734,7 @@ size_t primers_filterWithGivenLinks (ppair_t ** pairs,
for (j=0; j<sorted_count; j++) for (j=0; j<sorted_count; j++)
{ {
for (k=0; k<index; k++) for (k=0; k<index; k++)
if (j == idx_set[k]) break; if (j == (u_int32_t) (idx_set[k])) break;
//need to remove this element //need to remove this element
if (k == index) if (k == index)
{ {
@ -1755,7 +1757,7 @@ size_t primers_filterWithGivenLinks (ppair_t ** pairs,
else i=sorted_count; else i=sorted_count;
for (j=0; j<i; j++) for (j=0; j<(u_int32_t)i; j++)
for (k=0; k<options->dbsize; k++) for (k=0; k<options->dbsize; k++)
if ((*pairs)[j]->coveredSeqs[k] == 1) if ((*pairs)[j]->coveredSeqs[k] == 1)
cov[k] = 1; cov[k] = 1;

View File

@ -213,7 +213,6 @@ pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsi
char *base; char *base;
int8_t code; int8_t code;
uint32_t goodPrimers=0; uint32_t goodPrimers=0;
static int iii=0;
//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum); //inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);

View File

@ -36,7 +36,7 @@ int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param, ppatternParam_t param,
StackiPtr stkpos) StackiPtr stkpos)
{ {
int32_t pos; u_int32_t pos;
uint32_t smask, r; uint32_t smask, r;
uint8_t *data; uint8_t *data;
int32_t end; int32_t end;
@ -84,7 +84,7 @@ int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
StackiPtr stkpos) StackiPtr stkpos)
{ {
int e, found; int e, found;
int32_t pos; u_int32_t pos;
uint32_t smask, cmask, sindx; uint32_t smask, cmask, sindx;
uint32_t *pr, r[2 * MAX_PAT_ERR + 2]; uint32_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data; uint8_t *data;

View File

@ -201,7 +201,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum; data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum;
goodPrimers+=data[w].good? 1:0; goodPrimers+=data[w].good? 1:0;
fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r", fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length), i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length),
data[w].inexample,data[w].outexample); data[w].inexample,data[w].outexample);
@ -224,8 +224,8 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
ECOFREE(data[w].reversePos,"Free direct count table"); ECOFREE(data[w].reversePos,"Free direct count table");
} }
fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size); fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",w,words->size);
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table"); primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
primers->primers=ECOREALLOC(data, primers->primers=ECOREALLOC(data,

View File

@ -8,14 +8,6 @@
#ifndef HASHENCODER_H_ #ifndef HASHENCODER_H_
#define HASHENCODER_H_ #define HASHENCODER_H_
static int8_t encoder[] = {0, // A extern int8_t encoder[];
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
#endif /* HASHENCODER_H_ */ #endif /* HASHENCODER_H_ */

View File

@ -12,6 +12,16 @@ static int cmpword(const void *x,const void *y);
#include "hashencoder.h" #include "hashencoder.h"
int8_t encoder[] = {0, // A
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq) uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
{ {
uint32_t wordcount; uint32_t wordcount;
@ -257,6 +267,6 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word)
char ecoComplementChar(char base) char ecoComplementChar(char base)
{ {
return (base < 4)? !base & 3: 4; return (base < 4)? ~base & 3: 4;
} }

View File

@ -205,8 +205,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
} }
else else
{ {
uint32_t s; // uint32_t s;
s = strictprimers->size; // s = strictprimers->size;
// DEBUG_LOG("stack size : %u",s); // DEBUG_LOG("stack size : %u",s);
addSeqToWordCountTable(strictprimers,options->primer_length, addSeqToWordCountTable(strictprimers,options->primer_length,
options->circular, options->circular,
@ -223,7 +223,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
logfile = fopen(logfilename,"a"); logfile = fopen(logfilename,"a");
seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) + seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) +
timeval_subtract(&(usage.ru_stime),&(start.ru_stime)); timeval_subtract(&(usage.ru_stime),&(start.ru_stime));
fprintf(logfile,"%d\t%llu\t%lu\t%8.3f\t%8.3e\n",i, fprintf(logfile,"%d\t%llu\t%llu\t%8.3f\t%8.3e\n",i,
(long long unsigned)totallength, (long long unsigned)totallength,
strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)), strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)),
seconde,seconde/(double)totallength); seconde,seconde/(double)totallength);
@ -248,9 +248,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
sizeof(word_t)*strictprimers->size, sizeof(word_t)*strictprimers->size,
"Cannot reallocate strict primer table"); "Cannot reallocate strict primer table");
if (neededWords) if (neededWords){
ECOFREE(neededWords,"Clean needed word table"); ECOFREE(neededWords,"Clean needed word table");
}
//TR: Somehow for some primers strictcount value is extremely large hence invalid //TR: Somehow for some primers strictcount value is extremely large hence invalid
//we need to remove these primers from the list //we need to remove these primers from the list
j = strictprimers->size+1; j = strictprimers->size+1;

View File

@ -231,8 +231,8 @@ static int cmpamp(const void *ampf1, const void* ampf2)
char cd1; char cd1;
char cd2; char cd2;
int len = 0; int len = 0;
char *ch1; const char *ch1;
char *ch2; const char *ch2;
int incr1; int incr1;
int incr2; int incr2;

View File

@ -17,6 +17,15 @@
double forbidden_entropy; double forbidden_entropy;
char bpencoder[] = { 1, // A
0, // b
2, // C
0,0,0, // d, e, f
3, // G
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
4,0, // T,U
0,0,0,0,0}; // v,w,x,y,z
double nparam_GetInitialEntropy(PNNParams nparm) double nparam_GetInitialEntropy(PNNParams nparm)
{ {
@ -470,7 +479,7 @@ int nparam_CountGCContent(char * seq ) {
return count; return count;
} }
void nparam_CleanSeq (char* inseq, char* outseq, int len) void nparam_CleanSeq (const char* inseq, char* outseq, int len)
{ {
int seqlen = strlen (inseq); int seqlen = strlen (inseq);
int i, j; int i, j;
@ -508,7 +517,7 @@ void nparam_CleanSeq (char* inseq, char* outseq, int len)
} }
//Calculate TM for given sequence against its complement //Calculate TM for given sequence against its complement
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len) double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len)
{ {
double thedH = 0; double thedH = 0;
//double thedS = nparam_GetInitialEntropy(nparm); //double thedS = nparam_GetInitialEntropy(nparm);
@ -520,7 +529,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
char c4; char c4;
unsigned int i; unsigned int i;
char nseq[50]; char nseq[50];
char *useq = seq; const char *useq = seq;
nparam_CleanSeq (seq, nseq, len); nparam_CleanSeq (seq, nseq, len);
useq = nseq; useq = nseq;
@ -533,7 +542,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
c4 = GETNUMCODE(useq[i]); c4 = GETNUMCODE(useq[i]);
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); thedH += nparm->dH[(u_int8_t) c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
} }
//printf("------------------\n"); //printf("------------------\n");
@ -543,7 +552,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
return mtemp; return mtemp;
} }
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len) double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len)
{ {
double thedH = 0; double thedH = 0;
//double thedS = nparam_GetInitialEntropy(nparm); //double thedS = nparam_GetInitialEntropy(nparm);
@ -575,7 +584,7 @@ double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len)
//fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2)); //fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2));
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); thedH += nparm->dH[(u_int8_t)c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
} }
//fprintf(stderr,"------------------\n"); //fprintf(stderr,"------------------\n");

View File

@ -16,8 +16,8 @@
//#include "../libecoprimer/ecoprimer.h" //#include "../libecoprimer/ecoprimer.h"
// following defines to simplify coding... // following defines to simplify coding...
#define ndH(a,b,c,d) nparm->dH[a][b][c][d] #define ndH(a,b,c,d) nparm->dH[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d]
#define ndS(a,b,c,d) nparm->dS[a][b][c][d] #define ndS(a,b,c,d) nparm->dS[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d]
#define forbidden_enthalpy 1000000000000000000.0f #define forbidden_enthalpy 1000000000000000000.0f
#define R 1.987f #define R 1.987f
#define SALT_METHOD_SANTALUCIA 1 #define SALT_METHOD_SANTALUCIA 1
@ -33,14 +33,7 @@
extern double forbidden_entropy; extern double forbidden_entropy;
static char bpencoder[] = { 1, // A extern char bpencoder[]; // v,w,x,y,z
0, // b
2, // C
0,0,0, // d, e, f
3, // G
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
4,0, // T,U
0,0,0,0,0}; // v,w,x,y,z
typedef struct CNNParams_st typedef struct CNNParams_st
@ -62,8 +55,8 @@ int nparam_CountGCContent(char * seq );
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1); double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1);
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1); double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1);
double nparam_CalcTM(double entropy,double enthalpy); double nparam_CalcTM(double entropy,double enthalpy);
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len); double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len);
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len); double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len);
double nparam_GetInitialEntropy(PNNParams nparm) ; double nparam_GetInitialEntropy(PNNParams nparm) ;
double calculateMeltingTemperatureBasic (char * seq); double calculateMeltingTemperatureBasic (char * seq);

View File

@ -28,17 +28,15 @@ word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand)
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options) void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options)
{ {
size_t i, j,k,l; size_t i, j;
uint32_t bp1,bp2; uint32_t bp1,bp2;
uint32_t ep1,ep2;
word_t w1; word_t w1;
word_t w2; word_t w2;
bool_t strand; bool_t strand;
char *sq,*sq1,*sq2,*c; char *sq;
char prmrd[50]; char prmrd[50];
char prmrr[50]; char prmrr[50];
char sqsite[50];
double mtemp; double mtemp;
for (i = 0; i < count; i++) for (i = 0; i < count; i++)

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
import re import re
import gzip import gzip
@ -6,12 +6,9 @@ import struct
import sys import sys
import time import time
import getopt import getopt
from functools import cmp_to_key
try: _dbenable=False
import psycopg2
_dbenable=True
except ImportError:
_dbenable=False
##### #####
# #
@ -80,15 +77,17 @@ class ColumnFile(object):
def __init__(self,stream,sep=None,strip=True,types=None): def __init__(self,stream,sep=None,strip=True,types=None):
if isinstance(stream,str): if isinstance(stream,str):
self._stream = open(stream) self._stream = open(stream)
elif hasattr(stream,'next'):
self._stream = stream
else: else:
raise ValueError,'stream must be string or an iterator' try:
iter(stream)
self._stream = stream
except TypeError:
raise ValueError('stream must be string or an iterator')
self._delimiter=sep self._delimiter=sep
self._strip=strip self._strip=strip
if types: if types:
self._types=[x for x in types] self._types=[x for x in types]
for i in xrange(len(self._types)): for i in range(len(self._types)):
if self._types[i] is bool: if self._types[i] is bool:
self._types[i]=ColumnFile.str2bool self._types[i]=ColumnFile.str2bool
else: else:
@ -103,14 +102,14 @@ class ColumnFile(object):
def __iter__(self): def __iter__(self):
return self return self
def next(self): def __next__(self):
ligne = self._stream.next() ligne = next(self._stream)
data = ligne.split(self._delimiter) data = ligne.split(self._delimiter)
if self._strip or self._types: if self._strip or self._types:
data = [x.strip() for x in data] data = [x.strip() for x in data]
if self._types: if self._types:
it = endLessIterator(self._types) it = endLessIterator(self._types)
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
return data return data
def taxonCmp(t1,t2): def taxonCmp(t1,t2):
@ -125,14 +124,14 @@ def bsearchTaxon(taxonomy,taxid):
begin = 0 begin = 0
end = taxCount end = taxCount
oldcheck=taxCount oldcheck=taxCount
check = begin + end / 2 check = int(begin + end / 2)
while check != oldcheck and taxonomy[check][0]!=taxid : while check != oldcheck and taxonomy[check][0]!=taxid :
if taxonomy[check][0] < taxid: if taxonomy[check][0] < taxid:
begin=check begin=check
else: else:
end=check end=check
oldcheck=check oldcheck=check
check = (begin + end) / 2 check = int((begin + end) / 2)
if taxonomy[check][0]==taxid: if taxonomy[check][0]==taxid:
@ -152,22 +151,22 @@ def readNodeTable(file):
str,str,bool, str,str,bool,
int,bool,int, int,bool,int,
bool,bool,bool,str)) bool,bool,bool,str))
print >>sys.stderr,"Reading taxonomy dump file..." print("Reading taxonomy dump file...", file=sys.stderr)
taxonomy=[[n[0],n[2],n[1]] for n in nodes] taxonomy=[[n[0],n[2],n[1]] for n in nodes]
print >>sys.stderr,"List all taxonomy rank..." print("List all taxonomy rank...", file=sys.stderr)
ranks =list(set(x[1] for x in taxonomy)) ranks =list(set(x[1] for x in taxonomy))
ranks.sort() ranks.sort()
ranks = dict(map(None,ranks,xrange(len(ranks)))) ranks = {rank: index for index, rank in enumerate(ranks)}
print >>sys.stderr,"Sorting taxons..." print("Sorting taxons...", file=sys.stderr)
taxonomy.sort(taxonCmp) taxonomy.sort(key=lambda x: x[0])
print >>sys.stderr,"Indexing taxonomy..." print("Indexing taxonomy...", file=sys.stderr)
index = {} index = {}
for t in taxonomy: for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0]) index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..." print("Indexing parent and rank...", file=sys.stderr)
for t in taxonomy: for t in taxonomy:
t[1]=ranks[t[1]] t[1]=ranks[t[1]]
t[2]=index[t[2]] t[2]=index[t[2]]
@ -203,7 +202,7 @@ def deletedNodeIterator(file):
def readTaxonomyDump(taxdir): def readTaxonomyDump(taxdir):
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir) taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
print >>sys.stderr,"Adding scientific name..." print("Adding scientific name...", file=sys.stderr)
alternativeName=[] alternativeName=[]
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir): for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
@ -211,66 +210,16 @@ def readTaxonomyDump(taxdir):
if classname == 'scientific name': if classname == 'scientific name':
taxonomy[index[taxid]].append(name) taxonomy[index[taxid]].append(name)
print >>sys.stderr,"Adding taxid alias..." print("Adding taxid alias...", file=sys.stderr)
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir): for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
index[taxid]=index[current] index[taxid]=index[current]
print >>sys.stderr,"Adding deleted taxid..." print("Adding deleted taxid...", file=sys.stderr)
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir): for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
index[taxid]=None index[taxid]=None
return taxonomy,ranks,alternativeName,index return taxonomy,ranks,alternativeName,index
def readTaxonomyDB(dbname):
connection = psycopg2.connect(database=dbname)
cursor = connection.cursor()
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
taxonomy=[list(x) for x in cursor]
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
ranks=cursor.fetchall()
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
for t in taxonomy:
t[1]=ranks[t[1]]
try:
t[2]=index[t[2]]
except KeyError,e:
if t[2] is None and t[0]==1:
t[2]=index[t[0]]
else:
raise e
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
alternativeName=[]
for taxid,name,classname in cursor:
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in cursor:
if current is not None:
index[taxid]=index[current]
else:
index[taxid]=None
return taxonomy,ranks,alternativeName,index
##### #####
# #
# #
@ -282,22 +231,27 @@ def readTaxonomyDB(dbname):
def entryIterator(file): def entryIterator(file):
file = universalOpen(file) file = universalOpen(file)
rep =[] rep =[]
for ligne in file: ligne = file.readline()
while ligne:
rep.append(ligne) rep.append(ligne)
if ligne == '//\n': if ligne == '//\n':
rep = ''.join(rep) rep = ''.join(rep)
yield rep yield rep
rep = [] rep = []
ligne = file.readline()
def fastaEntryIterator(file): def fastaEntryIterator(file):
file = universalOpen(file) file = universalOpen(file)
rep =[] rep =[]
for ligne in file: ligne = file.readline()
while ligne:
if ligne[0] == '>' and rep: if ligne[0] == '>' and rep:
rep = ''.join(rep) rep = ''.join(rep)
yield rep yield rep
rep = [] rep = []
rep.append(ligne) rep.append(ligne)
ligne = file.readline()
if rep: if rep:
rep = ''.join(rep) rep = ''.join(rep)
yield rep yield rep
@ -418,7 +372,7 @@ def taxonomyInfo(entry,connection):
def ecoSeqPacker(sq): def ecoSeqPacker(sq):
compactseq = gzip.zlib.compress(sq['sequence'],9) compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
cptseqlength = len(compactseq) cptseqlength = len(compactseq)
delength = len(sq['definition']) delength = len(sq['definition'])
@ -427,11 +381,11 @@ def ecoSeqPacker(sq):
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength), packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
totalSize, totalSize,
sq['taxid'], sq['taxid'],
sq['id'], bytes(sq['id'],"ascii"),
delength, delength,
len(sq['sequence']), len(sq['sequence']),
cptseqlength, cptseqlength,
sq['definition'], bytes(sq['definition'],"ascii"),
compactseq) compactseq)
assert len(packed) == totalSize+4, "error in sequence packing" assert len(packed) == totalSize+4, "error in sequence packing"
@ -450,7 +404,7 @@ def ecoTaxPacker(tx):
tx[1], tx[1],
tx[2], tx[2],
namelength, namelength,
tx[3]) bytes(tx[3],"ascii"))
return packed return packed
@ -460,7 +414,7 @@ def ecoRankPacker(rank):
packed = struct.pack('> I %ds' % namelength, packed = struct.pack('> I %ds' % namelength,
namelength, namelength,
rank) bytes(rank, 'ascii'))
return packed return packed
@ -476,8 +430,8 @@ def ecoNamePacker(name):
namelength, namelength,
classlength, classlength,
name[2], name[2],
name[0], bytes(name[0], 'ascii'),
name[1]) bytes(name[1], 'ascii'))
return packed return packed
@ -505,11 +459,11 @@ def ecoSeqWriter(file,input,taxindex,parser):
skipped.append(entry['id']) skipped.append(entry['id'])
where = universalTell(input) where = universalTell(input)
progressBar(where, inputsize) progressBar(where, inputsize)
print >>sys.stderr," Readed sequences : %d " % seqcount, print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr)
else: else:
skipped.append(entry['id']) skipped.append(entry['id'])
print >>sys.stderr print(file=sys.stderr)
output.seek(0,0) output.seek(0,0)
output.write(struct.pack('> I',seqcount)) output.write(struct.pack('> I',seqcount))
@ -530,7 +484,7 @@ def ecoRankWriter(file,ranks):
output = open(file,'wb') output = open(file,'wb')
output.write(struct.pack('> I',len(ranks))) output.write(struct.pack('> I',len(ranks)))
rankNames = ranks.keys() rankNames = list(ranks.keys())
rankNames.sort() rankNames.sort()
for rank in rankNames: for rank in rankNames:
@ -552,7 +506,7 @@ def ecoNameWriter(file,names):
output = open(file,'wb') output = open(file,'wb')
output.write(struct.pack('> I',len(names))) output.write(struct.pack('> I',len(names)))
names.sort(nameCmp) names.sort(key=lambda x:x[0].upper())
for name in names: for name in names:
output.write(ecoNamePacker(name)) output.write(ecoNamePacker(name))
@ -573,8 +527,8 @@ def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
taxonomy[3], taxonomy[3],
parser) parser)
if sk: if sk:
print >>sys.stderr,"Skipped entry :" print("Skipped entry :", file=sys.stderr)
print >>sys.stderr,sk print(sk, file=sys.stderr)
def ecoParseOptions(arguments): def ecoParseOptions(arguments):
opt = { opt = {
@ -618,34 +572,30 @@ def ecoParseOptions(arguments):
opt['parser']=sequenceIteratorFactory(emblEntryParser, opt['parser']=sequenceIteratorFactory(emblEntryParser,
entryIterator) entryIterator)
else: else:
raise ValueError,'Unknown option %s' % name raise ValueError('Unknown option %s' % name)
return opt,filenames return opt,filenames
def printHelp(): def printHelp():
print "-----------------------------------" print("-----------------------------------")
print " ecoPCRFormat.py" print(" ecoPCRFormat.py")
print "-----------------------------------" print("-----------------------------------")
print "ecoPCRFormat.py [option] <argument>" print("ecoPCRFormat.py [option] <argument>")
print "-----------------------------------" print("-----------------------------------")
print "-e --embl :[E]mbl format" print("-e --embl :[E]mbl format")
print "-f --fasta :[F]asta format" print("-f --fasta :[F]asta format")
print "-g --genbank :[G]enbank format" print("-g --genbank :[G]enbank format")
print "-h --help :[H]elp - print this help" print("-h --help :[H]elp - print this help")
print "-n --name :[N]ame of the new database created" print("-n --name :[N]ame of the new database created")
print "-t --taxonomy :[T]axonomy - path to the taxonomy database" print("-t --taxonomy :[T]axonomy - path to the taxonomy database")
print " :bcp-like dump from GenBank taxonomy database." print(" :bcp-like dump from GenBank taxonomy database.")
print "-----------------------------------" print("-----------------------------------")
if __name__ == '__main__': if __name__ == '__main__':
opt,filenames = ecoParseOptions(sys.argv[1:]) opt,filenames = ecoParseOptions(sys.argv[1:])
if opt['taxmod']=='dump': taxonomy = readTaxonomyDump(opt['taxdir'])
taxonomy = readTaxonomyDump(opt['taxdir'])
elif opt['taxmod']=='db':
taxonomy = readTaxonomyDB(opt['taxdb'])
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser']) ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])