Update to consider evolution of the language
This commit is contained in:
14
.gitignore
vendored
14
.gitignore
vendored
@ -1,16 +1,14 @@
|
||||
|
||||
# /src/
|
||||
*.o
|
||||
/src/*.P
|
||||
/src/*.a
|
||||
/src/ecoPrimer
|
||||
|
||||
# /src/libecoPCR/
|
||||
/src/libecoPCR/*.P
|
||||
/src/libecoPCR/*.a
|
||||
|
||||
# /src/libecoprimer/
|
||||
/src/libecoPCR/*.o
|
||||
/src/libecoprimer/*.P
|
||||
/src/libecoprimer/*.a
|
||||
|
||||
# /src/libthermo/
|
||||
/src/libecoprimer/*.o
|
||||
/src/libthermo/*.P
|
||||
/src/libthermo/*.a
|
||||
/src/libthermo/*.o
|
||||
phylonorway.*
|
||||
|
@ -171,13 +171,9 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
|
||||
bool_t good2=pair->p2->good;
|
||||
bool_t goodtmp;
|
||||
bool_t strand;
|
||||
uint32_t i, j;
|
||||
float temp;
|
||||
CNNParams nnparams;
|
||||
uint32_t i;
|
||||
|
||||
//nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT,SALT_METHOD_SANTALUCIA);
|
||||
|
||||
char *c;
|
||||
const char *c;
|
||||
char p1[32];
|
||||
char p2[32];
|
||||
|
||||
@ -376,21 +372,20 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
|
||||
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb)
|
||||
{
|
||||
ppair_t* sortedpairs;
|
||||
ppair_t* index;
|
||||
// ppair_t* index;
|
||||
ppairlist_t pl;
|
||||
size_t i,j;
|
||||
size_t count;
|
||||
char *taxon[]={"taxon","taxa"};
|
||||
ecotx_t *current_taxon;
|
||||
//pairset pair_sets;
|
||||
pairset *pset = NULL;
|
||||
|
||||
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
|
||||
|
||||
fprintf(stderr,"Total pair count : %d\n",pairs->count);
|
||||
|
||||
sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
|
||||
index=sortedpairs;
|
||||
// index=sortedpairs;
|
||||
pl=pairs->first;
|
||||
j=0;
|
||||
while(pl->next)
|
||||
@ -475,10 +470,10 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
|
||||
|
||||
if (options->filter_on_links)
|
||||
{
|
||||
fprintf (stderr, "Old size: %d, ", count);
|
||||
fprintf (stderr, "Old size: %ld, ", count);
|
||||
count = primers_changeSortedArray (&sortedpairs, count, options);
|
||||
//count = primers_filterWithGivenLinks (&sortedpairs, count, options);
|
||||
fprintf (stderr, "New size: %d\n", count);
|
||||
fprintf (stderr, "New size: %ld\n", count);
|
||||
|
||||
if (count == 0)
|
||||
{
|
||||
@ -803,9 +798,9 @@ int main(int argc, char **argv)
|
||||
if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY
|
||||
options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA
|
||||
|
||||
if (options.salt < 0.01 || options.salt > 0.3) //if salt value out of literature values
|
||||
if (options.salt < 0.01 || options.salt > 0.3) {//if salt value out of literature values
|
||||
options.salt = DEF_SALT; //set to default
|
||||
|
||||
}
|
||||
nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod);
|
||||
|
||||
fprintf(stderr,"Reading taxonomy database ...");
|
||||
@ -851,7 +846,7 @@ int main(int argc, char **argv)
|
||||
fprintf(stderr,"\nIndexing words in sequences\n");
|
||||
|
||||
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
||||
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
|
||||
fprintf(stderr,"\n Strict primer count : %lld\n",words->size);
|
||||
|
||||
/*/TR Testing
|
||||
fprintf(stderr,"\nReducing for debugging\n");
|
||||
@ -871,7 +866,7 @@ int main(int argc, char **argv)
|
||||
if (options.no_multi_match)
|
||||
{
|
||||
(void)filterMultiStrictPrimer(words);
|
||||
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
|
||||
fprintf(stderr,"\n Strict primer with single match count : %lld\n",words->size);
|
||||
}
|
||||
|
||||
|
||||
@ -921,7 +916,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
|
||||
{
|
||||
uint32_t i, k;
|
||||
pwordcount_t new_words;
|
||||
char *rwrd;
|
||||
const char *rwrd;
|
||||
char dwrd[20];
|
||||
/*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA",
|
||||
"TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT",
|
||||
@ -981,7 +976,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
|
||||
|
||||
void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options)
|
||||
{
|
||||
char *wrd;
|
||||
const char *wrd;
|
||||
uint32_t i, j;
|
||||
char *twrd = "GCCTGTTTACCAAAAACA";
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
MACHINE=MAC_OS_X
|
||||
LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo
|
||||
LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo -L/usr/local/lib
|
||||
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
|
||||
|
||||
CC=gcc
|
||||
|
@ -2,7 +2,7 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
static int eco_log_malloc = 0;
|
||||
static size_t eco_amount_malloc=0;
|
||||
//static size_t eco_amount_malloc=0;
|
||||
static size_t eco_chunk_malloc=0;
|
||||
|
||||
void eco_trace_memory_allocation()
|
||||
@ -37,7 +37,7 @@ void *eco_malloc(int64_t chunksize,
|
||||
|
||||
if (eco_log_malloc)
|
||||
fprintf(stderr,
|
||||
"Memory segment located at %p of size %d is allocated (file : %s [%d])",
|
||||
"Memory segment located at %p of size %lld is allocated (file : %s [%d])",
|
||||
chunk,
|
||||
chunksize,
|
||||
filename,
|
||||
@ -65,7 +65,7 @@ void *eco_realloc(void *chunk,
|
||||
|
||||
if (!newchunk)
|
||||
{
|
||||
fprintf(stderr,"Requested memory : %d\n",newsize);
|
||||
fprintf(stderr,"Requested memory : %lld\n",newsize);
|
||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||
}
|
||||
if (!chunk)
|
||||
@ -73,7 +73,7 @@ void *eco_realloc(void *chunk,
|
||||
|
||||
if (eco_log_malloc)
|
||||
fprintf(stderr,
|
||||
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
|
||||
"Old memory segment %p is reallocated at %p with a size of %lld (file : %s [%d])",
|
||||
chunk,
|
||||
newchunk,
|
||||
newsize,
|
||||
|
@ -21,8 +21,8 @@ typedef struct {
|
||||
int32_t taxid;
|
||||
char AC[20];
|
||||
int32_t DE_length;
|
||||
int32_t SQ_length;
|
||||
int32_t CSQ_length; /*what is this CSQ_length ? */
|
||||
u_int32_t SQ_length;
|
||||
u_int32_t CSQ_length; /*what is this CSQ_length ? */
|
||||
|
||||
char data[1];
|
||||
|
||||
@ -30,7 +30,7 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
int32_t taxid;
|
||||
int32_t SQ_length;
|
||||
u_int32_t SQ_length;
|
||||
int32_t isexample;
|
||||
char *AC;
|
||||
char *DE;
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
@ -1139,7 +1139,7 @@ void sets_by_SimulatedAnealing (pairset *pair_set,
|
||||
{
|
||||
pair_set = extend_set_randomly (NULL, ¶ms, 3);
|
||||
printf("\nStart Random seed set for Simulated :\n");
|
||||
print_set_info (&pair_set, ¶ms);
|
||||
print_set_info (pair_set, ¶ms);
|
||||
}
|
||||
min_spc = max_spc = pair_set->set_specificity;
|
||||
min_cov = max_cov = pair_set->set_coverage;
|
||||
@ -1566,7 +1566,8 @@ int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t *
|
||||
size_t primers_changeSortedArray (ppair_t ** pairs,
|
||||
size_t sorted_count, poptions_t options)
|
||||
{
|
||||
int32_t i, j, k, l, total_links;
|
||||
int32_t i, k, total_links;
|
||||
u_int32_t j;
|
||||
int *owi;
|
||||
int *iwi;
|
||||
int allowedtaxa;
|
||||
@ -1579,14 +1580,14 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
|
||||
idx_set = ECOMALLOC(slots*sizeof (int32_t),
|
||||
"Could not allocate memory for index set.");
|
||||
|
||||
for (i=0; i<sorted_count; i++)
|
||||
for (i=0; ((u_int32_t)i)<sorted_count; i++)
|
||||
{
|
||||
owi = sortedpairs[i]->wellIdentifiedSeqs;
|
||||
passed = FALSE;
|
||||
|
||||
for (j=0; j<sorted_count; j++)
|
||||
{
|
||||
if (i == j) continue;
|
||||
if ((u_int32_t)i == j) continue;
|
||||
|
||||
iwi = sortedpairs[j]->wellIdentifiedSeqs;
|
||||
total_links = 0;
|
||||
@ -1610,9 +1611,10 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
|
||||
if (options->max_links_percent > 0)
|
||||
{
|
||||
allowedtaxa = options->max_links_percent;
|
||||
if (total_links > allowedtaxa)
|
||||
if (total_links > allowedtaxa){
|
||||
passed = TRUE;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
else
|
||||
if (!(total_links > 5 && total_links <= allowedtaxa))
|
||||
@ -1630,7 +1632,7 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
|
||||
for (j=0; j<sorted_count; j++)
|
||||
{
|
||||
for (k=0; k<index; k++)
|
||||
if (j == idx_set[k]) break;
|
||||
if (j == (u_int32_t)(idx_set[k])) break;
|
||||
//need to remove this element
|
||||
if (k == index)
|
||||
{
|
||||
@ -1711,9 +1713,9 @@ int32_t *addinset_withLinks (int32_t *set, int32_t i, int32_t* slots, int32_t *i
|
||||
size_t primers_filterWithGivenLinks (ppair_t ** pairs,
|
||||
size_t sorted_count, poptions_t options)
|
||||
{
|
||||
int32_t i, j, k;
|
||||
int32_t i, k;
|
||||
u_int32_t j;
|
||||
ppair_t *sortedpairs = *pairs;
|
||||
bool_t passed;
|
||||
|
||||
int32_t *idx_set = NULL;
|
||||
int32_t slots=50, index=0;
|
||||
@ -1732,7 +1734,7 @@ size_t primers_filterWithGivenLinks (ppair_t ** pairs,
|
||||
for (j=0; j<sorted_count; j++)
|
||||
{
|
||||
for (k=0; k<index; k++)
|
||||
if (j == idx_set[k]) break;
|
||||
if (j == (u_int32_t) (idx_set[k])) break;
|
||||
//need to remove this element
|
||||
if (k == index)
|
||||
{
|
||||
@ -1755,7 +1757,7 @@ size_t primers_filterWithGivenLinks (ppair_t ** pairs,
|
||||
else i=sorted_count;
|
||||
|
||||
|
||||
for (j=0; j<i; j++)
|
||||
for (j=0; j<(u_int32_t)i; j++)
|
||||
for (k=0; k<options->dbsize; k++)
|
||||
if ((*pairs)[j]->coveredSeqs[k] == 1)
|
||||
cov[k] = 1;
|
||||
|
@ -213,7 +213,6 @@ pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsi
|
||||
char *base;
|
||||
int8_t code;
|
||||
uint32_t goodPrimers=0;
|
||||
static int iii=0;
|
||||
|
||||
|
||||
//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
|
||||
|
@ -36,7 +36,7 @@ int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat,
|
||||
ppatternParam_t param,
|
||||
StackiPtr stkpos)
|
||||
{
|
||||
int32_t pos;
|
||||
u_int32_t pos;
|
||||
uint32_t smask, r;
|
||||
uint8_t *data;
|
||||
int32_t end;
|
||||
@ -84,7 +84,7 @@ int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
|
||||
StackiPtr stkpos)
|
||||
{
|
||||
int e, found;
|
||||
int32_t pos;
|
||||
u_int32_t pos;
|
||||
uint32_t smask, cmask, sindx;
|
||||
uint32_t *pr, r[2 * MAX_PAT_ERR + 2];
|
||||
uint8_t *data;
|
||||
|
@ -201,7 +201,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
|
||||
data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum;
|
||||
goodPrimers+=data[w].good? 1:0;
|
||||
|
||||
fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r",
|
||||
fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
|
||||
i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length),
|
||||
data[w].inexample,data[w].outexample);
|
||||
|
||||
@ -224,8 +224,8 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
|
||||
ECOFREE(data[w].reversePos,"Free direct count table");
|
||||
}
|
||||
|
||||
fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
|
||||
fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size);
|
||||
fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
|
||||
fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",w,words->size);
|
||||
|
||||
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
|
||||
primers->primers=ECOREALLOC(data,
|
||||
|
@ -8,14 +8,6 @@
|
||||
#ifndef HASHENCODER_H_
|
||||
#define HASHENCODER_H_
|
||||
|
||||
static int8_t encoder[] = {0, // A
|
||||
-1, // b
|
||||
1, // C
|
||||
-1,-1,-1, // d, e, f
|
||||
2, // G
|
||||
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||
3,3, // T,U
|
||||
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
||||
|
||||
extern int8_t encoder[];
|
||||
|
||||
#endif /* HASHENCODER_H_ */
|
||||
|
@ -12,6 +12,16 @@ static int cmpword(const void *x,const void *y);
|
||||
|
||||
#include "hashencoder.h"
|
||||
|
||||
int8_t encoder[] = {0, // A
|
||||
-1, // b
|
||||
1, // C
|
||||
-1,-1,-1, // d, e, f
|
||||
2, // G
|
||||
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||
3,3, // T,U
|
||||
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
||||
|
||||
|
||||
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
||||
{
|
||||
uint32_t wordcount;
|
||||
@ -257,6 +267,6 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word)
|
||||
|
||||
char ecoComplementChar(char base)
|
||||
{
|
||||
return (base < 4)? !base & 3: 4;
|
||||
return (base < 4)? ~base & 3: 4;
|
||||
}
|
||||
|
||||
|
@ -205,8 +205,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t s;
|
||||
s = strictprimers->size;
|
||||
// uint32_t s;
|
||||
// s = strictprimers->size;
|
||||
// DEBUG_LOG("stack size : %u",s);
|
||||
addSeqToWordCountTable(strictprimers,options->primer_length,
|
||||
options->circular,
|
||||
@ -223,7 +223,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
logfile = fopen(logfilename,"a");
|
||||
seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) +
|
||||
timeval_subtract(&(usage.ru_stime),&(start.ru_stime));
|
||||
fprintf(logfile,"%d\t%llu\t%lu\t%8.3f\t%8.3e\n",i,
|
||||
fprintf(logfile,"%d\t%llu\t%llu\t%8.3f\t%8.3e\n",i,
|
||||
(long long unsigned)totallength,
|
||||
strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)),
|
||||
seconde,seconde/(double)totallength);
|
||||
@ -248,9 +248,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
sizeof(word_t)*strictprimers->size,
|
||||
"Cannot reallocate strict primer table");
|
||||
|
||||
if (neededWords)
|
||||
if (neededWords){
|
||||
ECOFREE(neededWords,"Clean needed word table");
|
||||
|
||||
}
|
||||
//TR: Somehow for some primers strictcount value is extremely large hence invalid
|
||||
//we need to remove these primers from the list
|
||||
j = strictprimers->size+1;
|
||||
|
@ -231,8 +231,8 @@ static int cmpamp(const void *ampf1, const void* ampf2)
|
||||
char cd1;
|
||||
char cd2;
|
||||
int len = 0;
|
||||
char *ch1;
|
||||
char *ch2;
|
||||
const char *ch1;
|
||||
const char *ch2;
|
||||
int incr1;
|
||||
int incr2;
|
||||
|
||||
|
@ -17,6 +17,15 @@
|
||||
|
||||
double forbidden_entropy;
|
||||
|
||||
char bpencoder[] = { 1, // A
|
||||
0, // b
|
||||
2, // C
|
||||
0,0,0, // d, e, f
|
||||
3, // G
|
||||
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||
4,0, // T,U
|
||||
0,0,0,0,0}; // v,w,x,y,z
|
||||
|
||||
|
||||
double nparam_GetInitialEntropy(PNNParams nparm)
|
||||
{
|
||||
@ -470,7 +479,7 @@ int nparam_CountGCContent(char * seq ) {
|
||||
return count;
|
||||
}
|
||||
|
||||
void nparam_CleanSeq (char* inseq, char* outseq, int len)
|
||||
void nparam_CleanSeq (const char* inseq, char* outseq, int len)
|
||||
{
|
||||
int seqlen = strlen (inseq);
|
||||
int i, j;
|
||||
@ -508,7 +517,7 @@ void nparam_CleanSeq (char* inseq, char* outseq, int len)
|
||||
}
|
||||
|
||||
//Calculate TM for given sequence against its complement
|
||||
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
|
||||
double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len)
|
||||
{
|
||||
double thedH = 0;
|
||||
//double thedS = nparam_GetInitialEntropy(nparm);
|
||||
@ -520,7 +529,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
|
||||
char c4;
|
||||
unsigned int i;
|
||||
char nseq[50];
|
||||
char *useq = seq;
|
||||
const char *useq = seq;
|
||||
|
||||
nparam_CleanSeq (seq, nseq, len);
|
||||
useq = nseq;
|
||||
@ -533,7 +542,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
|
||||
c4 = GETNUMCODE(useq[i]);
|
||||
|
||||
|
||||
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
|
||||
thedH += nparm->dH[(u_int8_t) c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
|
||||
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
|
||||
}
|
||||
//printf("------------------\n");
|
||||
@ -543,7 +552,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
|
||||
return mtemp;
|
||||
}
|
||||
|
||||
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len)
|
||||
double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len)
|
||||
{
|
||||
double thedH = 0;
|
||||
//double thedS = nparam_GetInitialEntropy(nparm);
|
||||
@ -575,7 +584,7 @@ double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len)
|
||||
|
||||
//fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2));
|
||||
|
||||
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
|
||||
thedH += nparm->dH[(u_int8_t)c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
|
||||
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
|
||||
}
|
||||
//fprintf(stderr,"------------------\n");
|
||||
|
@ -16,8 +16,8 @@
|
||||
//#include "../libecoprimer/ecoprimer.h"
|
||||
|
||||
// following defines to simplify coding...
|
||||
#define ndH(a,b,c,d) nparm->dH[a][b][c][d]
|
||||
#define ndS(a,b,c,d) nparm->dS[a][b][c][d]
|
||||
#define ndH(a,b,c,d) nparm->dH[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d]
|
||||
#define ndS(a,b,c,d) nparm->dS[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d]
|
||||
#define forbidden_enthalpy 1000000000000000000.0f
|
||||
#define R 1.987f
|
||||
#define SALT_METHOD_SANTALUCIA 1
|
||||
@ -33,14 +33,7 @@
|
||||
|
||||
extern double forbidden_entropy;
|
||||
|
||||
static char bpencoder[] = { 1, // A
|
||||
0, // b
|
||||
2, // C
|
||||
0,0,0, // d, e, f
|
||||
3, // G
|
||||
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||
4,0, // T,U
|
||||
0,0,0,0,0}; // v,w,x,y,z
|
||||
extern char bpencoder[]; // v,w,x,y,z
|
||||
|
||||
|
||||
typedef struct CNNParams_st
|
||||
@ -62,8 +55,8 @@ int nparam_CountGCContent(char * seq );
|
||||
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1);
|
||||
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1);
|
||||
double nparam_CalcTM(double entropy,double enthalpy);
|
||||
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len);
|
||||
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len);
|
||||
double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len);
|
||||
double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len);
|
||||
|
||||
double nparam_GetInitialEntropy(PNNParams nparm) ;
|
||||
double calculateMeltingTemperatureBasic (char * seq);
|
||||
|
@ -28,17 +28,15 @@ word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand)
|
||||
|
||||
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options)
|
||||
{
|
||||
size_t i, j,k,l;
|
||||
size_t i, j;
|
||||
uint32_t bp1,bp2;
|
||||
uint32_t ep1,ep2;
|
||||
word_t w1;
|
||||
word_t w2;
|
||||
bool_t strand;
|
||||
|
||||
char *sq,*sq1,*sq2,*c;
|
||||
char *sq;
|
||||
char prmrd[50];
|
||||
char prmrr[50];
|
||||
char sqsite[50];
|
||||
double mtemp;
|
||||
|
||||
for (i = 0; i < count; i++)
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import gzip
|
||||
@ -6,12 +6,9 @@ import struct
|
||||
import sys
|
||||
import time
|
||||
import getopt
|
||||
from functools import cmp_to_key
|
||||
|
||||
try:
|
||||
import psycopg2
|
||||
_dbenable=True
|
||||
except ImportError:
|
||||
_dbenable=False
|
||||
_dbenable=False
|
||||
|
||||
#####
|
||||
#
|
||||
@ -80,15 +77,17 @@ class ColumnFile(object):
|
||||
def __init__(self,stream,sep=None,strip=True,types=None):
|
||||
if isinstance(stream,str):
|
||||
self._stream = open(stream)
|
||||
elif hasattr(stream,'next'):
|
||||
self._stream = stream
|
||||
else:
|
||||
raise ValueError,'stream must be string or an iterator'
|
||||
try:
|
||||
iter(stream)
|
||||
self._stream = stream
|
||||
except TypeError:
|
||||
raise ValueError('stream must be string or an iterator')
|
||||
self._delimiter=sep
|
||||
self._strip=strip
|
||||
if types:
|
||||
self._types=[x for x in types]
|
||||
for i in xrange(len(self._types)):
|
||||
for i in range(len(self._types)):
|
||||
if self._types[i] is bool:
|
||||
self._types[i]=ColumnFile.str2bool
|
||||
else:
|
||||
@ -103,14 +102,14 @@ class ColumnFile(object):
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
ligne = self._stream.next()
|
||||
def __next__(self):
|
||||
ligne = next(self._stream)
|
||||
data = ligne.split(self._delimiter)
|
||||
if self._strip or self._types:
|
||||
data = [x.strip() for x in data]
|
||||
if self._types:
|
||||
it = endLessIterator(self._types)
|
||||
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
|
||||
data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
|
||||
return data
|
||||
|
||||
def taxonCmp(t1,t2):
|
||||
@ -125,14 +124,14 @@ def bsearchTaxon(taxonomy,taxid):
|
||||
begin = 0
|
||||
end = taxCount
|
||||
oldcheck=taxCount
|
||||
check = begin + end / 2
|
||||
check = int(begin + end / 2)
|
||||
while check != oldcheck and taxonomy[check][0]!=taxid :
|
||||
if taxonomy[check][0] < taxid:
|
||||
begin=check
|
||||
else:
|
||||
end=check
|
||||
oldcheck=check
|
||||
check = (begin + end) / 2
|
||||
check = int((begin + end) / 2)
|
||||
|
||||
|
||||
if taxonomy[check][0]==taxid:
|
||||
@ -152,22 +151,22 @@ def readNodeTable(file):
|
||||
str,str,bool,
|
||||
int,bool,int,
|
||||
bool,bool,bool,str))
|
||||
print >>sys.stderr,"Reading taxonomy dump file..."
|
||||
print("Reading taxonomy dump file...", file=sys.stderr)
|
||||
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
|
||||
print >>sys.stderr,"List all taxonomy rank..."
|
||||
print("List all taxonomy rank...", file=sys.stderr)
|
||||
ranks =list(set(x[1] for x in taxonomy))
|
||||
ranks.sort()
|
||||
ranks = dict(map(None,ranks,xrange(len(ranks))))
|
||||
ranks = {rank: index for index, rank in enumerate(ranks)}
|
||||
|
||||
print >>sys.stderr,"Sorting taxons..."
|
||||
taxonomy.sort(taxonCmp)
|
||||
print("Sorting taxons...", file=sys.stderr)
|
||||
taxonomy.sort(key=lambda x: x[0])
|
||||
|
||||
print >>sys.stderr,"Indexing taxonomy..."
|
||||
print("Indexing taxonomy...", file=sys.stderr)
|
||||
index = {}
|
||||
for t in taxonomy:
|
||||
index[t[0]]=bsearchTaxon(taxonomy, t[0])
|
||||
|
||||
print >>sys.stderr,"Indexing parent and rank..."
|
||||
print("Indexing parent and rank...", file=sys.stderr)
|
||||
for t in taxonomy:
|
||||
t[1]=ranks[t[1]]
|
||||
t[2]=index[t[2]]
|
||||
@ -203,7 +202,7 @@ def deletedNodeIterator(file):
|
||||
def readTaxonomyDump(taxdir):
|
||||
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
|
||||
|
||||
print >>sys.stderr,"Adding scientific name..."
|
||||
print("Adding scientific name...", file=sys.stderr)
|
||||
|
||||
alternativeName=[]
|
||||
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
|
||||
@ -211,66 +210,16 @@ def readTaxonomyDump(taxdir):
|
||||
if classname == 'scientific name':
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
print >>sys.stderr,"Adding taxid alias..."
|
||||
print("Adding taxid alias...", file=sys.stderr)
|
||||
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
|
||||
index[taxid]=index[current]
|
||||
|
||||
print >>sys.stderr,"Adding deleted taxid..."
|
||||
print("Adding deleted taxid...", file=sys.stderr)
|
||||
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
|
||||
index[taxid]=None
|
||||
|
||||
return taxonomy,ranks,alternativeName,index
|
||||
|
||||
def readTaxonomyDB(dbname):
|
||||
connection = psycopg2.connect(database=dbname)
|
||||
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
|
||||
taxonomy=[list(x) for x in cursor]
|
||||
|
||||
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
|
||||
ranks=cursor.fetchall()
|
||||
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
|
||||
|
||||
print >>sys.stderr,"Sorting taxons..."
|
||||
taxonomy.sort(taxonCmp)
|
||||
|
||||
print >>sys.stderr,"Indexing taxonomy..."
|
||||
index = {}
|
||||
for t in taxonomy:
|
||||
index[t[0]]=bsearchTaxon(taxonomy, t[0])
|
||||
|
||||
print >>sys.stderr,"Indexing parent and rank..."
|
||||
for t in taxonomy:
|
||||
t[1]=ranks[t[1]]
|
||||
try:
|
||||
t[2]=index[t[2]]
|
||||
except KeyError,e:
|
||||
if t[2] is None and t[0]==1:
|
||||
t[2]=index[t[0]]
|
||||
else:
|
||||
raise e
|
||||
|
||||
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
|
||||
|
||||
alternativeName=[]
|
||||
for taxid,name,classname in cursor:
|
||||
alternativeName.append((name,classname,index[taxid]))
|
||||
if classname == 'scientific name':
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
|
||||
|
||||
print >>sys.stderr,"Adding taxid alias..."
|
||||
for taxid,current in cursor:
|
||||
if current is not None:
|
||||
index[taxid]=index[current]
|
||||
else:
|
||||
index[taxid]=None
|
||||
|
||||
|
||||
return taxonomy,ranks,alternativeName,index
|
||||
|
||||
#####
|
||||
#
|
||||
#
|
||||
@ -282,22 +231,27 @@ def readTaxonomyDB(dbname):
|
||||
def entryIterator(file):
|
||||
file = universalOpen(file)
|
||||
rep =[]
|
||||
for ligne in file:
|
||||
ligne = file.readline()
|
||||
while ligne:
|
||||
rep.append(ligne)
|
||||
if ligne == '//\n':
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
ligne = file.readline()
|
||||
|
||||
def fastaEntryIterator(file):
|
||||
file = universalOpen(file)
|
||||
rep =[]
|
||||
for ligne in file:
|
||||
ligne = file.readline()
|
||||
while ligne:
|
||||
if ligne[0] == '>' and rep:
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
rep.append(ligne)
|
||||
ligne = file.readline()
|
||||
|
||||
if rep:
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
@ -418,7 +372,7 @@ def taxonomyInfo(entry,connection):
|
||||
|
||||
def ecoSeqPacker(sq):
|
||||
|
||||
compactseq = gzip.zlib.compress(sq['sequence'],9)
|
||||
compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
|
||||
cptseqlength = len(compactseq)
|
||||
delength = len(sq['definition'])
|
||||
|
||||
@ -427,11 +381,11 @@ def ecoSeqPacker(sq):
|
||||
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
|
||||
totalSize,
|
||||
sq['taxid'],
|
||||
sq['id'],
|
||||
bytes(sq['id'],"ascii"),
|
||||
delength,
|
||||
len(sq['sequence']),
|
||||
cptseqlength,
|
||||
sq['definition'],
|
||||
bytes(sq['definition'],"ascii"),
|
||||
compactseq)
|
||||
|
||||
assert len(packed) == totalSize+4, "error in sequence packing"
|
||||
@ -450,7 +404,7 @@ def ecoTaxPacker(tx):
|
||||
tx[1],
|
||||
tx[2],
|
||||
namelength,
|
||||
tx[3])
|
||||
bytes(tx[3],"ascii"))
|
||||
|
||||
return packed
|
||||
|
||||
@ -460,7 +414,7 @@ def ecoRankPacker(rank):
|
||||
|
||||
packed = struct.pack('> I %ds' % namelength,
|
||||
namelength,
|
||||
rank)
|
||||
bytes(rank, 'ascii'))
|
||||
|
||||
return packed
|
||||
|
||||
@ -476,8 +430,8 @@ def ecoNamePacker(name):
|
||||
namelength,
|
||||
classlength,
|
||||
name[2],
|
||||
name[0],
|
||||
name[1])
|
||||
bytes(name[0], 'ascii'),
|
||||
bytes(name[1], 'ascii'))
|
||||
|
||||
return packed
|
||||
|
||||
@ -505,11 +459,11 @@ def ecoSeqWriter(file,input,taxindex,parser):
|
||||
skipped.append(entry['id'])
|
||||
where = universalTell(input)
|
||||
progressBar(where, inputsize)
|
||||
print >>sys.stderr," Readed sequences : %d " % seqcount,
|
||||
print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr)
|
||||
else:
|
||||
skipped.append(entry['id'])
|
||||
|
||||
print >>sys.stderr
|
||||
print(file=sys.stderr)
|
||||
output.seek(0,0)
|
||||
output.write(struct.pack('> I',seqcount))
|
||||
|
||||
@ -530,7 +484,7 @@ def ecoRankWriter(file,ranks):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(ranks)))
|
||||
|
||||
rankNames = ranks.keys()
|
||||
rankNames = list(ranks.keys())
|
||||
rankNames.sort()
|
||||
|
||||
for rank in rankNames:
|
||||
@ -552,7 +506,7 @@ def ecoNameWriter(file,names):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(names)))
|
||||
|
||||
names.sort(nameCmp)
|
||||
names.sort(key=lambda x:x[0].upper())
|
||||
|
||||
for name in names:
|
||||
output.write(ecoNamePacker(name))
|
||||
@ -573,8 +527,8 @@ def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
|
||||
taxonomy[3],
|
||||
parser)
|
||||
if sk:
|
||||
print >>sys.stderr,"Skipped entry :"
|
||||
print >>sys.stderr,sk
|
||||
print("Skipped entry :", file=sys.stderr)
|
||||
print(sk, file=sys.stderr)
|
||||
|
||||
def ecoParseOptions(arguments):
|
||||
opt = {
|
||||
@ -618,34 +572,30 @@ def ecoParseOptions(arguments):
|
||||
opt['parser']=sequenceIteratorFactory(emblEntryParser,
|
||||
entryIterator)
|
||||
else:
|
||||
raise ValueError,'Unknown option %s' % name
|
||||
raise ValueError('Unknown option %s' % name)
|
||||
|
||||
return opt,filenames
|
||||
|
||||
def printHelp():
|
||||
print "-----------------------------------"
|
||||
print " ecoPCRFormat.py"
|
||||
print "-----------------------------------"
|
||||
print "ecoPCRFormat.py [option] <argument>"
|
||||
print "-----------------------------------"
|
||||
print "-e --embl :[E]mbl format"
|
||||
print "-f --fasta :[F]asta format"
|
||||
print "-g --genbank :[G]enbank format"
|
||||
print "-h --help :[H]elp - print this help"
|
||||
print "-n --name :[N]ame of the new database created"
|
||||
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
|
||||
print " :bcp-like dump from GenBank taxonomy database."
|
||||
print "-----------------------------------"
|
||||
print("-----------------------------------")
|
||||
print(" ecoPCRFormat.py")
|
||||
print("-----------------------------------")
|
||||
print("ecoPCRFormat.py [option] <argument>")
|
||||
print("-----------------------------------")
|
||||
print("-e --embl :[E]mbl format")
|
||||
print("-f --fasta :[F]asta format")
|
||||
print("-g --genbank :[G]enbank format")
|
||||
print("-h --help :[H]elp - print this help")
|
||||
print("-n --name :[N]ame of the new database created")
|
||||
print("-t --taxonomy :[T]axonomy - path to the taxonomy database")
|
||||
print(" :bcp-like dump from GenBank taxonomy database.")
|
||||
print("-----------------------------------")
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
opt,filenames = ecoParseOptions(sys.argv[1:])
|
||||
|
||||
if opt['taxmod']=='dump':
|
||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||
elif opt['taxmod']=='db':
|
||||
taxonomy = readTaxonomyDB(opt['taxdb'])
|
||||
|
||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||
|
||||
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||||
|
||||
|
Reference in New Issue
Block a user