diff --git a/.gitignore b/.gitignore index 9f247c2..e9dcb61 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,14 @@ - -# /src/ +*.o /src/*.P /src/*.a /src/ecoPrimer - -# /src/libecoPCR/ /src/libecoPCR/*.P /src/libecoPCR/*.a - -# /src/libecoprimer/ +/src/libecoPCR/*.o /src/libecoprimer/*.P /src/libecoprimer/*.a - -# /src/libthermo/ +/src/libecoprimer/*.o /src/libthermo/*.P +/src/libthermo/*.a +/src/libthermo/*.o +phylonorway.* diff --git a/src/ecoprimer.c b/src/ecoprimer.c index b5ab37f..d588256 100644 --- a/src/ecoprimer.c +++ b/src/ecoprimer.c @@ -171,13 +171,9 @@ void printapair(int32_t index,ppair_t pair, poptions_t options) bool_t good2=pair->p2->good; bool_t goodtmp; bool_t strand; - uint32_t i, j; - float temp; - CNNParams nnparams; + uint32_t i; - //nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT,SALT_METHOD_SANTALUCIA); - - char *c; + const char *c; char p1[32]; char p2[32]; @@ -376,21 +372,20 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb) { ppair_t* sortedpairs; - ppair_t* index; + // ppair_t* index; ppairlist_t pl; size_t i,j; size_t count; char *taxon[]={"taxon","taxa"}; ecotx_t *current_taxon; //pairset pair_sets; - pairset *pset = NULL; //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); fprintf(stderr,"Total pair count : %d\n",pairs->count); sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs"); - index=sortedpairs; + // index=sortedpairs; pl=pairs->first; j=0; while(pl->next) @@ -475,10 +470,10 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, if (options->filter_on_links) { - fprintf (stderr, "Old size: %d, ", count); + fprintf (stderr, "Old size: %ld, ", count); count = primers_changeSortedArray (&sortedpairs, count, options); //count = primers_filterWithGivenLinks (&sortedpairs, count, options); - fprintf (stderr, "New size: %d\n", count); + fprintf (stderr, "New size: %ld\n", count); if (count == 0) { @@ -803,9 +798,9 @@ int main(int argc, char **argv) if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA - if (options.salt < 0.01 || options.salt > 0.3) //if salt value out of literature values + if (options.salt < 0.01 || options.salt > 0.3) {//if salt value out of literature values options.salt = DEF_SALT; //set to default - + } nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod); fprintf(stderr,"Reading taxonomy database ..."); @@ -851,7 +846,7 @@ int main(int argc, char **argv) fprintf(stderr,"\nIndexing words in sequences\n"); words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); - fprintf(stderr,"\n Strict primer count : %d\n",words->size); + fprintf(stderr,"\n Strict primer count : %lld\n",words->size); /*/TR Testing fprintf(stderr,"\nReducing for debugging\n"); @@ -871,7 +866,7 @@ int main(int argc, char **argv) if (options.no_multi_match) { (void)filterMultiStrictPrimer(words); - fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size); + fprintf(stderr,"\n Strict primer with single match count : %lld\n",words->size); } @@ -921,7 +916,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options) { uint32_t i, k; pwordcount_t new_words; - char *rwrd; + const char *rwrd; char dwrd[20]; /*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA", "TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT", @@ -981,7 +976,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options) void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options) { - char *wrd; + const char *wrd; uint32_t i, j; char *twrd = "GCCTGTTTACCAAAAACA"; diff --git a/src/global.mk b/src/global.mk index 7cf2763..301f4e6 100644 --- a/src/global.mk +++ b/src/global.mk @@ -1,5 +1,5 @@ MACHINE=MAC_OS_X -LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo +LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo -L/usr/local/lib MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< CC=gcc diff --git a/src/libecoPCR/ecoMalloc.c b/src/libecoPCR/ecoMalloc.c index c34f97c..2509255 100644 --- a/src/libecoPCR/ecoMalloc.c +++ b/src/libecoPCR/ecoMalloc.c @@ -2,7 +2,7 @@ #include static int eco_log_malloc = 0; -static size_t eco_amount_malloc=0; +//static size_t eco_amount_malloc=0; static size_t eco_chunk_malloc=0; void eco_trace_memory_allocation() @@ -37,7 +37,7 @@ void *eco_malloc(int64_t chunksize, if (eco_log_malloc) fprintf(stderr, - "Memory segment located at %p of size %d is allocated (file : %s [%d])", + "Memory segment located at %p of size %lld is allocated (file : %s [%d])", chunk, chunksize, filename, @@ -65,7 +65,7 @@ void *eco_realloc(void *chunk, if (!newchunk) { - fprintf(stderr,"Requested memory : %d\n",newsize); + fprintf(stderr,"Requested memory : %lld\n",newsize); ecoError(ECO_MEM_ERROR,error_message,filename,line); } if (!chunk) @@ -73,7 +73,7 @@ void *eco_realloc(void *chunk, if (eco_log_malloc) fprintf(stderr, - "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", + "Old memory segment %p is reallocated at %p with a size of %lld (file : %s [%d])", chunk, newchunk, newsize, diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h index f2fe663..c736167 100644 --- a/src/libecoPCR/ecoPCR.h +++ b/src/libecoPCR/ecoPCR.h @@ -21,8 +21,8 @@ typedef struct { int32_t taxid; char AC[20]; int32_t DE_length; - int32_t SQ_length; - int32_t CSQ_length; /*what is this CSQ_length ? */ + u_int32_t SQ_length; + u_int32_t CSQ_length; /*what is this CSQ_length ? */ char data[1]; @@ -30,7 +30,7 @@ typedef struct { typedef struct { int32_t taxid; - int32_t SQ_length; + u_int32_t SQ_length; int32_t isexample; char *AC; char *DE; diff --git a/src/libecoPCR/ecoseq.c b/src/libecoPCR/ecoseq.c index 1368be9..9483ade 100644 --- a/src/libecoPCR/ecoseq.c +++ b/src/libecoPCR/ecoseq.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/src/libecoprimer/PrimerSets.c b/src/libecoprimer/PrimerSets.c index 6417961..9725f9f 100644 --- a/src/libecoprimer/PrimerSets.c +++ b/src/libecoprimer/PrimerSets.c @@ -1139,7 +1139,7 @@ void sets_by_SimulatedAnealing (pairset *pair_set, { pair_set = extend_set_randomly (NULL, ¶ms, 3); printf("\nStart Random seed set for Simulated :\n"); - print_set_info (&pair_set, ¶ms); + print_set_info (pair_set, ¶ms); } min_spc = max_spc = pair_set->set_specificity; min_cov = max_cov = pair_set->set_coverage; @@ -1566,7 +1566,8 @@ int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t * size_t primers_changeSortedArray (ppair_t ** pairs, size_t sorted_count, poptions_t options) { - int32_t i, j, k, l, total_links; + int32_t i, k, total_links; + u_int32_t j; int *owi; int *iwi; int allowedtaxa; @@ -1579,14 +1580,14 @@ size_t primers_changeSortedArray (ppair_t ** pairs, idx_set = ECOMALLOC(slots*sizeof (int32_t), "Could not allocate memory for index set."); - for (i=0; iwellIdentifiedSeqs; passed = FALSE; for (j=0; jwellIdentifiedSeqs; total_links = 0; @@ -1610,9 +1611,10 @@ size_t primers_changeSortedArray (ppair_t ** pairs, if (options->max_links_percent > 0) { allowedtaxa = options->max_links_percent; - if (total_links > allowedtaxa) + if (total_links > allowedtaxa){ passed = TRUE; - break; + } + break; } else if (!(total_links > 5 && total_links <= allowedtaxa)) @@ -1630,7 +1632,7 @@ size_t primers_changeSortedArray (ppair_t ** pairs, for (j=0; jdbsize; k++) if ((*pairs)[j]->coveredSeqs[k] == 1) cov[k] = 1; diff --git a/src/libecoprimer/ahocorasick.c b/src/libecoprimer/ahocorasick.c index 078be0d..5adc10c 100755 --- a/src/libecoprimer/ahocorasick.c +++ b/src/libecoprimer/ahocorasick.c @@ -213,7 +213,6 @@ pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsi char *base; int8_t code; uint32_t goodPrimers=0; - static int iii=0; //inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum); diff --git a/src/libecoprimer/apat_search.c b/src/libecoprimer/apat_search.c index 22ae905..fc42b7a 100644 --- a/src/libecoprimer/apat_search.c +++ b/src/libecoprimer/apat_search.c @@ -36,7 +36,7 @@ int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat, ppatternParam_t param, StackiPtr stkpos) { - int32_t pos; + u_int32_t pos; uint32_t smask, r; uint8_t *data; int32_t end; @@ -84,7 +84,7 @@ int32_t ManberSub(pecoseq_t pseq,ppattern_t pat, StackiPtr stkpos) { int e, found; - int32_t pos; + u_int32_t pos; uint32_t smask, cmask, sindx; uint32_t *pr, r[2 * MAX_PAT_ERR + 2]; uint8_t *data; diff --git a/src/libecoprimer/aproxpattern.c b/src/libecoprimer/aproxpattern.c index 0cf349e..3433044 100644 --- a/src/libecoprimer/aproxpattern.c +++ b/src/libecoprimer/aproxpattern.c @@ -201,7 +201,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3 data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum; goodPrimers+=data[w].good? 1:0; - fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r", + fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r", i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length), data[w].inexample,data[w].outexample); @@ -224,8 +224,8 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3 ECOFREE(data[w].reversePos,"Free direct count table"); } - fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); - fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size); + fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); + fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",w,words->size); primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table"); primers->primers=ECOREALLOC(data, diff --git a/src/libecoprimer/hashencoder.h b/src/libecoprimer/hashencoder.h index 00540d7..19e5258 100644 --- a/src/libecoprimer/hashencoder.h +++ b/src/libecoprimer/hashencoder.h @@ -8,14 +8,6 @@ #ifndef HASHENCODER_H_ #define HASHENCODER_H_ -static int8_t encoder[] = {0, // A - -1, // b - 1, // C - -1,-1,-1, // d, e, f - 2, // G - -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s - 3,3, // T,U - -1,-1,-1,-1,-1}; // v,w,x,y,z - +extern int8_t encoder[]; #endif /* HASHENCODER_H_ */ diff --git a/src/libecoprimer/hashsequence.c b/src/libecoprimer/hashsequence.c index e8f1a9a..4184e62 100644 --- a/src/libecoprimer/hashsequence.c +++ b/src/libecoprimer/hashsequence.c @@ -12,6 +12,16 @@ static int cmpword(const void *x,const void *y); #include "hashencoder.h" +int8_t encoder[] = {0, // A + -1, // b + 1, // C + -1,-1,-1, // d, e, f + 2, // G + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s + 3,3, // T,U + -1,-1,-1,-1,-1}; // v,w,x,y,z + + uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq) { uint32_t wordcount; @@ -257,6 +267,6 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word) char ecoComplementChar(char base) { - return (base < 4)? !base & 3: 4; + return (base < 4)? ~base & 3: 4; } diff --git a/src/libecoprimer/strictprimers.c b/src/libecoprimer/strictprimers.c index dfd6211..9bf5116 100644 --- a/src/libecoprimer/strictprimers.c +++ b/src/libecoprimer/strictprimers.c @@ -205,8 +205,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, } else { - uint32_t s; - s = strictprimers->size; +// uint32_t s; +// s = strictprimers->size; // DEBUG_LOG("stack size : %u",s); addSeqToWordCountTable(strictprimers,options->primer_length, options->circular, @@ -223,7 +223,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, logfile = fopen(logfilename,"a"); seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) + timeval_subtract(&(usage.ru_stime),&(start.ru_stime)); - fprintf(logfile,"%d\t%llu\t%lu\t%8.3f\t%8.3e\n",i, + fprintf(logfile,"%d\t%llu\t%llu\t%8.3f\t%8.3e\n",i, (long long unsigned)totallength, strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)), seconde,seconde/(double)totallength); @@ -248,9 +248,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, sizeof(word_t)*strictprimers->size, "Cannot reallocate strict primer table"); - if (neededWords) + if (neededWords){ ECOFREE(neededWords,"Clean needed word table"); - + } //TR: Somehow for some primers strictcount value is extremely large hence invalid //we need to remove these primers from the list j = strictprimers->size+1; diff --git a/src/libecoprimer/taxstats.c b/src/libecoprimer/taxstats.c index 0898303..7665f43 100644 --- a/src/libecoprimer/taxstats.c +++ b/src/libecoprimer/taxstats.c @@ -231,8 +231,8 @@ static int cmpamp(const void *ampf1, const void* ampf2) char cd1; char cd2; int len = 0; - char *ch1; - char *ch2; + const char *ch1; + const char *ch2; int incr1; int incr2; diff --git a/src/libthermo/nnparams.c b/src/libthermo/nnparams.c index f4b4937..ab74604 100644 --- a/src/libthermo/nnparams.c +++ b/src/libthermo/nnparams.c @@ -17,6 +17,15 @@ double forbidden_entropy; +char bpencoder[] = { 1, // A + 0, // b + 2, // C + 0,0,0, // d, e, f + 3, // G + 0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s + 4,0, // T,U + 0,0,0,0,0}; // v,w,x,y,z + double nparam_GetInitialEntropy(PNNParams nparm) { @@ -470,7 +479,7 @@ int nparam_CountGCContent(char * seq ) { return count; } -void nparam_CleanSeq (char* inseq, char* outseq, int len) +void nparam_CleanSeq (const char* inseq, char* outseq, int len) { int seqlen = strlen (inseq); int i, j; @@ -508,7 +517,7 @@ void nparam_CleanSeq (char* inseq, char* outseq, int len) } //Calculate TM for given sequence against its complement -double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len) +double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len) { double thedH = 0; //double thedS = nparam_GetInitialEntropy(nparm); @@ -520,7 +529,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len) char c4; unsigned int i; char nseq[50]; - char *useq = seq; + const char *useq = seq; nparam_CleanSeq (seq, nseq, len); useq = nseq; @@ -533,7 +542,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len) c4 = GETNUMCODE(useq[i]); - thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); + thedH += nparm->dH[(u_int8_t) c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); } //printf("------------------\n"); @@ -543,7 +552,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len) return mtemp; } -double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len) +double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len) { double thedH = 0; //double thedS = nparam_GetInitialEntropy(nparm); @@ -575,7 +584,7 @@ double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len) //fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2)); - thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); + thedH += nparm->dH[(u_int8_t)c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); } //fprintf(stderr,"------------------\n"); diff --git a/src/libthermo/nnparams.h b/src/libthermo/nnparams.h index 5520ae1..b88825e 100644 --- a/src/libthermo/nnparams.h +++ b/src/libthermo/nnparams.h @@ -16,8 +16,8 @@ //#include "../libecoprimer/ecoprimer.h" // following defines to simplify coding... -#define ndH(a,b,c,d) nparm->dH[a][b][c][d] -#define ndS(a,b,c,d) nparm->dS[a][b][c][d] +#define ndH(a,b,c,d) nparm->dH[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d] +#define ndS(a,b,c,d) nparm->dS[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d] #define forbidden_enthalpy 1000000000000000000.0f #define R 1.987f #define SALT_METHOD_SANTALUCIA 1 @@ -33,14 +33,7 @@ extern double forbidden_entropy; -static char bpencoder[] = { 1, // A - 0, // b - 2, // C - 0,0,0, // d, e, f - 3, // G - 0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s - 4,0, // T,U - 0,0,0,0,0}; // v,w,x,y,z +extern char bpencoder[]; // v,w,x,y,z typedef struct CNNParams_st @@ -62,8 +55,8 @@ int nparam_CountGCContent(char * seq ); double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1); double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1); double nparam_CalcTM(double entropy,double enthalpy); -double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len); -double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len); +double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len); +double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len); double nparam_GetInitialEntropy(PNNParams nparm) ; double calculateMeltingTemperatureBasic (char * seq); diff --git a/src/libthermo/thermostats.c b/src/libthermo/thermostats.c index 9141e17..0cbbcd5 100644 --- a/src/libthermo/thermostats.c +++ b/src/libthermo/thermostats.c @@ -28,17 +28,15 @@ word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand) void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options) { - size_t i, j,k,l; + size_t i, j; uint32_t bp1,bp2; - uint32_t ep1,ep2; word_t w1; word_t w2; bool_t strand; - char *sq,*sq1,*sq2,*c; + char *sq; char prmrd[50]; char prmrr[50]; - char sqsite[50]; double mtemp; for (i = 0; i < count; i++) diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py index 3884001..b849588 100755 --- a/tools/ecoPCRFormat.py +++ b/tools/ecoPCRFormat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import re import gzip @@ -6,12 +6,9 @@ import struct import sys import time import getopt +from functools import cmp_to_key -try: - import psycopg2 - _dbenable=True -except ImportError: - _dbenable=False +_dbenable=False ##### # @@ -80,15 +77,17 @@ class ColumnFile(object): def __init__(self,stream,sep=None,strip=True,types=None): if isinstance(stream,str): self._stream = open(stream) - elif hasattr(stream,'next'): - self._stream = stream else: - raise ValueError,'stream must be string or an iterator' + try: + iter(stream) + self._stream = stream + except TypeError: + raise ValueError('stream must be string or an iterator') self._delimiter=sep self._strip=strip if types: self._types=[x for x in types] - for i in xrange(len(self._types)): + for i in range(len(self._types)): if self._types[i] is bool: self._types[i]=ColumnFile.str2bool else: @@ -103,14 +102,14 @@ class ColumnFile(object): def __iter__(self): return self - def next(self): - ligne = self._stream.next() + def __next__(self): + ligne = next(self._stream) data = ligne.split(self._delimiter) if self._strip or self._types: data = [x.strip() for x in data] if self._types: it = endLessIterator(self._types) - data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] + data = [x[1](x[0]) for x in ((y,next(it)) for y in data)] return data def taxonCmp(t1,t2): @@ -125,14 +124,14 @@ def bsearchTaxon(taxonomy,taxid): begin = 0 end = taxCount oldcheck=taxCount - check = begin + end / 2 + check = int(begin + end / 2) while check != oldcheck and taxonomy[check][0]!=taxid : if taxonomy[check][0] < taxid: begin=check else: end=check oldcheck=check - check = (begin + end) / 2 + check = int((begin + end) / 2) if taxonomy[check][0]==taxid: @@ -152,22 +151,22 @@ def readNodeTable(file): str,str,bool, int,bool,int, bool,bool,bool,str)) - print >>sys.stderr,"Reading taxonomy dump file..." + print("Reading taxonomy dump file...", file=sys.stderr) taxonomy=[[n[0],n[2],n[1]] for n in nodes] - print >>sys.stderr,"List all taxonomy rank..." + print("List all taxonomy rank...", file=sys.stderr) ranks =list(set(x[1] for x in taxonomy)) ranks.sort() - ranks = dict(map(None,ranks,xrange(len(ranks)))) + ranks = {rank: index for index, rank in enumerate(ranks)} - print >>sys.stderr,"Sorting taxons..." - taxonomy.sort(taxonCmp) + print("Sorting taxons...", file=sys.stderr) + taxonomy.sort(key=lambda x: x[0]) - print >>sys.stderr,"Indexing taxonomy..." + print("Indexing taxonomy...", file=sys.stderr) index = {} for t in taxonomy: index[t[0]]=bsearchTaxon(taxonomy, t[0]) - print >>sys.stderr,"Indexing parent and rank..." + print("Indexing parent and rank...", file=sys.stderr) for t in taxonomy: t[1]=ranks[t[1]] t[2]=index[t[2]] @@ -203,7 +202,7 @@ def deletedNodeIterator(file): def readTaxonomyDump(taxdir): taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir) - print >>sys.stderr,"Adding scientific name..." + print("Adding scientific name...", file=sys.stderr) alternativeName=[] for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir): @@ -211,66 +210,16 @@ def readTaxonomyDump(taxdir): if classname == 'scientific name': taxonomy[index[taxid]].append(name) - print >>sys.stderr,"Adding taxid alias..." + print("Adding taxid alias...", file=sys.stderr) for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir): index[taxid]=index[current] - print >>sys.stderr,"Adding deleted taxid..." + print("Adding deleted taxid...", file=sys.stderr) for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir): index[taxid]=None return taxonomy,ranks,alternativeName,index -def readTaxonomyDB(dbname): - connection = psycopg2.connect(database=dbname) - - cursor = connection.cursor() - cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon") - taxonomy=[list(x) for x in cursor] - - cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class") - ranks=cursor.fetchall() - ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks)))) - - print >>sys.stderr,"Sorting taxons..." - taxonomy.sort(taxonCmp) - - print >>sys.stderr,"Indexing taxonomy..." - index = {} - for t in taxonomy: - index[t[0]]=bsearchTaxon(taxonomy, t[0]) - - print >>sys.stderr,"Indexing parent and rank..." - for t in taxonomy: - t[1]=ranks[t[1]] - try: - t[2]=index[t[2]] - except KeyError,e: - if t[2] is None and t[0]==1: - t[2]=index[t[0]] - else: - raise e - - cursor.execute("select taxid,name,category from ncbi_taxonomy.name") - - alternativeName=[] - for taxid,name,classname in cursor: - alternativeName.append((name,classname,index[taxid])) - if classname == 'scientific name': - taxonomy[index[taxid]].append(name) - - cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias") - - print >>sys.stderr,"Adding taxid alias..." - for taxid,current in cursor: - if current is not None: - index[taxid]=index[current] - else: - index[taxid]=None - - - return taxonomy,ranks,alternativeName,index - ##### # # @@ -282,22 +231,27 @@ def readTaxonomyDB(dbname): def entryIterator(file): file = universalOpen(file) rep =[] - for ligne in file: + ligne = file.readline() + while ligne: rep.append(ligne) if ligne == '//\n': rep = ''.join(rep) yield rep rep = [] + ligne = file.readline() def fastaEntryIterator(file): file = universalOpen(file) rep =[] - for ligne in file: + ligne = file.readline() + while ligne: if ligne[0] == '>' and rep: rep = ''.join(rep) yield rep rep = [] rep.append(ligne) + ligne = file.readline() + if rep: rep = ''.join(rep) yield rep @@ -418,7 +372,7 @@ def taxonomyInfo(entry,connection): def ecoSeqPacker(sq): - compactseq = gzip.zlib.compress(sq['sequence'],9) + compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9) cptseqlength = len(compactseq) delength = len(sq['definition']) @@ -427,11 +381,11 @@ def ecoSeqPacker(sq): packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength), totalSize, sq['taxid'], - sq['id'], + bytes(sq['id'],"ascii"), delength, len(sq['sequence']), cptseqlength, - sq['definition'], + bytes(sq['definition'],"ascii"), compactseq) assert len(packed) == totalSize+4, "error in sequence packing" @@ -450,7 +404,7 @@ def ecoTaxPacker(tx): tx[1], tx[2], namelength, - tx[3]) + bytes(tx[3],"ascii")) return packed @@ -460,7 +414,7 @@ def ecoRankPacker(rank): packed = struct.pack('> I %ds' % namelength, namelength, - rank) + bytes(rank, 'ascii')) return packed @@ -476,8 +430,8 @@ def ecoNamePacker(name): namelength, classlength, name[2], - name[0], - name[1]) + bytes(name[0], 'ascii'), + bytes(name[1], 'ascii')) return packed @@ -505,11 +459,11 @@ def ecoSeqWriter(file,input,taxindex,parser): skipped.append(entry['id']) where = universalTell(input) progressBar(where, inputsize) - print >>sys.stderr," Readed sequences : %d " % seqcount, + print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr) else: skipped.append(entry['id']) - print >>sys.stderr + print(file=sys.stderr) output.seek(0,0) output.write(struct.pack('> I',seqcount)) @@ -530,7 +484,7 @@ def ecoRankWriter(file,ranks): output = open(file,'wb') output.write(struct.pack('> I',len(ranks))) - rankNames = ranks.keys() + rankNames = list(ranks.keys()) rankNames.sort() for rank in rankNames: @@ -552,7 +506,7 @@ def ecoNameWriter(file,names): output = open(file,'wb') output.write(struct.pack('> I',len(names))) - names.sort(nameCmp) + names.sort(key=lambda x:x[0].upper()) for name in names: output.write(ecoNamePacker(name)) @@ -573,8 +527,8 @@ def ecoDBWriter(prefix,taxonomy,seqFileNames,parser): taxonomy[3], parser) if sk: - print >>sys.stderr,"Skipped entry :" - print >>sys.stderr,sk + print("Skipped entry :", file=sys.stderr) + print(sk, file=sys.stderr) def ecoParseOptions(arguments): opt = { @@ -618,34 +572,30 @@ def ecoParseOptions(arguments): opt['parser']=sequenceIteratorFactory(emblEntryParser, entryIterator) else: - raise ValueError,'Unknown option %s' % name + raise ValueError('Unknown option %s' % name) return opt,filenames def printHelp(): - print "-----------------------------------" - print " ecoPCRFormat.py" - print "-----------------------------------" - print "ecoPCRFormat.py [option] " - print "-----------------------------------" - print "-e --embl :[E]mbl format" - print "-f --fasta :[F]asta format" - print "-g --genbank :[G]enbank format" - print "-h --help :[H]elp - print this help" - print "-n --name :[N]ame of the new database created" - print "-t --taxonomy :[T]axonomy - path to the taxonomy database" - print " :bcp-like dump from GenBank taxonomy database." - print "-----------------------------------" + print("-----------------------------------") + print(" ecoPCRFormat.py") + print("-----------------------------------") + print("ecoPCRFormat.py [option] ") + print("-----------------------------------") + print("-e --embl :[E]mbl format") + print("-f --fasta :[F]asta format") + print("-g --genbank :[G]enbank format") + print("-h --help :[H]elp - print this help") + print("-n --name :[N]ame of the new database created") + print("-t --taxonomy :[T]axonomy - path to the taxonomy database") + print(" :bcp-like dump from GenBank taxonomy database.") + print("-----------------------------------") if __name__ == '__main__': opt,filenames = ecoParseOptions(sys.argv[1:]) - if opt['taxmod']=='dump': - taxonomy = readTaxonomyDump(opt['taxdir']) - elif opt['taxmod']=='db': - taxonomy = readTaxonomyDB(opt['taxdb']) - - + taxonomy = readTaxonomyDump(opt['taxdir']) + ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])