Update the code to account for the evolution of the language

This commit is contained in:
2023-06-29 12:12:46 +02:00
parent 73236c72a8
commit 92826de147
18 changed files with 146 additions and 199 deletions

14
.gitignore vendored
View File

@ -1,16 +1,14 @@
# /src/
*.o
/src/*.P
/src/*.a
/src/ecoPrimer
# /src/libecoPCR/
/src/libecoPCR/*.P
/src/libecoPCR/*.a
# /src/libecoprimer/
/src/libecoPCR/*.o
/src/libecoprimer/*.P
/src/libecoprimer/*.a
# /src/libthermo/
/src/libecoprimer/*.o
/src/libthermo/*.P
/src/libthermo/*.a
/src/libthermo/*.o
phylonorway.*

View File

@ -171,13 +171,9 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
bool_t good2=pair->p2->good;
bool_t goodtmp;
bool_t strand;
uint32_t i, j;
float temp;
CNNParams nnparams;
uint32_t i;
//nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT,SALT_METHOD_SANTALUCIA);
char *c;
const char *c;
char p1[32];
char p2[32];
@ -376,21 +372,20 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb)
{
ppair_t* sortedpairs;
ppair_t* index;
// ppair_t* index;
ppairlist_t pl;
size_t i,j;
size_t count;
char *taxon[]={"taxon","taxa"};
ecotx_t *current_taxon;
//pairset pair_sets;
pairset *pset = NULL;
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
fprintf(stderr,"Total pair count : %d\n",pairs->count);
sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
index=sortedpairs;
// index=sortedpairs;
pl=pairs->first;
j=0;
while(pl->next)
@ -475,10 +470,10 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
if (options->filter_on_links)
{
fprintf (stderr, "Old size: %d, ", count);
fprintf (stderr, "Old size: %ld, ", count);
count = primers_changeSortedArray (&sortedpairs, count, options);
//count = primers_filterWithGivenLinks (&sortedpairs, count, options);
fprintf (stderr, "New size: %d\n", count);
fprintf (stderr, "New size: %ld\n", count);
if (count == 0)
{
@ -803,9 +798,9 @@ int main(int argc, char **argv)
if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY
options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA
if (options.salt < 0.01 || options.salt > 0.3) //if salt value out of literature values
if (options.salt < 0.01 || options.salt > 0.3) {//if salt value out of literature values
options.salt = DEF_SALT; //set to default
}
nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod);
fprintf(stderr,"Reading taxonomy database ...");
@ -851,7 +846,7 @@ int main(int argc, char **argv)
fprintf(stderr,"\nIndexing words in sequences\n");
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
fprintf(stderr,"\n Strict primer count : %lld\n",words->size);
/*/TR Testing
fprintf(stderr,"\nReducing for debugging\n");
@ -871,7 +866,7 @@ int main(int argc, char **argv)
if (options.no_multi_match)
{
(void)filterMultiStrictPrimer(words);
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
fprintf(stderr,"\n Strict primer with single match count : %lld\n",words->size);
}
@ -921,7 +916,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
{
uint32_t i, k;
pwordcount_t new_words;
char *rwrd;
const char *rwrd;
char dwrd[20];
/*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA",
"TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT",
@ -981,7 +976,7 @@ pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options)
{
char *wrd;
const char *wrd;
uint32_t i, j;
char *twrd = "GCCTGTTTACCAAAAACA";

View File

@ -1,5 +1,5 @@
MACHINE=MAC_OS_X
LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo
LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo -L/usr/local/lib
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc

View File

@ -2,7 +2,7 @@
#include <stdlib.h>
static int eco_log_malloc = 0;
static size_t eco_amount_malloc=0;
//static size_t eco_amount_malloc=0;
static size_t eco_chunk_malloc=0;
void eco_trace_memory_allocation()
@ -37,7 +37,7 @@ void *eco_malloc(int64_t chunksize,
if (eco_log_malloc)
fprintf(stderr,
"Memory segment located at %p of size %d is allocated (file : %s [%d])",
"Memory segment located at %p of size %lld is allocated (file : %s [%d])",
chunk,
chunksize,
filename,
@ -65,7 +65,7 @@ void *eco_realloc(void *chunk,
if (!newchunk)
{
fprintf(stderr,"Requested memory : %d\n",newsize);
fprintf(stderr,"Requested memory : %lld\n",newsize);
ecoError(ECO_MEM_ERROR,error_message,filename,line);
}
if (!chunk)
@ -73,7 +73,7 @@ void *eco_realloc(void *chunk,
if (eco_log_malloc)
fprintf(stderr,
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
"Old memory segment %p is reallocated at %p with a size of %lld (file : %s [%d])",
chunk,
newchunk,
newsize,

View File

@ -21,8 +21,8 @@ typedef struct {
int32_t taxid;
char AC[20];
int32_t DE_length;
int32_t SQ_length;
int32_t CSQ_length; /*what is this CSQ_length ? */
u_int32_t SQ_length;
u_int32_t CSQ_length; /*what is this CSQ_length ? */
char data[1];
@ -30,7 +30,7 @@ typedef struct {
typedef struct {
int32_t taxid;
int32_t SQ_length;
u_int32_t SQ_length;
int32_t isexample;
char *AC;
char *DE;

View File

@ -2,6 +2,7 @@
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>

View File

@ -1139,7 +1139,7 @@ void sets_by_SimulatedAnealing (pairset *pair_set,
{
pair_set = extend_set_randomly (NULL, &params, 3);
printf("\nStart Random seed set for Simulated :\n");
print_set_info (&pair_set, &params);
print_set_info (pair_set, &params);
}
min_spc = max_spc = pair_set->set_specificity;
min_cov = max_cov = pair_set->set_coverage;
@ -1566,7 +1566,8 @@ int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t *
size_t primers_changeSortedArray (ppair_t ** pairs,
size_t sorted_count, poptions_t options)
{
int32_t i, j, k, l, total_links;
int32_t i, k, total_links;
u_int32_t j;
int *owi;
int *iwi;
int allowedtaxa;
@ -1579,14 +1580,14 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
idx_set = ECOMALLOC(slots*sizeof (int32_t),
"Could not allocate memory for index set.");
for (i=0; i<sorted_count; i++)
for (i=0; ((u_int32_t)i)<sorted_count; i++)
{
owi = sortedpairs[i]->wellIdentifiedSeqs;
passed = FALSE;
for (j=0; j<sorted_count; j++)
{
if (i == j) continue;
if ((u_int32_t)i == j) continue;
iwi = sortedpairs[j]->wellIdentifiedSeqs;
total_links = 0;
@ -1610,8 +1611,9 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
if (options->max_links_percent > 0)
{
allowedtaxa = options->max_links_percent;
if (total_links > allowedtaxa)
if (total_links > allowedtaxa){
passed = TRUE;
}
break;
}
else
@ -1630,7 +1632,7 @@ size_t primers_changeSortedArray (ppair_t ** pairs,
for (j=0; j<sorted_count; j++)
{
for (k=0; k<index; k++)
if (j == idx_set[k]) break;
if (j == (u_int32_t)(idx_set[k])) break;
//need to remove this element
if (k == index)
{
@ -1711,9 +1713,9 @@ int32_t *addinset_withLinks (int32_t *set, int32_t i, int32_t* slots, int32_t *i
size_t primers_filterWithGivenLinks (ppair_t ** pairs,
size_t sorted_count, poptions_t options)
{
int32_t i, j, k;
int32_t i, k;
u_int32_t j;
ppair_t *sortedpairs = *pairs;
bool_t passed;
int32_t *idx_set = NULL;
int32_t slots=50, index=0;
@ -1732,7 +1734,7 @@ size_t primers_filterWithGivenLinks (ppair_t ** pairs,
for (j=0; j<sorted_count; j++)
{
for (k=0; k<index; k++)
if (j == idx_set[k]) break;
if (j == (u_int32_t) (idx_set[k])) break;
//need to remove this element
if (k == index)
{
@ -1755,7 +1757,7 @@ size_t primers_filterWithGivenLinks (ppair_t ** pairs,
else i=sorted_count;
for (j=0; j<i; j++)
for (j=0; j<(u_int32_t)i; j++)
for (k=0; k<options->dbsize; k++)
if ((*pairs)[j]->coveredSeqs[k] == 1)
cov[k] = 1;

View File

@ -213,7 +213,6 @@ pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsi
char *base;
int8_t code;
uint32_t goodPrimers=0;
static int iii=0;
//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);

View File

@ -36,7 +36,7 @@ int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos)
{
int32_t pos;
u_int32_t pos;
uint32_t smask, r;
uint8_t *data;
int32_t end;
@ -84,7 +84,7 @@ int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
StackiPtr stkpos)
{
int e, found;
int32_t pos;
u_int32_t pos;
uint32_t smask, cmask, sindx;
uint32_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data;

View File

@ -201,7 +201,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum;
goodPrimers+=data[w].good? 1:0;
fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r",
fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length),
data[w].inexample,data[w].outexample);
@ -224,8 +224,8 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
ECOFREE(data[w].reversePos,"Free direct count table");
}
fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size);
fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",w,words->size);
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
primers->primers=ECOREALLOC(data,

View File

@ -8,14 +8,6 @@
#ifndef HASHENCODER_H_
#define HASHENCODER_H_
static int8_t encoder[] = {0, // A
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
extern int8_t encoder[];
#endif /* HASHENCODER_H_ */

View File

@ -12,6 +12,16 @@ static int cmpword(const void *x,const void *y);
#include "hashencoder.h"
int8_t encoder[] = {0, // A
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
{
uint32_t wordcount;
@ -257,6 +267,6 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word)
char ecoComplementChar(char base)
{
return (base < 4)? !base & 3: 4;
return (base < 4)? ~base & 3: 4;
}

View File

@ -205,8 +205,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
}
else
{
uint32_t s;
s = strictprimers->size;
// uint32_t s;
// s = strictprimers->size;
// DEBUG_LOG("stack size : %u",s);
addSeqToWordCountTable(strictprimers,options->primer_length,
options->circular,
@ -223,7 +223,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
logfile = fopen(logfilename,"a");
seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) +
timeval_subtract(&(usage.ru_stime),&(start.ru_stime));
fprintf(logfile,"%d\t%llu\t%lu\t%8.3f\t%8.3e\n",i,
fprintf(logfile,"%d\t%llu\t%llu\t%8.3f\t%8.3e\n",i,
(long long unsigned)totallength,
strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)),
seconde,seconde/(double)totallength);
@ -248,9 +248,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
sizeof(word_t)*strictprimers->size,
"Cannot reallocate strict primer table");
if (neededWords)
if (neededWords){
ECOFREE(neededWords,"Clean needed word table");
}
//TR: Somehow for some primers strictcount value is extremely large hence invalid
//we need to remove these primers from the list
j = strictprimers->size+1;

View File

@ -231,8 +231,8 @@ static int cmpamp(const void *ampf1, const void* ampf2)
char cd1;
char cd2;
int len = 0;
char *ch1;
char *ch2;
const char *ch1;
const char *ch2;
int incr1;
int incr2;

View File

@ -17,6 +17,15 @@
double forbidden_entropy;
char bpencoder[] = { 1, // A
0, // b
2, // C
0,0,0, // d, e, f
3, // G
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
4,0, // T,U
0,0,0,0,0}; // v,w,x,y,z
double nparam_GetInitialEntropy(PNNParams nparm)
{
@ -470,7 +479,7 @@ int nparam_CountGCContent(char * seq ) {
return count;
}
void nparam_CleanSeq (char* inseq, char* outseq, int len)
void nparam_CleanSeq (const char* inseq, char* outseq, int len)
{
int seqlen = strlen (inseq);
int i, j;
@ -508,7 +517,7 @@ void nparam_CleanSeq (char* inseq, char* outseq, int len)
}
//Calculate TM for given sequence against its complement
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len)
{
double thedH = 0;
//double thedS = nparam_GetInitialEntropy(nparm);
@ -520,7 +529,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
char c4;
unsigned int i;
char nseq[50];
char *useq = seq;
const char *useq = seq;
nparam_CleanSeq (seq, nseq, len);
useq = nseq;
@ -533,7 +542,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
c4 = GETNUMCODE(useq[i]);
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedH += nparm->dH[(u_int8_t) c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
}
//printf("------------------\n");
@ -543,7 +552,7 @@ double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
return mtemp;
}
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len)
double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len)
{
double thedH = 0;
//double thedS = nparam_GetInitialEntropy(nparm);
@ -575,7 +584,7 @@ double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len)
//fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2));
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedH += nparm->dH[(u_int8_t)c3][(u_int8_t)c4][(u_int8_t)c1][(u_int8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
}
//fprintf(stderr,"------------------\n");

View File

@ -16,8 +16,8 @@
//#include "../libecoprimer/ecoprimer.h"
// following defines to simplify coding...
#define ndH(a,b,c,d) nparm->dH[a][b][c][d]
#define ndS(a,b,c,d) nparm->dS[a][b][c][d]
#define ndH(a,b,c,d) nparm->dH[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d]
#define ndS(a,b,c,d) nparm->dS[(u_int8_t) a][(u_int8_t) b][(u_int8_t) c][(u_int8_t) d]
#define forbidden_enthalpy 1000000000000000000.0f
#define R 1.987f
#define SALT_METHOD_SANTALUCIA 1
@ -33,14 +33,7 @@
extern double forbidden_entropy;
static char bpencoder[] = { 1, // A
0, // b
2, // C
0,0,0, // d, e, f
3, // G
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
4,0, // T,U
0,0,0,0,0}; // v,w,x,y,z
extern char bpencoder[]; // v,w,x,y,z
typedef struct CNNParams_st
@ -62,8 +55,8 @@ int nparam_CountGCContent(char * seq );
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1);
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1);
double nparam_CalcTM(double entropy,double enthalpy);
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len);
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len);
double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len);
double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len);
double nparam_GetInitialEntropy(PNNParams nparm) ;
double calculateMeltingTemperatureBasic (char * seq);

View File

@ -28,17 +28,15 @@ word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand)
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options)
{
size_t i, j,k,l;
size_t i, j;
uint32_t bp1,bp2;
uint32_t ep1,ep2;
word_t w1;
word_t w2;
bool_t strand;
char *sq,*sq1,*sq2,*c;
char *sq;
char prmrd[50];
char prmrr[50];
char sqsite[50];
double mtemp;
for (i = 0; i < count; i++)

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import re
import gzip
@ -6,11 +6,8 @@ import struct
import sys
import time
import getopt
from functools import cmp_to_key
try:
import psycopg2
_dbenable=True
except ImportError:
_dbenable=False
#####
@ -80,15 +77,17 @@ class ColumnFile(object):
def __init__(self,stream,sep=None,strip=True,types=None):
if isinstance(stream,str):
self._stream = open(stream)
elif hasattr(stream,'next'):
self._stream = stream
else:
raise ValueError,'stream must be string or an iterator'
try:
iter(stream)
self._stream = stream
except TypeError:
raise ValueError('stream must be string or an iterator')
self._delimiter=sep
self._strip=strip
if types:
self._types=[x for x in types]
for i in xrange(len(self._types)):
for i in range(len(self._types)):
if self._types[i] is bool:
self._types[i]=ColumnFile.str2bool
else:
@ -103,14 +102,14 @@ class ColumnFile(object):
def __iter__(self):
return self
def next(self):
ligne = self._stream.next()
def __next__(self):
ligne = next(self._stream)
data = ligne.split(self._delimiter)
if self._strip or self._types:
data = [x.strip() for x in data]
if self._types:
it = endLessIterator(self._types)
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
return data
def taxonCmp(t1,t2):
@ -125,14 +124,14 @@ def bsearchTaxon(taxonomy,taxid):
begin = 0
end = taxCount
oldcheck=taxCount
check = begin + end / 2
check = int(begin + end / 2)
while check != oldcheck and taxonomy[check][0]!=taxid :
if taxonomy[check][0] < taxid:
begin=check
else:
end=check
oldcheck=check
check = (begin + end) / 2
check = int((begin + end) / 2)
if taxonomy[check][0]==taxid:
@ -152,22 +151,22 @@ def readNodeTable(file):
str,str,bool,
int,bool,int,
bool,bool,bool,str))
print >>sys.stderr,"Reading taxonomy dump file..."
print("Reading taxonomy dump file...", file=sys.stderr)
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
print >>sys.stderr,"List all taxonomy rank..."
print("List all taxonomy rank...", file=sys.stderr)
ranks =list(set(x[1] for x in taxonomy))
ranks.sort()
ranks = dict(map(None,ranks,xrange(len(ranks))))
ranks = {rank: index for index, rank in enumerate(ranks)}
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print("Sorting taxons...", file=sys.stderr)
taxonomy.sort(key=lambda x: x[0])
print >>sys.stderr,"Indexing taxonomy..."
print("Indexing taxonomy...", file=sys.stderr)
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
print("Indexing parent and rank...", file=sys.stderr)
for t in taxonomy:
t[1]=ranks[t[1]]
t[2]=index[t[2]]
@ -203,7 +202,7 @@ def deletedNodeIterator(file):
def readTaxonomyDump(taxdir):
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
print >>sys.stderr,"Adding scientific name..."
print("Adding scientific name...", file=sys.stderr)
alternativeName=[]
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
@ -211,66 +210,16 @@ def readTaxonomyDump(taxdir):
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
print >>sys.stderr,"Adding taxid alias..."
print("Adding taxid alias...", file=sys.stderr)
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
index[taxid]=index[current]
print >>sys.stderr,"Adding deleted taxid..."
print("Adding deleted taxid...", file=sys.stderr)
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
index[taxid]=None
return taxonomy,ranks,alternativeName,index
def readTaxonomyDB(dbname):
connection = psycopg2.connect(database=dbname)
cursor = connection.cursor()
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
taxonomy=[list(x) for x in cursor]
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
ranks=cursor.fetchall()
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
for t in taxonomy:
t[1]=ranks[t[1]]
try:
t[2]=index[t[2]]
except KeyError,e:
if t[2] is None and t[0]==1:
t[2]=index[t[0]]
else:
raise e
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
alternativeName=[]
for taxid,name,classname in cursor:
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in cursor:
if current is not None:
index[taxid]=index[current]
else:
index[taxid]=None
return taxonomy,ranks,alternativeName,index
#####
#
#
@ -282,22 +231,27 @@ def readTaxonomyDB(dbname):
def entryIterator(file):
file = universalOpen(file)
rep =[]
for ligne in file:
ligne = file.readline()
while ligne:
rep.append(ligne)
if ligne == '//\n':
rep = ''.join(rep)
yield rep
rep = []
ligne = file.readline()
def fastaEntryIterator(file):
file = universalOpen(file)
rep =[]
for ligne in file:
ligne = file.readline()
while ligne:
if ligne[0] == '>' and rep:
rep = ''.join(rep)
yield rep
rep = []
rep.append(ligne)
ligne = file.readline()
if rep:
rep = ''.join(rep)
yield rep
@ -418,7 +372,7 @@ def taxonomyInfo(entry,connection):
def ecoSeqPacker(sq):
compactseq = gzip.zlib.compress(sq['sequence'],9)
compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
cptseqlength = len(compactseq)
delength = len(sq['definition'])
@ -427,11 +381,11 @@ def ecoSeqPacker(sq):
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
totalSize,
sq['taxid'],
sq['id'],
bytes(sq['id'],"ascii"),
delength,
len(sq['sequence']),
cptseqlength,
sq['definition'],
bytes(sq['definition'],"ascii"),
compactseq)
assert len(packed) == totalSize+4, "error in sequence packing"
@ -450,7 +404,7 @@ def ecoTaxPacker(tx):
tx[1],
tx[2],
namelength,
tx[3])
bytes(tx[3],"ascii"))
return packed
@ -460,7 +414,7 @@ def ecoRankPacker(rank):
packed = struct.pack('> I %ds' % namelength,
namelength,
rank)
bytes(rank, 'ascii'))
return packed
@ -476,8 +430,8 @@ def ecoNamePacker(name):
namelength,
classlength,
name[2],
name[0],
name[1])
bytes(name[0], 'ascii'),
bytes(name[1], 'ascii'))
return packed
@ -505,11 +459,11 @@ def ecoSeqWriter(file,input,taxindex,parser):
skipped.append(entry['id'])
where = universalTell(input)
progressBar(where, inputsize)
print >>sys.stderr," Readed sequences : %d " % seqcount,
print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr)
else:
skipped.append(entry['id'])
print >>sys.stderr
print(file=sys.stderr)
output.seek(0,0)
output.write(struct.pack('> I',seqcount))
@ -530,7 +484,7 @@ def ecoRankWriter(file,ranks):
output = open(file,'wb')
output.write(struct.pack('> I',len(ranks)))
rankNames = ranks.keys()
rankNames = list(ranks.keys())
rankNames.sort()
for rank in rankNames:
@ -552,7 +506,7 @@ def ecoNameWriter(file,names):
output = open(file,'wb')
output.write(struct.pack('> I',len(names)))
names.sort(nameCmp)
names.sort(key=lambda x:x[0].upper())
for name in names:
output.write(ecoNamePacker(name))
@ -573,8 +527,8 @@ def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
taxonomy[3],
parser)
if sk:
print >>sys.stderr,"Skipped entry :"
print >>sys.stderr,sk
print("Skipped entry :", file=sys.stderr)
print(sk, file=sys.stderr)
def ecoParseOptions(arguments):
opt = {
@ -618,34 +572,30 @@ def ecoParseOptions(arguments):
opt['parser']=sequenceIteratorFactory(emblEntryParser,
entryIterator)
else:
raise ValueError,'Unknown option %s' % name
raise ValueError('Unknown option %s' % name)
return opt,filenames
def printHelp():
print "-----------------------------------"
print " ecoPCRFormat.py"
print "-----------------------------------"
print "ecoPCRFormat.py [option] <argument>"
print "-----------------------------------"
print "-e --embl :[E]mbl format"
print "-f --fasta :[F]asta format"
print "-g --genbank :[G]enbank format"
print "-h --help :[H]elp - print this help"
print "-n --name :[N]ame of the new database created"
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
print " :bcp-like dump from GenBank taxonomy database."
print "-----------------------------------"
print("-----------------------------------")
print(" ecoPCRFormat.py")
print("-----------------------------------")
print("ecoPCRFormat.py [option] <argument>")
print("-----------------------------------")
print("-e --embl :[E]mbl format")
print("-f --fasta :[F]asta format")
print("-g --genbank :[G]enbank format")
print("-h --help :[H]elp - print this help")
print("-n --name :[N]ame of the new database created")
print("-t --taxonomy :[T]axonomy - path to the taxonomy database")
print(" :bcp-like dump from GenBank taxonomy database.")
print("-----------------------------------")
if __name__ == '__main__':
opt,filenames = ecoParseOptions(sys.argv[1:])
if opt['taxmod']=='dump':
taxonomy = readTaxonomyDump(opt['taxdir'])
elif opt['taxmod']=='db':
taxonomy = readTaxonomyDB(opt['taxdb'])
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])