mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Optimize memory allocation of the apat algorithms
Former-commit-id: 5010c5a666b322715b3b81c1078d325e1f647ede
This commit is contained in:
@ -106,7 +106,6 @@ typedef struct { /* sequence */
|
|||||||
int32_t datsiz; /* data buffer size */
|
int32_t datsiz; /* data buffer size */
|
||||||
int32_t circular;
|
int32_t circular;
|
||||||
uint8_t *data; /* data buffer */
|
uint8_t *data; /* data buffer */
|
||||||
char *cseq; /* sequence buffer */
|
|
||||||
StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */
|
StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */
|
||||||
StackiPtr hiterr[MAX_PATTERN]; /* stack of errors */
|
StackiPtr hiterr[MAX_PATTERN]; /* stack of errors */
|
||||||
} Seq, *SeqPtr;
|
} Seq, *SeqPtr;
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
#include "obiapat.h"
|
#include "obiapat.h"
|
||||||
|
|
||||||
static void EncodeSequence(SeqPtr seq);
|
static void EncodeSequence(SeqPtr seq, const char *in);
|
||||||
static void UpperSequence(char *seq);
|
static void UpperSequence(char *seq);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -142,64 +142,6 @@ char *ecoComplementPattern(char *nucAcSeq)
|
|||||||
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
|
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
|
||||||
}
|
}
|
||||||
|
|
||||||
char *ecoComplementSequence(char *nucAcSeq)
|
|
||||||
{
|
|
||||||
return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end,
|
|
||||||
int *errno, char **errmsg)
|
|
||||||
/*
|
|
||||||
extract subsequence from nucAcSeq [begin,end[
|
|
||||||
*/
|
|
||||||
{
|
|
||||||
static char *buffer = NULL;
|
|
||||||
static int32_t buffSize= 0;
|
|
||||||
int32_t length;
|
|
||||||
|
|
||||||
if (begin < end)
|
|
||||||
{
|
|
||||||
length = end - begin;
|
|
||||||
|
|
||||||
if (length >= buffSize)
|
|
||||||
{
|
|
||||||
buffSize = length+1;
|
|
||||||
if (buffer)
|
|
||||||
buffer=ECOREALLOC(buffer,buffSize,
|
|
||||||
"Error in reallocating sub sequence buffer",errno,errmsg);
|
|
||||||
else
|
|
||||||
buffer=ECOMALLOC(buffSize,
|
|
||||||
"Error in allocating sub sequence buffer",errno,errmsg);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
strncpy(buffer,nucAcSeq + begin,length);
|
|
||||||
buffer[length]=0;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
length = end + strlen(nucAcSeq) - begin;
|
|
||||||
|
|
||||||
if (length >= buffSize)
|
|
||||||
{
|
|
||||||
buffSize = length+1;
|
|
||||||
if (buffer)
|
|
||||||
buffer=ECOREALLOC(buffer,buffSize,
|
|
||||||
"Error in reallocating sub sequence buffer",errno,errmsg);
|
|
||||||
else
|
|
||||||
buffer=ECOMALLOC(buffSize,
|
|
||||||
"Error in allocating sub sequence buffer",errno,errmsg);
|
|
||||||
|
|
||||||
}
|
|
||||||
strncpy(buffer,nucAcSeq+begin,length - end);
|
|
||||||
strncpy(buffer+(length-end),nucAcSeq ,end);
|
|
||||||
buffer[length]=0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* -------------------------------------------- */
|
/* -------------------------------------------- */
|
||||||
/* uppercase sequence */
|
/* uppercase sequence */
|
||||||
@ -229,29 +171,27 @@ void UpperSequence(char *seq)
|
|||||||
/* -------------------------------------------- */
|
/* -------------------------------------------- */
|
||||||
|
|
||||||
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
|
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
|
||||||
|
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void EncodeSequence(SeqPtr seq)
|
void EncodeSequence(SeqPtr seq, const char *in)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
uint8_t *data;
|
uint8_t *data;
|
||||||
char *cseq;
|
const char *cseq;
|
||||||
char nuc;
|
char nuc;
|
||||||
|
|
||||||
data = seq->data;
|
data = seq->data;
|
||||||
cseq = seq->cseq;
|
|
||||||
|
|
||||||
while (*cseq) {
|
for (i=0,cseq=in; i < seq->seqlen; i++,cseq++,data++) {
|
||||||
nuc = *cseq & (~32);
|
nuc = *cseq;
|
||||||
*data = (IS_UPPER(nuc) ? nuc - 'A' : 0x0);
|
*data = (IS_LOWER(nuc) ? nuc - 'a' : 0x0);
|
||||||
data++;
|
|
||||||
cseq++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++) {
|
for (i=0,cseq=in; i < seq->circular; i++,cseq++,data++) {
|
||||||
nuc = *cseq & (~32);
|
nuc = *cseq;
|
||||||
*data = (IS_UPPER(nuc) ? nuc - 'A' : 0x0);
|
*data = (IS_LOWER(nuc) ? nuc - 'a' : 0x0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0 ; i < MAX_PATTERN ; i++)
|
for (i = 0 ; i < MAX_PATTERN ; i++)
|
||||||
@ -266,6 +206,7 @@ SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
|
|||||||
SeqPtr out,
|
SeqPtr out,
|
||||||
int *errno, char **errmsg)
|
int *errno, char **errmsg)
|
||||||
{
|
{
|
||||||
|
// fprintf(stderr,">>>>>>>> new_apatseq\n");
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (circular != 0) circular=MAX_PAT_LEN;
|
if (circular != 0) circular=MAX_PAT_LEN;
|
||||||
@ -287,28 +228,26 @@ SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
out->seqsiz = out->seqlen = seqlen;
|
|
||||||
out->circular = circular;
|
|
||||||
|
|
||||||
if (!out->data)
|
if (!out->data)
|
||||||
{
|
{
|
||||||
out->data = ECOMALLOC((out->seqlen+circular) *sizeof(uint8_t),
|
out->data = ECOMALLOC((seqlen+circular) *sizeof(uint8_t),
|
||||||
"Error in Allocation of a new Seq data member",
|
"Error in Allocation of a new Seq data member",
|
||||||
errno,errmsg);
|
errno,errmsg);
|
||||||
out->datsiz= out->seqlen+circular;
|
out->datsiz= seqlen+circular;
|
||||||
}
|
}
|
||||||
else if ((out->seqlen +circular) >= out->datsiz)
|
else if ((seqlen +circular) >= out->datsiz)
|
||||||
{
|
{
|
||||||
out->data = ECOREALLOC(out->data,(out->seqlen+circular) *sizeof(uint8_t),
|
out->data = ECOREALLOC(out->data,(seqlen+circular) *sizeof(uint8_t),
|
||||||
"Error during Seq data buffer realloc",
|
"Error during Seq data buffer realloc",
|
||||||
errno,errmsg);
|
errno,errmsg);
|
||||||
out->datsiz= out->seqlen+circular;
|
out->datsiz= seqlen+circular;
|
||||||
}
|
}
|
||||||
|
|
||||||
out->cseq = (char *)in;
|
out->circular = circular;
|
||||||
|
out->seqlen = seqlen;
|
||||||
EncodeSequence(out);
|
EncodeSequence(out,in);
|
||||||
|
// fprintf(stderr,">>>>>>>> Encodage ok\n");
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,6 +27,7 @@ var _AllocatedApaPattern = 0
|
|||||||
// Apat algorithm functions and methods
|
// Apat algorithm functions and methods
|
||||||
type _ApatPattern struct {
|
type _ApatPattern struct {
|
||||||
pointer *C.Pattern
|
pointer *C.Pattern
|
||||||
|
pattern string
|
||||||
}
|
}
|
||||||
|
|
||||||
type ApatPattern struct {
|
type ApatPattern struct {
|
||||||
@ -37,6 +38,7 @@ type ApatPattern struct {
|
|||||||
// Apat algorithm functions and methods
|
// Apat algorithm functions and methods
|
||||||
type _ApatSequence struct {
|
type _ApatSequence struct {
|
||||||
pointer *C.Seq
|
pointer *C.Seq
|
||||||
|
reference *obiseq.BioSequence
|
||||||
}
|
}
|
||||||
|
|
||||||
type ApatSequence struct {
|
type ApatSequence struct {
|
||||||
@ -88,7 +90,8 @@ func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPatter
|
|||||||
return NilApatPattern, errors.New(message)
|
return NilApatPattern, errors.New(message)
|
||||||
}
|
}
|
||||||
|
|
||||||
ap := _ApatPattern{apc}
|
|
||||||
|
ap := _ApatPattern{apc,pattern}
|
||||||
|
|
||||||
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
|
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
|
||||||
// log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat))
|
// log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat))
|
||||||
@ -111,8 +114,8 @@ func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
|
|||||||
C.free(unsafe.Pointer(errmsg))
|
C.free(unsafe.Pointer(errmsg))
|
||||||
return ApatPattern{nil}, errors.New(message)
|
return ApatPattern{nil}, errors.New(message)
|
||||||
}
|
}
|
||||||
|
spat := C.GoString(apc.cpat)
|
||||||
ap := _ApatPattern{apc}
|
ap := _ApatPattern{apc,spat}
|
||||||
|
|
||||||
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
|
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
|
||||||
// log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat))
|
// log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat))
|
||||||
@ -124,7 +127,8 @@ func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
|
|||||||
|
|
||||||
// String method casts the ApatPattern to a Go String.
|
// String method casts the ApatPattern to a Go String.
|
||||||
func (pattern ApatPattern) String() string {
|
func (pattern ApatPattern) String() string {
|
||||||
return C.GoString(pattern.pointer.pointer.cpat)
|
return pattern.pointer.pattern
|
||||||
|
//return C.GoString(pattern.pointer.pointer.cpat)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Len method returns the length of the matched pattern.
|
// Len method returns the length of the matched pattern.
|
||||||
@ -166,7 +170,6 @@ func (pattern ApatPattern) Print() {
|
|||||||
func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...ApatSequence) (ApatSequence, error) {
|
func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...ApatSequence) (ApatSequence, error) {
|
||||||
var errno C.int32_t
|
var errno C.int32_t
|
||||||
var errmsg *C.char
|
var errmsg *C.char
|
||||||
var p unsafe.Pointer
|
|
||||||
seqlen := sequence.Len()
|
seqlen := sequence.Len()
|
||||||
|
|
||||||
ic := 0
|
ic := 0
|
||||||
@ -178,32 +181,13 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap
|
|||||||
|
|
||||||
if len(recycle) > 0 {
|
if len(recycle) > 0 {
|
||||||
out = recycle[0].pointer.pointer
|
out = recycle[0].pointer.pointer
|
||||||
if (int(out.seqlen) < seqlen || int(out.seqlen) > 5*seqlen) && out.cseq != nil {
|
|
||||||
C.free(unsafe.Pointer(out.cseq))
|
|
||||||
out.cseq = nil
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
out = nil
|
out = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if out == nil || out.cseq == nil {
|
|
||||||
|
|
||||||
p = C.malloc(C.size_t(seqlen) + 1)
|
|
||||||
// if p != nil {
|
|
||||||
// // atomic.AddInt64(&_AllocatedApaSequences, 1)
|
|
||||||
// }
|
|
||||||
} else {
|
|
||||||
p = unsafe.Pointer(out.cseq)
|
|
||||||
}
|
|
||||||
|
|
||||||
if p == nil {
|
|
||||||
log.Panicln("Cannot allocate memory chunk for Cseq Apat sequecence")
|
|
||||||
}
|
|
||||||
|
|
||||||
// copy the data into the buffer, by converting it to a Go array
|
// copy the data into the buffer, by converting it to a Go array
|
||||||
cBuf := (*[1 << 31]byte)(p)
|
p := unsafe.Pointer(unsafe.SliceData(sequence.Sequence()))
|
||||||
copy(cBuf[:], sequence.Sequence())
|
|
||||||
cBuf[sequence.Len()] = 0
|
|
||||||
|
|
||||||
pseqc := C.new_apatseq((*C.char)(p), C.int32_t(ic), C.int32_t(seqlen),
|
pseqc := C.new_apatseq((*C.char)(p), C.int32_t(ic), C.int32_t(seqlen),
|
||||||
(*C.Seq)(out),
|
(*C.Seq)(out),
|
||||||
@ -221,19 +205,14 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap
|
|||||||
|
|
||||||
if out == nil {
|
if out == nil {
|
||||||
// log.Printf("Make ApatSeq called on %p -> %p\n", out, pseqc)
|
// log.Printf("Make ApatSeq called on %p -> %p\n", out, pseqc)
|
||||||
seq := _ApatSequence{pointer: pseqc}
|
seq := _ApatSequence{pointer: pseqc,reference: sequence}
|
||||||
|
|
||||||
runtime.SetFinalizer(&seq, func(apat_p *_ApatSequence) {
|
runtime.SetFinalizer(&seq, func(apat_p *_ApatSequence) {
|
||||||
var errno C.int32_t
|
var errno C.int32_t
|
||||||
var errmsg *C.char
|
var errmsg *C.char
|
||||||
// log.Printf("Finaliser called on %p\n", apat_p.pointer)
|
log.Debugf("Finaliser called on %p\n", apat_p.pointer)
|
||||||
|
|
||||||
if apat_p != nil && apat_p.pointer != nil {
|
if apat_p != nil && apat_p.pointer != nil {
|
||||||
if apat_p.pointer.cseq != nil {
|
|
||||||
C.free(unsafe.Pointer(apat_p.pointer.cseq))
|
|
||||||
apat_p.pointer.cseq = nil
|
|
||||||
// atomic.AddInt64(&_AllocatedApaSequences, -1)
|
|
||||||
}
|
|
||||||
C.delete_apatseq(apat_p.pointer, &errno, &errmsg)
|
C.delete_apatseq(apat_p.pointer, &errno, &errmsg)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -242,6 +221,7 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap
|
|||||||
}
|
}
|
||||||
|
|
||||||
recycle[0].pointer.pointer = pseqc
|
recycle[0].pointer.pointer = pseqc
|
||||||
|
recycle[0].pointer.reference = sequence
|
||||||
|
|
||||||
//log.Println(C.GoString(pseq.cseq))
|
//log.Println(C.GoString(pseq.cseq))
|
||||||
|
|
||||||
@ -259,16 +239,9 @@ func (sequence ApatSequence) Free() {
|
|||||||
var errno C.int32_t
|
var errno C.int32_t
|
||||||
var errmsg *C.char
|
var errmsg *C.char
|
||||||
|
|
||||||
// log.Printf("Free called on %p\n", sequence.pointer.pointer)
|
log.Debugf("Free called on %p\n", sequence.pointer.pointer)
|
||||||
|
|
||||||
if sequence.pointer != nil && sequence.pointer.pointer != nil {
|
if sequence.pointer != nil && sequence.pointer.pointer != nil {
|
||||||
|
|
||||||
if sequence.pointer.pointer.cseq != nil {
|
|
||||||
C.free(unsafe.Pointer(sequence.pointer.pointer.cseq))
|
|
||||||
sequence.pointer.pointer.cseq = nil
|
|
||||||
// atomic.AddInt64(&_AllocatedApaSequences, -1)
|
|
||||||
}
|
|
||||||
|
|
||||||
C.delete_apatseq(sequence.pointer.pointer,
|
C.delete_apatseq(sequence.pointer.pointer,
|
||||||
&errno, &errmsg)
|
&errno, &errmsg)
|
||||||
|
|
||||||
@ -315,11 +288,11 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int
|
|||||||
for i := 0; i < nhits; i++ {
|
for i := 0; i < nhits; i++ {
|
||||||
start := int(stktmp[i])
|
start := int(stktmp[i])
|
||||||
err := int(errtmp[i])
|
err := int(errtmp[i])
|
||||||
log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err)
|
//log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err)
|
||||||
loc = append(loc, [3]int{start, start + patlen, err})
|
loc = append(loc, [3]int{start, start + patlen, err})
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debugln("------------")
|
//log.Debugln("------------")
|
||||||
return loc
|
return loc
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -359,16 +332,17 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
|
|||||||
end = obiutils.MinInt(end, sequence.Len())
|
end = obiutils.MinInt(end, sequence.Len())
|
||||||
|
|
||||||
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
|
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
|
||||||
cseq := (*[1 << 30]byte)(unsafe.Pointer(sequence.pointer.pointer.cseq))
|
frg := sequence.pointer.reference.Sequence()[start:end]
|
||||||
|
|
||||||
|
|
||||||
log.Debugln(
|
log.Debugln(
|
||||||
string((*cseq)[start:end]),
|
string(frg),
|
||||||
string((*cpattern)[0:int(pattern.pointer.pointer.patlen)]),
|
string((*cpattern)[0:int(pattern.pointer.pointer.patlen)]),
|
||||||
best[0], nerr, int(pattern.pointer.pointer.patlen),
|
best[0], nerr, int(pattern.pointer.pointer.patlen),
|
||||||
sequence.Len(), start, end)
|
sequence.Len(), start, end)
|
||||||
|
|
||||||
score, lali := obialign.FastLCSEGFScoreByte(
|
score, lali := obialign.FastLCSEGFScoreByte(
|
||||||
(*cseq)[start:end],
|
frg,
|
||||||
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
|
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
|
||||||
nerr, true, &buffer)
|
nerr, true, &buffer)
|
||||||
|
|
||||||
|
@ -255,8 +255,10 @@ func _Pcr(seq ApatSequence,
|
|||||||
(opt.MinLength() == 0 || length >= opt.MinLength()) &&
|
(opt.MinLength() == 0 || length >= opt.MinLength()) &&
|
||||||
(opt.MaxLength() == 0 || length <= opt.MaxLength()) {
|
(opt.MaxLength() == 0 || length <= opt.MaxLength()) {
|
||||||
amplicon, _ := sequence.Subsequence(fm[1], rm[0], opt.pointer.circular)
|
amplicon, _ := sequence.Subsequence(fm[1], rm[0], opt.pointer.circular)
|
||||||
|
log.Debugf("seq length : %d capacity : %d",amplicon.Len(),cap(amplicon.Sequence()))
|
||||||
annot := amplicon.Annotations()
|
annot := amplicon.Annotations()
|
||||||
obiutils.MustFillMap(annot, sequence.Annotations())
|
obiutils.MustFillMap(annot, sequence.Annotations())
|
||||||
|
|
||||||
annot["forward_primer"] = forward.String()
|
annot["forward_primer"] = forward.String()
|
||||||
|
|
||||||
match, _ := sequence.Subsequence(fm[0], fm[1], opt.pointer.circular)
|
match, _ := sequence.Subsequence(fm[0], fm[1], opt.pointer.circular)
|
||||||
@ -392,6 +394,7 @@ func _PCRSlice(sequences obiseq.BioSequenceSlice,
|
|||||||
results = append(results, amplicons...)
|
results = append(results, amplicons...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Debugf("Number of sequences in the slice : %d",len(sequences))
|
||||||
for _, sequence := range sequences[1:] {
|
for _, sequence := range sequences[1:] {
|
||||||
seq, _ = MakeApatSequence(sequence, options.Circular(), seq)
|
seq, _ = MakeApatSequence(sequence, options.Circular(), seq)
|
||||||
amplicons = _Pcr(seq, sequence, options)
|
amplicons = _Pcr(seq, sequence, options)
|
||||||
@ -400,7 +403,7 @@ func _PCRSlice(sequences obiseq.BioSequenceSlice,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// log.Println(AllocatedApaSequences())
|
//log.Debugln(AllocatedApaSequences())
|
||||||
|
|
||||||
// seq.Free()
|
// seq.Free()
|
||||||
}
|
}
|
||||||
@ -426,7 +429,9 @@ func PCRSliceWorker(options ...WithOption) obiseq.SeqSliceWorker {
|
|||||||
|
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
|
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
|
||||||
return _PCRSlice(sequences, opt)
|
result := _PCRSlice(sequences, opt)
|
||||||
|
sequences.Recycle(true)
|
||||||
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
return worker
|
return worker
|
||||||
|
@ -46,7 +46,7 @@ func ISequenceChunk(iterator obiiter.IBioSequence,
|
|||||||
for data.Next() {
|
for data.Next() {
|
||||||
b := data.Get()
|
b := data.Get()
|
||||||
*chunk = append(*chunk, b.Slice()...)
|
*chunk = append(*chunk, b.Slice()...)
|
||||||
b.Recycle()
|
b.Recycle(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
jobDone.Done()
|
jobDone.Done()
|
||||||
|
@ -106,7 +106,7 @@ func ISequenceSubChunk(iterator obiiter.IBioSequence,
|
|||||||
batch.Slice()[i] = nil
|
batch.Slice()[i] = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Recycle()
|
batch.Recycle(false)
|
||||||
|
|
||||||
_By(func(p1, p2 *sSS) bool {
|
_By(func(p1, p2 *sSS) bool {
|
||||||
return p1.code < p2.code
|
return p1.code < p2.code
|
||||||
|
@ -97,8 +97,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
|||||||
// No more sub classification of sequence or only a single sequence
|
// No more sub classification of sequence or only a single sequence
|
||||||
if opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1 {
|
if opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1 {
|
||||||
// We remove singleton from output
|
// We remove singleton from output
|
||||||
batch.Slice()[0].Recycle()
|
batch.Recycle(true)
|
||||||
batch.Recycle()
|
|
||||||
} else {
|
} else {
|
||||||
iUnique.Push(batch.Reorder(nextOrder()))
|
iUnique.Push(batch.Reorder(nextOrder()))
|
||||||
}
|
}
|
||||||
|
@ -69,7 +69,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
|||||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
|
|
||||||
iterator = iterator.Rebatch(10000)
|
iterator = iterator.Rebatch(1000)
|
||||||
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||||
|
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
@ -57,7 +57,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
|||||||
file io.WriteCloser,
|
file io.WriteCloser,
|
||||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
|
|
||||||
iterator = iterator.Rebatch(10000)
|
iterator = iterator.Rebatch(1000)
|
||||||
|
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
|
|
||||||
|
@ -29,15 +29,16 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func _ParseGenbankFile(source string,
|
func _ParseGenbankFile(source string,
|
||||||
input <-chan _FileChunk, out obiiter.IBioSequence) {
|
input <-chan _FileChunk, out obiiter.IBioSequence,
|
||||||
|
chunck_order func() int) {
|
||||||
|
|
||||||
state := inHeader
|
state := inHeader
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len())
|
// log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len())
|
||||||
scanner := bufio.NewScanner(chunks.raw)
|
scanner := bufio.NewScanner(chunks.raw)
|
||||||
order := chunks.order
|
|
||||||
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
||||||
|
sumlength:=0
|
||||||
id := ""
|
id := ""
|
||||||
scientificName := ""
|
scientificName := ""
|
||||||
defBytes := new(bytes.Buffer)
|
defBytes := new(bytes.Buffer)
|
||||||
@ -67,7 +68,7 @@ func _ParseGenbankFile(source string,
|
|||||||
case strings.HasPrefix(line, "ORIGIN"):
|
case strings.HasPrefix(line, "ORIGIN"):
|
||||||
state = inSequence
|
state = inSequence
|
||||||
case line == "//":
|
case line == "//":
|
||||||
log.Debugln("Total lines := ", nl)
|
// log.Debugln("Total lines := ", nl)
|
||||||
sequence := obiseq.NewBioSequence(id,
|
sequence := obiseq.NewBioSequence(id,
|
||||||
seqBytes.Bytes(),
|
seqBytes.Bytes(),
|
||||||
defBytes.String())
|
defBytes.String())
|
||||||
@ -80,10 +81,17 @@ func _ParseGenbankFile(source string,
|
|||||||
annot["scientific_name"] = scientificName
|
annot["scientific_name"] = scientificName
|
||||||
annot["taxid"] = taxid
|
annot["taxid"] = taxid
|
||||||
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
|
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
|
||||||
log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
|
// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
|
||||||
sequence.Len(), seqBytes.Len())
|
// sequence.Len(), seqBytes.Len())
|
||||||
|
|
||||||
sequences = append(sequences, sequence)
|
sequences = append(sequences, sequence)
|
||||||
|
sumlength+=sequence.Len()
|
||||||
|
|
||||||
|
if len(sequences) == 100 || sumlength > 1e7 {
|
||||||
|
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||||
|
sequences = make(obiseq.BioSequenceSlice, 0, 100)
|
||||||
|
sumlength = 0
|
||||||
|
}
|
||||||
defBytes = new(bytes.Buffer)
|
defBytes = new(bytes.Buffer)
|
||||||
featBytes = new(bytes.Buffer)
|
featBytes = new(bytes.Buffer)
|
||||||
seqBytes = new(bytes.Buffer)
|
seqBytes = new(bytes.Buffer)
|
||||||
@ -111,8 +119,10 @@ func _ParseGenbankFile(source string,
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(order, sequences))
|
if len(sequences) > 0 {
|
||||||
}
|
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
out.Done()
|
out.Done()
|
||||||
|
|
||||||
@ -125,6 +135,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|
||||||
nworkers := opt.ParallelWorkers()
|
nworkers := opt.ParallelWorkers()
|
||||||
|
chunck_order := obiutils.AtomicCounter()
|
||||||
newIter.Add(nworkers)
|
newIter.Add(nworkers)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
@ -133,7 +144,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
go _ParseGenbankFile(opt.Source(),entry_channel, newIter)
|
go _ParseGenbankFile(opt.Source(), entry_channel, newIter,chunck_order)
|
||||||
}
|
}
|
||||||
|
|
||||||
go _ReadFlatFileChunk(reader, entry_channel)
|
go _ReadFlatFileChunk(reader, entry_channel)
|
||||||
@ -152,7 +163,6 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
|||||||
|
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
|
|
||||||
reader, err = os.Open(filename)
|
reader, err = os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("open file error: %+v", err)
|
log.Printf("open file error: %+v", err)
|
||||||
|
@ -47,7 +47,7 @@ func (batch BioSequenceBatch) IsNil() bool {
|
|||||||
return batch.slice == nil
|
return batch.slice == nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (batch BioSequenceBatch) Recycle() {
|
func (batch BioSequenceBatch) Recycle(including_seq bool) {
|
||||||
batch.slice.Recycle()
|
batch.slice.Recycle(including_seq)
|
||||||
batch.slice = nil
|
batch.slice = nil
|
||||||
}
|
}
|
||||||
|
@ -435,7 +435,7 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
|
|||||||
buffer = obiseq.MakeBioSequenceSlice()
|
buffer = obiseq.MakeBioSequenceSlice()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seqs.Recycle()
|
seqs.Recycle(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(buffer) > 0 {
|
if len(buffer) > 0 {
|
||||||
@ -461,11 +461,8 @@ func (iterator IBioSequence) Recycle() {
|
|||||||
// iterator.Get()
|
// iterator.Get()
|
||||||
batch := iterator.Get()
|
batch := iterator.Get()
|
||||||
log.Debugln("Recycling batch #", batch.Order())
|
log.Debugln("Recycling batch #", batch.Order())
|
||||||
for _, seq := range batch.Slice() {
|
recycled+=batch.Len()
|
||||||
seq.Recycle()
|
batch.Recycle(true)
|
||||||
recycled++
|
|
||||||
}
|
|
||||||
batch.Recycle()
|
|
||||||
}
|
}
|
||||||
log.Debugf("End of the recycling of %d Bioseq objects", recycled)
|
log.Debugf("End of the recycling of %d Bioseq objects", recycled)
|
||||||
}
|
}
|
||||||
@ -473,7 +470,7 @@ func (iterator IBioSequence) Recycle() {
|
|||||||
func (iterator IBioSequence) Consume() {
|
func (iterator IBioSequence) Consume() {
|
||||||
for iterator.Next() {
|
for iterator.Next() {
|
||||||
batch := iterator.Get()
|
batch := iterator.Get()
|
||||||
batch.Recycle()
|
batch.Recycle(false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -490,12 +487,8 @@ func (iterator IBioSequence) Count(recycle bool) (int, int, int) {
|
|||||||
variants++
|
variants++
|
||||||
reads += seq.Count()
|
reads += seq.Count()
|
||||||
nucleotides += seq.Len()
|
nucleotides += seq.Len()
|
||||||
|
|
||||||
if recycle {
|
|
||||||
seq.Recycle()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
batch.Recycle()
|
batch.Recycle(recycle)
|
||||||
}
|
}
|
||||||
log.Debugf("End of the counting of %d Bioseq objects", variants)
|
log.Debugf("End of the counting of %d Bioseq objects", variants)
|
||||||
return variants, reads, nucleotides
|
return variants, reads, nucleotides
|
||||||
@ -547,7 +540,7 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
|
|||||||
falseSlice = obiseq.MakeBioSequenceSlice()
|
falseSlice = obiseq.MakeBioSequenceSlice()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seqs.Recycle()
|
seqs.Recycle(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(trueSlice) > 0 {
|
if len(trueSlice) > 0 {
|
||||||
@ -688,7 +681,7 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice {
|
|||||||
b := iterator.Get()
|
b := iterator.Get()
|
||||||
log.Debugf("append %d sequences",b.Len())
|
log.Debugf("append %d sequences",b.Len())
|
||||||
chunck = append(chunck, b.Slice()...)
|
chunck = append(chunck, b.Slice()...)
|
||||||
b.Recycle()
|
b.Recycle(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
return chunck
|
return chunck
|
||||||
|
@ -92,7 +92,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
|||||||
slices[key] = &s
|
slices[key] = &s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seqs.Recycle()
|
seqs.Recycle(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
for key, slice := range slices {
|
for key, slice := range slices {
|
||||||
|
@ -119,7 +119,7 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size
|
|||||||
for iterator.Next() {
|
for iterator.Next() {
|
||||||
batch := iterator.Get()
|
batch := iterator.Get()
|
||||||
batch.slice = worker(batch.slice)
|
batch.slice = worker(batch.slice)
|
||||||
newIter.pointer.channel <- batch
|
newIter.Push(batch)
|
||||||
}
|
}
|
||||||
newIter.Done()
|
newIter.Done()
|
||||||
}
|
}
|
||||||
|
@ -61,15 +61,20 @@ type BioSequence struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// MakeEmptyBioSequence() creates a new BioSequence object with no data
|
// MakeEmptyBioSequence() creates a new BioSequence object with no data
|
||||||
func MakeEmptyBioSequence() BioSequence {
|
func MakeEmptyBioSequence(preallocate int) BioSequence {
|
||||||
atomic.AddInt32(&_NewSeq, 1)
|
atomic.AddInt32(&_NewSeq, 1)
|
||||||
atomic.AddInt32(&_InMemSeq, 1)
|
atomic.AddInt32(&_InMemSeq, 1)
|
||||||
|
|
||||||
|
seq := []byte(nil)
|
||||||
|
if preallocate > 0 {
|
||||||
|
seq = GetSlice(preallocate)
|
||||||
|
}
|
||||||
|
|
||||||
return BioSequence{
|
return BioSequence{
|
||||||
id: "",
|
id: "",
|
||||||
definition: "",
|
definition: "",
|
||||||
source: "",
|
source: "",
|
||||||
sequence: nil,
|
sequence: seq,
|
||||||
qualities: nil,
|
qualities: nil,
|
||||||
feature: nil,
|
feature: nil,
|
||||||
paired: nil,
|
paired: nil,
|
||||||
@ -78,8 +83,8 @@ func MakeEmptyBioSequence() BioSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// `NewEmptyBioSequence()` returns a pointer to a new empty BioSequence
|
// `NewEmptyBioSequence()` returns a pointer to a new empty BioSequence
|
||||||
func NewEmptyBioSequence() *BioSequence {
|
func NewEmptyBioSequence(preallocate int) *BioSequence {
|
||||||
s := MakeEmptyBioSequence()
|
s := MakeEmptyBioSequence(preallocate)
|
||||||
return &s
|
return &s
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -87,7 +92,7 @@ func NewEmptyBioSequence() *BioSequence {
|
|||||||
func MakeBioSequence(id string,
|
func MakeBioSequence(id string,
|
||||||
sequence []byte,
|
sequence []byte,
|
||||||
definition string) BioSequence {
|
definition string) BioSequence {
|
||||||
bs := MakeEmptyBioSequence()
|
bs := MakeEmptyBioSequence(0)
|
||||||
bs.SetId(id)
|
bs.SetId(id)
|
||||||
bs.SetSequence(sequence)
|
bs.SetSequence(sequence)
|
||||||
bs.SetDefinition(definition)
|
bs.SetDefinition(definition)
|
||||||
@ -127,7 +132,7 @@ func (sequence *BioSequence) Recycle() {
|
|||||||
|
|
||||||
// Copying the BioSequence.
|
// Copying the BioSequence.
|
||||||
func (s *BioSequence) Copy() *BioSequence {
|
func (s *BioSequence) Copy() *BioSequence {
|
||||||
newSeq := MakeEmptyBioSequence()
|
newSeq := MakeEmptyBioSequence(0)
|
||||||
|
|
||||||
newSeq.id = s.id
|
newSeq.id = s.id
|
||||||
newSeq.definition = s.definition
|
newSeq.definition = s.definition
|
||||||
|
@ -34,14 +34,22 @@ func MakeBioSequenceSlice(size ...int) BioSequenceSlice {
|
|||||||
return *NewBioSequenceSlice(size...)
|
return *NewBioSequenceSlice(size...)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *BioSequenceSlice) Recycle() {
|
func (s *BioSequenceSlice) Recycle(including_seq bool) {
|
||||||
if s == nil {
|
if s == nil {
|
||||||
log.Panicln("Trying too recycle a nil pointer")
|
log.Panicln("Trying too recycle a nil pointer")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Code added to potentially limit memory leaks
|
// Code added to potentially limit memory leaks
|
||||||
for i := range *s {
|
if including_seq {
|
||||||
(*s)[i] = nil
|
for i := range *s {
|
||||||
|
(*s)[i] .Recycle()
|
||||||
|
(*s)[i] = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
for i := range *s {
|
||||||
|
(*s)[i] = nil
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*s = (*s)[:0]
|
*s = (*s)[:0]
|
||||||
|
@ -213,7 +213,7 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequenc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sequences.Recycle()
|
sequences.Recycle(false)
|
||||||
return seq
|
return seq
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,9 @@ func RecycleSlice(s *[]byte) {
|
|||||||
if cap(*s) == 0 {
|
if cap(*s) == 0 {
|
||||||
log.Panicln("trying to store a NIL slice in the pool", s == nil, *s == nil, cap(*s))
|
log.Panicln("trying to store a NIL slice in the pool", s == nil, *s == nil, cap(*s))
|
||||||
}
|
}
|
||||||
_BioSequenceByteSlicePool.Put(s)
|
if cap(*s) <= 1024 {
|
||||||
|
_BioSequenceByteSlicePool.Put(s)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -28,7 +30,10 @@ func RecycleSlice(s *[]byte) {
|
|||||||
//
|
//
|
||||||
// the slice can be prefilled with the provided values
|
// the slice can be prefilled with the provided values
|
||||||
func GetSlice(capacity int) []byte {
|
func GetSlice(capacity int) []byte {
|
||||||
p := _BioSequenceByteSlicePool.Get().(*[]byte)
|
p := (*[]byte)(nil)
|
||||||
|
if capacity <= 1024 {
|
||||||
|
p = _BioSequenceByteSlicePool.Get().(*[]byte)
|
||||||
|
}
|
||||||
|
|
||||||
if p == nil || *p == nil || cap(*p) < capacity {
|
if p == nil || *p == nil || cap(*p) < capacity {
|
||||||
s := make([]byte, 0, capacity)
|
s := make([]byte, 0, capacity)
|
||||||
|
@ -8,7 +8,6 @@ import (
|
|||||||
// Returns a sub sequence start from position 'from' included,
|
// Returns a sub sequence start from position 'from' included,
|
||||||
// to position 'to' excluded. Coordinates start at position 0.
|
// to position 'to' excluded. Coordinates start at position 0.
|
||||||
func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSequence, error) {
|
func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSequence, error) {
|
||||||
|
|
||||||
if from >= to && !circular {
|
if from >= to && !circular {
|
||||||
return nil, errors.New("from greater than to")
|
return nil, errors.New("from greater than to")
|
||||||
}
|
}
|
||||||
@ -24,10 +23,11 @@ func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSeque
|
|||||||
var newSeq *BioSequence
|
var newSeq *BioSequence
|
||||||
|
|
||||||
if from < to {
|
if from < to {
|
||||||
newSeq = NewEmptyBioSequence()
|
newSeq = NewEmptyBioSequence(0)
|
||||||
newSeq.Write(sequence.Sequence()[from:to])
|
newSeq.sequence = CopySlice(sequence.Sequence()[from:to])
|
||||||
|
|
||||||
if sequence.HasQualities() {
|
if sequence.HasQualities() {
|
||||||
|
newSeq.qualities = CopySlice(sequence.Qualities()[from:to])
|
||||||
newSeq.WriteQualities(sequence.Qualities()[from:to])
|
newSeq.WriteQualities(sequence.Qualities()[from:to])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user