diff --git a/pkg/obiapat/apat.h b/pkg/obiapat/apat.h index c536523..5916459 100644 --- a/pkg/obiapat/apat.h +++ b/pkg/obiapat/apat.h @@ -106,7 +106,6 @@ typedef struct { /* sequence */ int32_t datsiz; /* data buffer size */ int32_t circular; uint8_t *data; /* data buffer */ - char *cseq; /* sequence buffer */ StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */ StackiPtr hiterr[MAX_PATTERN]; /* stack of errors */ } Seq, *SeqPtr; diff --git a/pkg/obiapat/obiapat.c b/pkg/obiapat/obiapat.c index 85f0c55..38b9f56 100644 --- a/pkg/obiapat/obiapat.c +++ b/pkg/obiapat/obiapat.c @@ -6,7 +6,7 @@ #include "obiapat.h" -static void EncodeSequence(SeqPtr seq); +static void EncodeSequence(SeqPtr seq, const char *in); static void UpperSequence(char *seq); /* @@ -142,64 +142,6 @@ char *ecoComplementPattern(char *nucAcSeq) return reverseSequence(LXBioSeqComplement(nucAcSeq),1); } -char *ecoComplementSequence(char *nucAcSeq) -{ - return reverseSequence(LXBioSeqComplement(nucAcSeq),0); -} - - -char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end, - int *errno, char **errmsg) -/* - extract subsequence from nucAcSeq [begin,end[ -*/ -{ - static char *buffer = NULL; - static int32_t buffSize= 0; - int32_t length; - - if (begin < end) - { - length = end - begin; - - if (length >= buffSize) - { - buffSize = length+1; - if (buffer) - buffer=ECOREALLOC(buffer,buffSize, - "Error in reallocating sub sequence buffer",errno,errmsg); - else - buffer=ECOMALLOC(buffSize, - "Error in allocating sub sequence buffer",errno,errmsg); - - } - - strncpy(buffer,nucAcSeq + begin,length); - buffer[length]=0; - } - else - { - length = end + strlen(nucAcSeq) - begin; - - if (length >= buffSize) - { - buffSize = length+1; - if (buffer) - buffer=ECOREALLOC(buffer,buffSize, - "Error in reallocating sub sequence buffer",errno,errmsg); - else - buffer=ECOMALLOC(buffSize, - "Error in allocating sub sequence buffer",errno,errmsg); - - } - strncpy(buffer,nucAcSeq+begin,length - end); - strncpy(buffer+(length-end),nucAcSeq ,end); - buffer[length]=0; - } - - return buffer; -} - /* -------------------------------------------- */ /* uppercase sequence */ @@ -229,29 +171,27 @@ void UpperSequence(char *seq) /* -------------------------------------------- */ #define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) +#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z')) -void EncodeSequence(SeqPtr seq) +void EncodeSequence(SeqPtr seq, const char *in) { int i; uint8_t *data; - char *cseq; + const char *cseq; char nuc; data = seq->data; - cseq = seq->cseq; - while (*cseq) { - nuc = *cseq & (~32); - *data = (IS_UPPER(nuc) ? nuc - 'A' : 0x0); - data++; - cseq++; + for (i=0,cseq=in; i < seq->seqlen; i++,cseq++,data++) { + nuc = *cseq; + *data = (IS_LOWER(nuc) ? nuc - 'a' : 0x0); } - - for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++) { - nuc = *cseq & (~32); - *data = (IS_UPPER(nuc) ? nuc - 'A' : 0x0); + + for (i=0,cseq=in; i < seq->circular; i++,cseq++,data++) { + nuc = *cseq; + *data = (IS_LOWER(nuc) ? nuc - 'a' : 0x0); } for (i = 0 ; i < MAX_PATTERN ; i++) @@ -266,6 +206,7 @@ SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen, SeqPtr out, int *errno, char **errmsg) { + // fprintf(stderr,">>>>>>>> new_apatseq\n"); int i; if (circular != 0) circular=MAX_PAT_LEN; @@ -287,28 +228,26 @@ SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen, } - out->seqsiz = out->seqlen = seqlen; - out->circular = circular; if (!out->data) { - out->data = ECOMALLOC((out->seqlen+circular) *sizeof(uint8_t), + out->data = ECOMALLOC((seqlen+circular) *sizeof(uint8_t), "Error in Allocation of a new Seq data member", errno,errmsg); - out->datsiz= out->seqlen+circular; + out->datsiz= seqlen+circular; } - else if ((out->seqlen +circular) >= out->datsiz) + else if ((seqlen +circular) >= out->datsiz) { - out->data = ECOREALLOC(out->data,(out->seqlen+circular) *sizeof(uint8_t), + out->data = ECOREALLOC(out->data,(seqlen+circular) *sizeof(uint8_t), "Error during Seq data buffer realloc", errno,errmsg); - out->datsiz= out->seqlen+circular; + out->datsiz= seqlen+circular; } - - out->cseq = (char *)in; - EncodeSequence(out); - + out->circular = circular; + out->seqlen = seqlen; + EncodeSequence(out,in); + // fprintf(stderr,">>>>>>>> Encodage ok\n"); return out; } diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index 5f86ab1..6016d93 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -27,6 +27,7 @@ var _AllocatedApaPattern = 0 // Apat algorithm functions and methods type _ApatPattern struct { pointer *C.Pattern + pattern string } type ApatPattern struct { @@ -37,6 +38,7 @@ type ApatPattern struct { // Apat algorithm functions and methods type _ApatSequence struct { pointer *C.Seq + reference *obiseq.BioSequence } type ApatSequence struct { @@ -88,7 +90,8 @@ func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPatter return NilApatPattern, errors.New(message) } - ap := _ApatPattern{apc} + + ap := _ApatPattern{apc,pattern} runtime.SetFinalizer(&ap, func(p *_ApatPattern) { // log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat)) @@ -111,8 +114,8 @@ func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) { C.free(unsafe.Pointer(errmsg)) return ApatPattern{nil}, errors.New(message) } - - ap := _ApatPattern{apc} + spat := C.GoString(apc.cpat) + ap := _ApatPattern{apc,spat} runtime.SetFinalizer(&ap, func(p *_ApatPattern) { // log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat)) @@ -124,7 +127,8 @@ func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) { // String method casts the ApatPattern to a Go String. func (pattern ApatPattern) String() string { - return C.GoString(pattern.pointer.pointer.cpat) + return pattern.pointer.pattern + //return C.GoString(pattern.pointer.pointer.cpat) } // Len method returns the length of the matched pattern. @@ -166,7 +170,6 @@ func (pattern ApatPattern) Print() { func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...ApatSequence) (ApatSequence, error) { var errno C.int32_t var errmsg *C.char - var p unsafe.Pointer seqlen := sequence.Len() ic := 0 @@ -178,33 +181,14 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap if len(recycle) > 0 { out = recycle[0].pointer.pointer - if (int(out.seqlen) < seqlen || int(out.seqlen) > 5*seqlen) && out.cseq != nil { - C.free(unsafe.Pointer(out.cseq)) - out.cseq = nil - } } else { out = nil } - if out == nil || out.cseq == nil { - - p = C.malloc(C.size_t(seqlen) + 1) - // if p != nil { - // // atomic.AddInt64(&_AllocatedApaSequences, 1) - // } - } else { - p = unsafe.Pointer(out.cseq) - } - - if p == nil { - log.Panicln("Cannot allocate memory chunk for Cseq Apat sequecence") - } // copy the data into the buffer, by converting it to a Go array - cBuf := (*[1 << 31]byte)(p) - copy(cBuf[:], sequence.Sequence()) - cBuf[sequence.Len()] = 0 - + p := unsafe.Pointer(unsafe.SliceData(sequence.Sequence())) + pseqc := C.new_apatseq((*C.char)(p), C.int32_t(ic), C.int32_t(seqlen), (*C.Seq)(out), &errno, &errmsg) @@ -221,19 +205,14 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap if out == nil { // log.Printf("Make ApatSeq called on %p -> %p\n", out, pseqc) - seq := _ApatSequence{pointer: pseqc} + seq := _ApatSequence{pointer: pseqc,reference: sequence} runtime.SetFinalizer(&seq, func(apat_p *_ApatSequence) { var errno C.int32_t var errmsg *C.char - // log.Printf("Finaliser called on %p\n", apat_p.pointer) + log.Debugf("Finaliser called on %p\n", apat_p.pointer) if apat_p != nil && apat_p.pointer != nil { - if apat_p.pointer.cseq != nil { - C.free(unsafe.Pointer(apat_p.pointer.cseq)) - apat_p.pointer.cseq = nil - // atomic.AddInt64(&_AllocatedApaSequences, -1) - } C.delete_apatseq(apat_p.pointer, &errno, &errmsg) } }) @@ -242,6 +221,7 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap } recycle[0].pointer.pointer = pseqc + recycle[0].pointer.reference = sequence //log.Println(C.GoString(pseq.cseq)) @@ -259,16 +239,9 @@ func (sequence ApatSequence) Free() { var errno C.int32_t var errmsg *C.char - // log.Printf("Free called on %p\n", sequence.pointer.pointer) + log.Debugf("Free called on %p\n", sequence.pointer.pointer) if sequence.pointer != nil && sequence.pointer.pointer != nil { - - if sequence.pointer.pointer.cseq != nil { - C.free(unsafe.Pointer(sequence.pointer.pointer.cseq)) - sequence.pointer.pointer.cseq = nil - // atomic.AddInt64(&_AllocatedApaSequences, -1) - } - C.delete_apatseq(sequence.pointer.pointer, &errno, &errmsg) @@ -315,11 +288,11 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int for i := 0; i < nhits; i++ { start := int(stktmp[i]) err := int(errtmp[i]) - log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err) + //log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err) loc = append(loc, [3]int{start, start + patlen, err}) } - log.Debugln("------------") + //log.Debugln("------------") return loc } @@ -359,16 +332,17 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( end = obiutils.MinInt(end, sequence.Len()) cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) - cseq := (*[1 << 30]byte)(unsafe.Pointer(sequence.pointer.pointer.cseq)) + frg := sequence.pointer.reference.Sequence()[start:end] + log.Debugln( - string((*cseq)[start:end]), + string(frg), string((*cpattern)[0:int(pattern.pointer.pointer.patlen)]), best[0], nerr, int(pattern.pointer.pointer.patlen), sequence.Len(), start, end) score, lali := obialign.FastLCSEGFScoreByte( - (*cseq)[start:end], + frg, (*cpattern)[0:int(pattern.pointer.pointer.patlen)], nerr, true, &buffer) diff --git a/pkg/obiapat/pcr.go b/pkg/obiapat/pcr.go index d9966e0..a99792c 100644 --- a/pkg/obiapat/pcr.go +++ b/pkg/obiapat/pcr.go @@ -255,8 +255,10 @@ func _Pcr(seq ApatSequence, (opt.MinLength() == 0 || length >= opt.MinLength()) && (opt.MaxLength() == 0 || length <= opt.MaxLength()) { amplicon, _ := sequence.Subsequence(fm[1], rm[0], opt.pointer.circular) + log.Debugf("seq length : %d capacity : %d",amplicon.Len(),cap(amplicon.Sequence())) annot := amplicon.Annotations() obiutils.MustFillMap(annot, sequence.Annotations()) + annot["forward_primer"] = forward.String() match, _ := sequence.Subsequence(fm[0], fm[1], opt.pointer.circular) @@ -392,6 +394,7 @@ func _PCRSlice(sequences obiseq.BioSequenceSlice, results = append(results, amplicons...) } + log.Debugf("Number of sequences in the slice : %d",len(sequences)) for _, sequence := range sequences[1:] { seq, _ = MakeApatSequence(sequence, options.Circular(), seq) amplicons = _Pcr(seq, sequence, options) @@ -400,7 +403,7 @@ func _PCRSlice(sequences obiseq.BioSequenceSlice, } } - // log.Println(AllocatedApaSequences()) + //log.Debugln(AllocatedApaSequences()) // seq.Free() } @@ -426,7 +429,9 @@ func PCRSliceWorker(options ...WithOption) obiseq.SeqSliceWorker { opt := MakeOptions(options) worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { - return _PCRSlice(sequences, opt) + result := _PCRSlice(sequences, opt) + sequences.Recycle(true) + return result } return worker diff --git a/pkg/obichunk/chunks.go b/pkg/obichunk/chunks.go index f88e779..00e426b 100644 --- a/pkg/obichunk/chunks.go +++ b/pkg/obichunk/chunks.go @@ -46,7 +46,7 @@ func ISequenceChunk(iterator obiiter.IBioSequence, for data.Next() { b := data.Get() *chunk = append(*chunk, b.Slice()...) - b.Recycle() + b.Recycle(false) } jobDone.Done() diff --git a/pkg/obichunk/subchunks.go b/pkg/obichunk/subchunks.go index 7250946..240d2f6 100644 --- a/pkg/obichunk/subchunks.go +++ b/pkg/obichunk/subchunks.go @@ -106,7 +106,7 @@ func ISequenceSubChunk(iterator obiiter.IBioSequence, batch.Slice()[i] = nil } - batch.Recycle() + batch.Recycle(false) _By(func(p1, p2 *sSS) bool { return p1.code < p2.code diff --git a/pkg/obichunk/unique.go b/pkg/obichunk/unique.go index aa3631b..0a81c86 100644 --- a/pkg/obichunk/unique.go +++ b/pkg/obichunk/unique.go @@ -97,8 +97,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence, // No more sub classification of sequence or only a single sequence if opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1 { // We remove singleton from output - batch.Slice()[0].Recycle() - batch.Recycle() + batch.Recycle(true) } else { iUnique.Push(batch.Reorder(nextOrder())) } diff --git a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go index d9d1b88..20952a5 100644 --- a/pkg/obiformats/fastseq_write_fasta.go +++ b/pkg/obiformats/fastseq_write_fasta.go @@ -69,7 +69,7 @@ func WriteFasta(iterator obiiter.IBioSequence, options ...WithOption) (obiiter.IBioSequence, error) { opt := MakeOptions(options) - iterator = iterator.Rebatch(10000) + iterator = iterator.Rebatch(1000) file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) newIter := obiiter.MakeIBioSequence() diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go index eaf62ce..7825482 100644 --- a/pkg/obiformats/fastseq_write_fastq.go +++ b/pkg/obiformats/fastseq_write_fastq.go @@ -57,7 +57,7 @@ func WriteFastq(iterator obiiter.IBioSequence, file io.WriteCloser, options ...WithOption) (obiiter.IBioSequence, error) { - iterator = iterator.Rebatch(10000) + iterator = iterator.Rebatch(1000) opt := MakeOptions(options) diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 195c974..7bbb7cf 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -29,15 +29,16 @@ const ( ) func _ParseGenbankFile(source string, - input <-chan _FileChunk, out obiiter.IBioSequence) { + input <-chan _FileChunk, out obiiter.IBioSequence, + chunck_order func() int) { state := inHeader for chunks := range input { - log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len()) + // log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len()) scanner := bufio.NewScanner(chunks.raw) - order := chunks.order sequences := make(obiseq.BioSequenceSlice, 0, 100) + sumlength:=0 id := "" scientificName := "" defBytes := new(bytes.Buffer) @@ -67,7 +68,7 @@ func _ParseGenbankFile(source string, case strings.HasPrefix(line, "ORIGIN"): state = inSequence case line == "//": - log.Debugln("Total lines := ", nl) + // log.Debugln("Total lines := ", nl) sequence := obiseq.NewBioSequence(id, seqBytes.Bytes(), defBytes.String()) @@ -80,10 +81,17 @@ func _ParseGenbankFile(source string, annot["scientific_name"] = scientificName annot["taxid"] = taxid // log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader)) - log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(), - sequence.Len(), seqBytes.Len()) + // log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(), + // sequence.Len(), seqBytes.Len()) sequences = append(sequences, sequence) + sumlength+=sequence.Len() + + if len(sequences) == 100 || sumlength > 1e7 { + out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) + sequences = make(obiseq.BioSequenceSlice, 0, 100) + sumlength = 0 + } defBytes = new(bytes.Buffer) featBytes = new(bytes.Buffer) seqBytes = new(bytes.Buffer) @@ -111,8 +119,10 @@ func _ParseGenbankFile(source string, } } - out.Push(obiiter.MakeBioSequenceBatch(order, sequences)) - } + if len(sequences) > 0 { + out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) + } +} out.Done() @@ -125,6 +135,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence { newIter := obiiter.MakeIBioSequence() nworkers := opt.ParallelWorkers() + chunck_order := obiutils.AtomicCounter() newIter.Add(nworkers) go func() { @@ -133,7 +144,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence { // for j := 0; j < opt.ParallelWorkers(); j++ { for j := 0; j < nworkers; j++ { - go _ParseGenbankFile(opt.Source(),entry_channel, newIter) + go _ParseGenbankFile(opt.Source(), entry_channel, newIter,chunck_order) } go _ReadFlatFileChunk(reader, entry_channel) @@ -152,7 +163,6 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - reader, err = os.Open(filename) if err != nil { log.Printf("open file error: %+v", err) diff --git a/pkg/obiiter/batch.go b/pkg/obiiter/batch.go index 47b7b82..bf7ac2f 100644 --- a/pkg/obiiter/batch.go +++ b/pkg/obiiter/batch.go @@ -47,7 +47,7 @@ func (batch BioSequenceBatch) IsNil() bool { return batch.slice == nil } -func (batch BioSequenceBatch) Recycle() { - batch.slice.Recycle() +func (batch BioSequenceBatch) Recycle(including_seq bool) { + batch.slice.Recycle(including_seq) batch.slice = nil } diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index 6e86d82..927f46c 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -435,7 +435,7 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence { buffer = obiseq.MakeBioSequenceSlice() } } - seqs.Recycle() + seqs.Recycle(false) } if len(buffer) > 0 { @@ -461,11 +461,8 @@ func (iterator IBioSequence) Recycle() { // iterator.Get() batch := iterator.Get() log.Debugln("Recycling batch #", batch.Order()) - for _, seq := range batch.Slice() { - seq.Recycle() - recycled++ - } - batch.Recycle() + recycled+=batch.Len() + batch.Recycle(true) } log.Debugf("End of the recycling of %d Bioseq objects", recycled) } @@ -473,7 +470,7 @@ func (iterator IBioSequence) Recycle() { func (iterator IBioSequence) Consume() { for iterator.Next() { batch := iterator.Get() - batch.Recycle() + batch.Recycle(false) } } @@ -490,12 +487,8 @@ func (iterator IBioSequence) Count(recycle bool) (int, int, int) { variants++ reads += seq.Count() nucleotides += seq.Len() - - if recycle { - seq.Recycle() - } } - batch.Recycle() + batch.Recycle(recycle) } log.Debugf("End of the counting of %d Bioseq objects", variants) return variants, reads, nucleotides @@ -547,7 +540,7 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate, falseSlice = obiseq.MakeBioSequenceSlice() } } - seqs.Recycle() + seqs.Recycle(false) } if len(trueSlice) > 0 { @@ -688,7 +681,7 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice { b := iterator.Get() log.Debugf("append %d sequences",b.Len()) chunck = append(chunck, b.Slice()...) - b.Recycle() + b.Recycle(false) } return chunck diff --git a/pkg/obiiter/distribute.go b/pkg/obiiter/distribute.go index 45e755a..bba3f79 100644 --- a/pkg/obiiter/distribute.go +++ b/pkg/obiiter/distribute.go @@ -92,7 +92,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz slices[key] = &s } } - seqs.Recycle() + seqs.Recycle(false) } for key, slice := range slices { diff --git a/pkg/obiiter/workers.go b/pkg/obiiter/workers.go index 8003851..b7adc6e 100644 --- a/pkg/obiiter/workers.go +++ b/pkg/obiiter/workers.go @@ -119,7 +119,7 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size for iterator.Next() { batch := iterator.Get() batch.slice = worker(batch.slice) - newIter.pointer.channel <- batch + newIter.Push(batch) } newIter.Done() } diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index e38b685..76d0e9d 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -61,15 +61,20 @@ type BioSequence struct { } // MakeEmptyBioSequence() creates a new BioSequence object with no data -func MakeEmptyBioSequence() BioSequence { +func MakeEmptyBioSequence(preallocate int) BioSequence { atomic.AddInt32(&_NewSeq, 1) atomic.AddInt32(&_InMemSeq, 1) + seq := []byte(nil) + if preallocate > 0 { + seq = GetSlice(preallocate) + } + return BioSequence{ id: "", definition: "", source: "", - sequence: nil, + sequence: seq, qualities: nil, feature: nil, paired: nil, @@ -78,8 +83,8 @@ func MakeEmptyBioSequence() BioSequence { } // `NewEmptyBioSequence()` returns a pointer to a new empty BioSequence -func NewEmptyBioSequence() *BioSequence { - s := MakeEmptyBioSequence() +func NewEmptyBioSequence(preallocate int) *BioSequence { + s := MakeEmptyBioSequence(preallocate) return &s } @@ -87,7 +92,7 @@ func NewEmptyBioSequence() *BioSequence { func MakeBioSequence(id string, sequence []byte, definition string) BioSequence { - bs := MakeEmptyBioSequence() + bs := MakeEmptyBioSequence(0) bs.SetId(id) bs.SetSequence(sequence) bs.SetDefinition(definition) @@ -127,7 +132,7 @@ func (sequence *BioSequence) Recycle() { // Copying the BioSequence. func (s *BioSequence) Copy() *BioSequence { - newSeq := MakeEmptyBioSequence() + newSeq := MakeEmptyBioSequence(0) newSeq.id = s.id newSeq.definition = s.definition diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 46d2380..3337512 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -34,14 +34,22 @@ func MakeBioSequenceSlice(size ...int) BioSequenceSlice { return *NewBioSequenceSlice(size...) } -func (s *BioSequenceSlice) Recycle() { +func (s *BioSequenceSlice) Recycle(including_seq bool) { if s == nil { log.Panicln("Trying too recycle a nil pointer") } // Code added to potentially limit memory leaks - for i := range *s { - (*s)[i] = nil + if including_seq { + for i := range *s { + (*s)[i] .Recycle() + (*s)[i] = nil + } + + } else { + for i := range *s { + (*s)[i] = nil + } } *s = (*s)[:0] diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index c84b232..ca1ea41 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -213,7 +213,7 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequenc } } - sequences.Recycle() + sequences.Recycle(false) return seq } diff --git a/pkg/obiseq/pool.go b/pkg/obiseq/pool.go index c03c6f7..08d2806 100644 --- a/pkg/obiseq/pool.go +++ b/pkg/obiseq/pool.go @@ -20,7 +20,9 @@ func RecycleSlice(s *[]byte) { if cap(*s) == 0 { log.Panicln("trying to store a NIL slice in the pool", s == nil, *s == nil, cap(*s)) } - _BioSequenceByteSlicePool.Put(s) + if cap(*s) <= 1024 { + _BioSequenceByteSlicePool.Put(s) + } } } @@ -28,7 +30,10 @@ func RecycleSlice(s *[]byte) { // // the slice can be prefilled with the provided values func GetSlice(capacity int) []byte { - p := _BioSequenceByteSlicePool.Get().(*[]byte) + p := (*[]byte)(nil) + if capacity <= 1024 { + p = _BioSequenceByteSlicePool.Get().(*[]byte) + } if p == nil || *p == nil || cap(*p) < capacity { s := make([]byte, 0, capacity) diff --git a/pkg/obiseq/subseq.go b/pkg/obiseq/subseq.go index 6cc5778..0e49da3 100644 --- a/pkg/obiseq/subseq.go +++ b/pkg/obiseq/subseq.go @@ -8,7 +8,6 @@ import ( // Returns a sub sequence start from position 'from' included, // to position 'to' excluded. Coordinates start at position 0. func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSequence, error) { - if from >= to && !circular { return nil, errors.New("from greater than to") } @@ -24,10 +23,11 @@ func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSeque var newSeq *BioSequence if from < to { - newSeq = NewEmptyBioSequence() - newSeq.Write(sequence.Sequence()[from:to]) + newSeq = NewEmptyBioSequence(0) + newSeq.sequence = CopySlice(sequence.Sequence()[from:to]) if sequence.HasQualities() { + newSeq.qualities = CopySlice(sequence.Qualities()[from:to]) newSeq.WriteQualities(sequence.Qualities()[from:to]) }