2022-01-13 23:27:39 +01:00
|
|
|
package obiformats
|
|
|
|
|
|
|
|
// #cgo CFLAGS: -g -Wall
|
|
|
|
// #cgo LDFLAGS: -lz
|
|
|
|
// #include <stdlib.h>
|
|
|
|
// #include "fastseq_read.h"
|
|
|
|
import "C"
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"log"
|
|
|
|
"os"
|
|
|
|
"time"
|
|
|
|
"unsafe"
|
|
|
|
|
2022-01-13 23:43:01 +01:00
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/cutils"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
2022-01-13 23:27:39 +01:00
|
|
|
)
|
|
|
|
|
2022-01-16 00:21:42 +01:00
|
|
|
func _FastseqReader(seqfile C.fast_kseq_p,
|
2022-01-13 23:27:39 +01:00
|
|
|
iterator obiseq.IBioSequenceBatch,
|
|
|
|
batch_size int) {
|
|
|
|
var comment string
|
|
|
|
i := 0
|
|
|
|
ii := 0
|
|
|
|
|
2022-02-18 22:53:09 +01:00
|
|
|
slice := obiseq.GetBioSequenceSlice()
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
for l := int64(C.next_fast_sek(seqfile)); l > 0; l = int64(C.next_fast_sek(seqfile)) {
|
|
|
|
|
|
|
|
s := seqfile.seq
|
|
|
|
|
2022-01-16 00:21:42 +01:00
|
|
|
csequence := cutils.ByteSlice(unsafe.Pointer(s.seq.s), int(s.seq.l))
|
|
|
|
sequence := obiseq.GetSlice()
|
|
|
|
sequence = append(sequence, csequence...)
|
|
|
|
|
|
|
|
//sequence := C.GoBytes(unsafe.Pointer(s.seq.s),
|
|
|
|
// C.int(s.seq.l))
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
name := C.GoString(s.name.s)
|
|
|
|
|
|
|
|
if s.comment.l > C.ulong(0) {
|
|
|
|
comment = C.GoString(s.comment.s)
|
|
|
|
} else {
|
|
|
|
comment = ""
|
|
|
|
}
|
|
|
|
|
|
|
|
rep := obiseq.MakeBioSequence(name, sequence, comment)
|
|
|
|
|
|
|
|
if s.qual.l > C.ulong(0) {
|
|
|
|
cquality := cutils.ByteSlice(unsafe.Pointer(s.qual.s), int(s.qual.l))
|
|
|
|
l := int(s.qual.l)
|
2022-01-16 00:21:42 +01:00
|
|
|
quality := obiseq.GetSlice()
|
2022-01-13 23:27:39 +01:00
|
|
|
shift := uint8(seqfile.shift)
|
|
|
|
for j := 0; j < l; j++ {
|
2022-01-16 00:21:42 +01:00
|
|
|
quality = append(quality, uint8(cquality[j])-shift)
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
rep.SetQualities(quality)
|
|
|
|
}
|
|
|
|
slice = append(slice, rep)
|
|
|
|
ii++
|
|
|
|
if ii >= batch_size {
|
|
|
|
// log.Printf("\n==> Pushing sequence batch\n")
|
|
|
|
// start := time.Now()
|
|
|
|
|
|
|
|
iterator.Channel() <- obiseq.MakeBioSequenceBatch(i, slice...)
|
|
|
|
// elapsed := time.Since(start)
|
|
|
|
// log.Printf("\n==>sequences pushed after %s\n", elapsed)
|
|
|
|
|
|
|
|
slice = make(obiseq.BioSequenceSlice, 0, batch_size)
|
|
|
|
i++
|
|
|
|
ii = 0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(slice) > 0 {
|
|
|
|
iterator.Channel() <- obiseq.MakeBioSequenceBatch(i, slice...)
|
|
|
|
}
|
|
|
|
iterator.Done()
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFastSeqBatchFromFile(filename string, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
|
|
|
|
opt := MakeOptions(options)
|
|
|
|
|
|
|
|
name := C.CString(filename)
|
|
|
|
defer C.free(unsafe.Pointer(name))
|
|
|
|
|
|
|
|
pointer := C.open_fast_sek_file(name, C.int32_t(opt.QualityShift()))
|
|
|
|
|
|
|
|
var err error
|
|
|
|
err = nil
|
|
|
|
|
|
|
|
if pointer == nil {
|
2022-01-14 16:10:19 +01:00
|
|
|
err = fmt.Errorf("cannot open file %s", filename)
|
2022-01-13 23:27:39 +01:00
|
|
|
return obiseq.NilIBioSequenceBatch, err
|
|
|
|
}
|
|
|
|
|
|
|
|
size := int64(-1)
|
|
|
|
fi, err := os.Stat(filename)
|
|
|
|
if err == nil {
|
|
|
|
size = fi.Size()
|
|
|
|
log.Printf("File size of %s is %d bytes\n", filename, size)
|
|
|
|
} else {
|
|
|
|
size = -1
|
|
|
|
}
|
|
|
|
|
2022-01-14 17:32:12 +01:00
|
|
|
newIter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
|
|
|
|
newIter.Add(1)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
go func() {
|
2022-01-14 17:32:12 +01:00
|
|
|
newIter.Wait()
|
|
|
|
for len(newIter.Channel()) > 0 {
|
2022-01-13 23:27:39 +01:00
|
|
|
time.Sleep(time.Millisecond)
|
|
|
|
}
|
2022-01-14 17:32:12 +01:00
|
|
|
close(newIter.Channel())
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
log.Println("End of the fastq file reading")
|
|
|
|
}()
|
|
|
|
|
|
|
|
log.Println("Start of the fastq file reading")
|
|
|
|
|
2022-01-16 00:21:42 +01:00
|
|
|
go _FastseqReader(pointer, newIter, opt.BatchSize())
|
2022-01-13 23:27:39 +01:00
|
|
|
parser := opt.ParseFastSeqHeader()
|
2022-02-09 22:00:38 +01:00
|
|
|
|
2022-01-13 23:27:39 +01:00
|
|
|
if parser != nil {
|
2022-01-14 17:32:12 +01:00
|
|
|
return IParseFastSeqHeaderBatch(newIter, options...), err
|
2022-02-18 22:53:09 +01:00
|
|
|
}
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-01-14 17:32:12 +01:00
|
|
|
return newIter, err
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFastSeqFromFile(filename string, options ...WithOption) (obiseq.IBioSequence, error) {
|
|
|
|
ib, err := ReadFastSeqBatchFromFile(filename, options...)
|
|
|
|
return ib.SortBatches().IBioSequence(), err
|
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFastSeqBatchFromStdin(options ...WithOption) obiseq.IBioSequenceBatch {
|
|
|
|
opt := MakeOptions(options)
|
2022-01-14 17:32:12 +01:00
|
|
|
newIter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-01-14 17:32:12 +01:00
|
|
|
newIter.Add(1)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
go func() {
|
2022-01-14 17:32:12 +01:00
|
|
|
newIter.Wait()
|
|
|
|
close(newIter.Channel())
|
2022-01-13 23:27:39 +01:00
|
|
|
}()
|
|
|
|
|
2022-01-16 00:21:42 +01:00
|
|
|
go _FastseqReader(C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())), newIter, opt.BatchSize())
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-01-14 17:32:12 +01:00
|
|
|
return newIter
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFastSeqFromStdin(options ...WithOption) obiseq.IBioSequence {
|
|
|
|
ib := ReadFastSeqBatchFromStdin(options...)
|
|
|
|
return ib.SortBatches().IBioSequence()
|
|
|
|
}
|