Add some code refactoring from the blackboard branch

This commit is contained in:
Eric Coissac
2024-08-02 12:35:46 +02:00
parent bc1aaaf7d9
commit 1b1cd41fd3
38 changed files with 491 additions and 330 deletions

View File

@ -20,6 +20,37 @@
### New features ### New features
- The output of the obitools will evolve to produce results only in standard
formats such as fasta and fastq. For non-sequential data, the output will be
in CSV format, with the separator `,`, the decimal separator `.`, and a
header line with the column names. It is more convenient to use the output
in other programs. For example, you can use the `csvtomd` command to
reformat the csv output into a markdown table. The first command to initiate
this change is `obicount`, which now produces a 3-line CSV output.
```bash
obicount data.fasta | csvtomd
```
- Adds the new experimental `obicleandb` utility to clean up reference
database files created with `obipcr`. An easy way to create a reference
database for `obitag` is to use `obipcr` on a local copy of Genbank or EMBL.
However, these sequence databases are known to contain many taxonomic
errors, such as bacterial sequences annotated with the taxid of their host
species. `obicleandb` tries to detect these errors. To do this, it first keeps
only sequences annotated with the taxid to which a species, genus, and
family taxid can be assigned. Then, for each sequence, it compares the
distances between that sequence and the other sequences belonging to the
same genus with an equal number of distances between the considered
sequence and a randomly selected set of sequences belonging to another
family, using a Mann-Whitney U test. The alternative hypothesis is that out-of-family
distances are greater than intrageneric distances. Sequences are annotated
with the p-value of the Mann-Whitney U test in the **obicleandb_trusted**
slot. Later, the distribution of this p-value can be analyzed to determine a
threshold. Empirically, a threshold of 0.05 is a good compromise, making it
possible to filter out fewer than 1‰ of the sequences. These sequences can then be
removed using `obigrep`.
- Adds a new `obijoin` utility to join information contained in a sequence - Adds a new `obijoin` utility to join information contained in a sequence
file with that contained in another sequence or CSV file. The command allows file with that contained in another sequence or CSV file. The command allows
you to specify the names of the keys in the main sequence file and in the you to specify the names of the keys in the main sequence file and in the

View File

@ -45,17 +45,18 @@ func main() {
nvariant, nread, nsymbol := fs.Count(true) nvariant, nread, nsymbol := fs.Count(true)
fmt.Print("entites,n\n")
if obicount.CLIIsPrintingVariantCount() { if obicount.CLIIsPrintingVariantCount() {
fmt.Printf(" %d", nvariant) fmt.Printf("variants,%d\n", nvariant)
} }
if obicount.CLIIsPrintingReadCount() { if obicount.CLIIsPrintingReadCount() {
fmt.Printf(" %d", nread) fmt.Printf("reads,%d\n", nread)
} }
if obicount.CLIIsPrintingSymbolCount() { if obicount.CLIIsPrintingSymbolCount() {
fmt.Printf(" %d", nsymbol) fmt.Printf("symbols,%d\n", nsymbol)
} }
fmt.Printf("\n")
} }

View File

@ -73,11 +73,11 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
panic(err) panic(err)
} }
chunck := iseq.Load() source, chunk := iseq.Load()
newIter.Push(obiiter.MakeBioSequenceBatch(order, chunck)) newIter.Push(obiiter.MakeBioSequenceBatch(source, order, chunk))
log.Infof("Start processing of batch %d/%d : %d sequences", log.Infof("Start processing of batch %d/%d : %d sequences",
order, nbatch, len(chunck)) order, nbatch, len(chunk))
} }

View File

@ -28,6 +28,7 @@ func ISequenceChunk(iterator obiiter.IBioSequence,
jobDone := sync.WaitGroup{} jobDone := sync.WaitGroup{}
chunks := make(map[int]*obiseq.BioSequenceSlice, 1000) chunks := make(map[int]*obiseq.BioSequenceSlice, 1000)
sources := make(map[int]string, 1000)
for newflux := range dispatcher.News() { for newflux := range dispatcher.News() {
jobDone.Add(1) jobDone.Add(1)
@ -43,12 +44,18 @@ func ISequenceChunk(iterator obiiter.IBioSequence,
chunks[newflux] = chunk chunks[newflux] = chunk
lock.Unlock() lock.Unlock()
source := ""
for data.Next() { for data.Next() {
b := data.Get() b := data.Get()
source = b.Source()
*chunk = append(*chunk, b.Slice()...) *chunk = append(*chunk, b.Slice()...)
b.Recycle(false) b.Recycle(false)
} }
lock.Lock()
sources[newflux] = source
lock.Unlock()
jobDone.Done() jobDone.Done()
}(newflux) }(newflux)
} }
@ -56,10 +63,10 @@ func ISequenceChunk(iterator obiiter.IBioSequence,
jobDone.Wait() jobDone.Wait()
order := 0 order := 0
for _, chunck := range chunks { for i, chunk := range chunks {
if len(*chunck) > 0 { if len(*chunk) > 0 {
newIter.Push(obiiter.MakeBioSequenceBatch(order, *chunck)) newIter.Push(obiiter.MakeBioSequenceBatch(sources[i], order, *chunk))
order++ order++
} }

View File

@ -90,7 +90,7 @@ func ISequenceSubChunk(iterator obiiter.IBioSequence,
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
source := batch.Source()
if batch.Len() > 1 { if batch.Len() > 1 {
classifier.Reset() classifier.Reset()
@ -117,7 +117,7 @@ func ISequenceSubChunk(iterator obiiter.IBioSequence,
ss := obiseq.MakeBioSequenceSlice() ss := obiseq.MakeBioSequenceSlice()
for i, v := range ordered { for i, v := range ordered {
if v.code != last { if v.code != last {
newIter.Push(obiiter.MakeBioSequenceBatch(nextOrder(), ss)) newIter.Push(obiiter.MakeBioSequenceBatch(source, nextOrder(), ss))
ss = obiseq.MakeBioSequenceSlice() ss = obiseq.MakeBioSequenceSlice()
last = v.code last = v.code
} }
@ -127,7 +127,7 @@ func ISequenceSubChunk(iterator obiiter.IBioSequence,
} }
if len(ss) > 0 { if len(ss) > 0 {
newIter.Push(obiiter.MakeBioSequenceBatch(nextOrder(), ss)) newIter.Push(obiiter.MakeBioSequenceBatch(source, nextOrder(), ss))
} }
} else { } else {
newIter.Push(batch.Reorder(nextOrder())) newIter.Push(batch.Reorder(nextOrder()))

View File

@ -111,14 +111,14 @@ func _ParseCsvFile(source string,
slice = append(slice, sequence) slice = append(slice, sequence)
if len(slice) >= batchSize { if len(slice) >= batchSize {
out.Push(obiiter.MakeBioSequenceBatch(o, slice)) out.Push(obiiter.MakeBioSequenceBatch(source, o, slice))
o++ o++
slice = obiseq.MakeBioSequenceSlice() slice = obiseq.MakeBioSequenceSlice()
} }
} }
if len(slice) > 0 { if len(slice) > 0 {
out.Push(obiiter.MakeBioSequenceBatch(o, slice)) out.Push(obiiter.MakeBioSequenceBatch(source, o, slice))
} }
out.Done() out.Done()

View File

@ -142,7 +142,7 @@ func WriteCSV(iterator obiiter.IBioSequence,
nwriters := opt.ParallelWorkers() nwriters := opt.ParallelWorkers()
obiiter.RegisterAPipe() obiiter.RegisterAPipe()
chunkchan := make(chan FileChunck) chunkchan := make(chan FileChunk)
newIter.Add(nwriters) newIter.Add(nwriters)
var waitWriter sync.WaitGroup var waitWriter sync.WaitGroup
@ -161,7 +161,7 @@ func WriteCSV(iterator obiiter.IBioSequence,
batch := iterator.Get() batch := iterator.Get()
chunkchan <- FileChunck{ chunkchan <- FileChunk{
FormatCVSBatch(batch, opt), FormatCVSBatch(batch, opt),
batch.Order(), batch.Order(),
} }
@ -171,7 +171,7 @@ func WriteCSV(iterator obiiter.IBioSequence,
} }
next_to_send := 0 next_to_send := 0
received := make(map[int]FileChunck, 100) received := make(map[int]FileChunk, 100)
waitWriter.Add(1) waitWriter.Add(1)
go func() { go func() {

View File

@ -122,7 +122,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error)
return bseq, nil return bseq, nil
} }
func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence { func ReadEcoPCR(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
tag := make([]byte, 11) tag := make([]byte, 11)
n, _ := reader.Read(tag) n, _ := reader.Read(tag)
@ -187,7 +187,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
slice = append(slice, seq) slice = append(slice, seq)
ii++ ii++
if ii >= opt.BatchSize() { if ii >= opt.BatchSize() {
newIter.Push(obiiter.MakeBioSequenceBatch(i, slice)) newIter.Push(obiiter.MakeBioSequenceBatch(opt.Source(), i, slice))
slice = obiseq.MakeBioSequenceSlice() slice = obiseq.MakeBioSequenceSlice()
i++ i++
ii = 0 ii = 0
@ -198,7 +198,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
} }
if len(slice) > 0 { if len(slice) > 0 {
newIter.Push(obiiter.MakeBioSequenceBatch(i, slice)) newIter.Push(obiiter.MakeBioSequenceBatch(opt.Source(), i, slice))
} }
newIter.Done() newIter.Done()
@ -213,7 +213,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
newIter = newIter.CompleteFileIterator() newIter = newIter.CompleteFileIterator()
} }
return newIter return newIter, nil
} }
func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
@ -235,5 +235,5 @@ func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSeq
reader = greader reader = greader
} }
return ReadEcoPCR(reader, options...), nil return ReadEcoPCR(reader, options...)
} }

View File

@ -15,7 +15,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
// _EndOfLastEntry finds the index of the last entry in the given byte slice 'buff' // EndOfLastFlatFileEntry finds the index of the last entry in the given byte slice 'buff'
// using a pattern match of the form: // using a pattern match of the form:
// <CR>?<LF>//<CR>?<LF> // <CR>?<LF>//<CR>?<LF>
// where <CR> and <LF> are the ASCII codes for carriage return and line feed, // where <CR> and <LF> are the ASCII codes for carriage return and line feed,
@ -27,7 +27,7 @@ import (
// //
// Returns: // Returns:
// int - the index of the end of the last entry or -1 if no match is found. // int - the index of the end of the last entry or -1 if no match is found.
func _EndOfLastEntry(buff []byte) int { func EndOfLastFlatFileEntry(buff []byte) int {
// 6 5 43 2 1 // 6 5 43 2 1
// <CR>?<LF>//<CR>?<LF> // <CR>?<LF>//<CR>?<LF>
var i int var i int
@ -87,15 +87,9 @@ func _EndOfLastEntry(buff []byte) int {
return -1 return -1
} }
func _ParseEmblFile(source string, input ChannelSeqFileChunk, func EmblChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
out obiiter.IBioSequence, parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
withFeatureTable bool, scanner := bufio.NewScanner(input)
batch_size int,
total_seq_size int) {
for chunks := range input {
scanner := bufio.NewScanner(chunks.raw)
order := chunks.order
sequences := make(obiseq.BioSequenceSlice, 0, 100) sequences := make(obiseq.BioSequenceSlice, 0, 100)
id := "" id := ""
scientificName := "" scientificName := ""
@ -156,7 +150,31 @@ func _ParseEmblFile(source string, input ChannelSeqFileChunk,
seqBytes = new(bytes.Buffer) seqBytes = new(bytes.Buffer)
} }
} }
out.Push(obiiter.MakeBioSequenceBatch(order, sequences))
return sequences, nil
}
return parser
}
func _ParseEmblFile(
input ChannelSeqFileChunk,
out obiiter.IBioSequence,
withFeatureTable bool,
) {
parser := EmblChunkParser(withFeatureTable)
for chunks := range input {
order := chunks.Order
sequences, err := parser(chunks.Source, chunks.Raw)
if err != nil {
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
}
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, order, sequences))
} }
out.Done() out.Done()
@ -166,12 +184,18 @@ func _ParseEmblFile(source string, input ChannelSeqFileChunk,
// 6 5 43 2 1 // 6 5 43 2 1
// //
// <CR>?<LF>//<CR>?<LF> // <CR>?<LF>//<CR>?<LF>
func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence { func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
opt := MakeOptions(options) opt := MakeOptions(options)
buff := make([]byte, 1024*1024*1024*256) buff := make([]byte, 1024*1024*1024*256)
entry_channel := ReadSeqFileChunk(reader, buff, _EndOfLastEntry) entry_channel := ReadSeqFileChunk(
opt.Source(),
reader,
buff,
EndOfLastFlatFileEntry,
)
newIter := obiiter.MakeIBioSequence() newIter := obiiter.MakeIBioSequence()
nworkers := opt.ParallelWorkers() nworkers := opt.ParallelWorkers()
@ -179,10 +203,11 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
// for j := 0; j < opt.ParallelWorkers(); j++ { // for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ { for j := 0; j < nworkers; j++ {
newIter.Add(1) newIter.Add(1)
go _ParseEmblFile(opt.Source(), entry_channel, newIter, go _ParseEmblFile(
entry_channel,
newIter,
opt.WithFeatureTable(), opt.WithFeatureTable(),
opt.BatchSize(), )
opt.TotalSeqSize())
} }
go func() { go func() {
@ -193,7 +218,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
newIter = newIter.CompleteFileIterator() newIter = newIter.CompleteFileIterator()
} }
return newIter return newIter, nil
} }
func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
@ -214,5 +239,5 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
return obiiter.NilIBioSequence, err return obiiter.NilIBioSequence, err
} }
return ReadEMBL(reader, options...), nil return ReadEMBL(reader, options...)
} }

View File

@ -14,7 +14,7 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
func _EndOfLastFastaEntry(buffer []byte) int { func EndOfLastFastaEntry(buffer []byte) int {
var i int var i int
imax := len(buffer) imax := len(buffer)
@ -39,24 +39,18 @@ func _EndOfLastFastaEntry(buffer []byte) int {
return last return last
} }
func _ParseFastaFile(source string, func FastaChunkParser() func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
input ChannelSeqFileChunk,
out obiiter.IBioSequence,
no_order bool,
batch_size int,
chunck_order func() int,
) {
var identifier string parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
var definition string var identifier string
var definition string
idBytes := bytes.Buffer{} idBytes := bytes.Buffer{}
defBytes := bytes.Buffer{} defBytes := bytes.Buffer{}
seqBytes := bytes.Buffer{} seqBytes := bytes.Buffer{}
for chunks := range input {
state := 0 state := 0
scanner := bufio.NewReader(chunks.raw) scanner := bufio.NewReader(input)
start, _ := scanner.Peek(20) start, _ := scanner.Peek(20)
if start[0] != '>' { if start[0] != '>' {
log.Fatalf("%s : first character is not '>'", string(start)) log.Fatalf("%s : first character is not '>'", string(start))
@ -64,7 +58,8 @@ func _ParseFastaFile(source string,
if start[1] == ' ' { if start[1] == ' ' {
log.Fatalf("%s :Strange", string(start)) log.Fatalf("%s :Strange", string(start))
} }
sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
previous := byte(0) previous := byte(0)
@ -160,12 +155,6 @@ func _ParseFastaFile(source string,
s := obiseq.NewBioSequence(identifier, rawseq, definition) s := obiseq.NewBioSequence(identifier, rawseq, definition)
s.SetSource(source) s.SetSource(source)
sequences = append(sequences, s) sequences = append(sequences, s)
if no_order {
if len(sequences) == batch_size {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
}
}
state = 1 state = 1
} else { } else {
// Error // Error
@ -209,13 +198,28 @@ func _ParseFastaFile(source string,
sequences = append(sequences, s) sequences = append(sequences, s)
} }
if len(sequences) > 0 { return sequences, nil
co := chunks.order }
if no_order {
co = chunck_order() return parser
} }
out.Push(obiiter.MakeBioSequenceBatch(co, sequences))
func _ParseFastaFile(
input ChannelSeqFileChunk,
out obiiter.IBioSequence,
) {
parser := FastaChunkParser()
for chunks := range input {
sequences, err := parser(chunks.Source, chunks.Raw)
if err != nil {
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
} }
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
} }
out.Done() out.Done()
@ -230,17 +234,16 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
buff := make([]byte, 1024*1024*1024) buff := make([]byte, 1024*1024*1024)
chkchan := ReadSeqFileChunk(reader, buff, _EndOfLastFastaEntry) chkchan := ReadSeqFileChunk(
chunck_order := obiutils.AtomicCounter() opt.Source(),
reader,
buff,
EndOfLastFastaEntry,
)
for i := 0; i < nworker; i++ { for i := 0; i < nworker; i++ {
out.Add(1) out.Add(1)
go _ParseFastaFile(opt.Source(), go _ParseFastaFile(chkchan, out)
chkchan,
out,
opt.NoOrder(),
opt.BatchSize(),
chunck_order)
} }
go func() { go func() {
@ -282,7 +285,7 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
} }
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) options = append(options, OptionsSource("stdin"))
input, err := Buf(os.Stdin) input, err := Buf(os.Stdin)
if err == ErrNoContent { if err == ErrNoContent {

View File

@ -14,7 +14,7 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
func _EndOfLastFastqEntry(buffer []byte) int { func EndOfLastFastqEntry(buffer []byte) int {
var i int var i int
imax := len(buffer) imax := len(buffer)
@ -117,27 +117,20 @@ func _storeSequenceQuality(bytes *bytes.Buffer, out *obiseq.BioSequence, quality
out.SetQualities(q) out.SetQualities(q)
} }
func _ParseFastqFile(source string, func FastqChunkParser(quality_shift byte) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
input ChannelSeqFileChunk, parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
out obiiter.IBioSequence,
quality_shift byte,
no_order bool,
batch_size int,
chunck_order func() int,
) {
var identifier string var identifier string
var definition string var definition string
idBytes := bytes.Buffer{} idBytes := bytes.Buffer{}
defBytes := bytes.Buffer{} defBytes := bytes.Buffer{}
qualBytes := bytes.Buffer{} qualBytes := bytes.Buffer{}
seqBytes := bytes.Buffer{} seqBytes := bytes.Buffer{}
for chunks := range input {
state := 0 state := 0
scanner := bufio.NewReader(chunks.raw) scanner := bufio.NewReader(input)
sequences := make(obiseq.BioSequenceSlice, 0, 100) sequences := obiseq.MakeBioSequenceSlice(100)[:0]
previous := byte(0) previous := byte(0)
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() { for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
@ -257,14 +250,6 @@ func _ParseFastqFile(source string,
case 10: case 10:
if is_end_of_line { if is_end_of_line {
_storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift) _storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
if no_order {
if len(sequences) == batch_size {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
}
}
state = 11 state = 11
} else { } else {
qualBytes.WriteByte(C) qualBytes.WriteByte(C)
@ -288,14 +273,31 @@ func _ParseFastqFile(source string,
_storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift) _storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
state = 1 state = 1
} }
co := chunks.order
if no_order {
co = chunck_order()
}
out.Push(obiiter.MakeBioSequenceBatch(co, sequences))
} }
return sequences, nil
}
return parser
}
func _ParseFastqFile(
input ChannelSeqFileChunk,
out obiiter.IBioSequence,
quality_shift byte,
) {
parser := FastqChunkParser(quality_shift)
for chunks := range input {
sequences, err := parser(chunks.Source, chunks.Raw)
if err != nil {
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
}
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
} }
out.Done() out.Done()
@ -307,21 +309,23 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
out := obiiter.MakeIBioSequence() out := obiiter.MakeIBioSequence()
nworker := opt.ParallelWorkers() nworker := opt.ParallelWorkers()
chunkorder := obiutils.AtomicCounter()
buff := make([]byte, 1024*1024*1024) buff := make([]byte, 1024*1024*1024)
chkchan := ReadSeqFileChunk(reader, buff, _EndOfLastFastqEntry) chkchan := ReadSeqFileChunk(
opt.Source(),
reader,
buff,
EndOfLastFastqEntry,
)
for i := 0; i < nworker; i++ { for i := 0; i < nworker; i++ {
out.Add(1) out.Add(1)
go _ParseFastqFile(opt.Source(), go _ParseFastqFile(
chkchan, chkchan,
out, out,
byte(obioptions.InputQualityShift()), byte(obioptions.InputQualityShift()),
opt.NoOrder(), )
opt.BatchSize(),
chunkorder)
} }
go func() { go func() {

View File

@ -69,7 +69,7 @@ func _FastseqReader(source string,
slice = append(slice, rep) slice = append(slice, rep)
ii++ ii++
if ii >= batch_size { if ii >= batch_size {
iterator.Push(obiiter.MakeBioSequenceBatch(i, slice)) iterator.Push(obiiter.MakeBioSequenceBatch(source, i, slice))
slice = obiseq.MakeBioSequenceSlice() slice = obiseq.MakeBioSequenceSlice()
i++ i++
ii = 0 ii = 0
@ -77,7 +77,7 @@ func _FastseqReader(source string,
} }
if len(slice) > 0 { if len(slice) > 0 {
iterator.Push(obiiter.MakeBioSequenceBatch(i, slice)) iterator.Push(obiiter.MakeBioSequenceBatch(source, i, slice))
} }
iterator.Done() iterator.Done()

View File

@ -7,8 +7,6 @@ import (
"io" "io"
"os" "os"
"strings" "strings"
"sync"
"time"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
@ -76,7 +74,7 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
// - skipEmpty: a boolean indicating whether empty sequences should be skipped or not. // - skipEmpty: a boolean indicating whether empty sequences should be skipped or not.
// //
// It returns a byte array containing the formatted sequences. // It returns a byte array containing the formatted sequences.
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) []byte { func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
// Create a buffer to store the formatted sequences // Create a buffer to store the formatted sequences
var bs bytes.Buffer var bs bytes.Buffer
@ -116,7 +114,7 @@ func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, ski
} }
// Return the byte array representation of the buffer // Return the byte array representation of the buffer
return bs.Bytes() return &bs
} }
// WriteFasta writes a given iterator of bio sequences to a file in FASTA format. // WriteFasta writes a given iterator of bio sequences to a file in FASTA format.
@ -135,21 +133,16 @@ func WriteFasta(iterator obiiter.IBioSequence,
nwriters := opt.ParallelWorkers() nwriters := opt.ParallelWorkers()
obiiter.RegisterAPipe() chunkchan := WriteSeqFileChunk(file, opt.CloseFile())
chunkchan := make(chan FileChunck)
header_format := opt.FormatFastSeqHeader() header_format := opt.FormatFastSeqHeader()
newIter.Add(nwriters) newIter.Add(nwriters)
var waitWriter sync.WaitGroup
go func() { go func() {
newIter.WaitAndClose() newIter.WaitAndClose()
for len(chunkchan) > 0 {
time.Sleep(time.Millisecond)
}
close(chunkchan) close(chunkchan)
waitWriter.Wait() log.Warnf("Writing fasta file done")
}() }()
ff := func(iterator obiiter.IBioSequence) { ff := func(iterator obiiter.IBioSequence) {
@ -159,10 +152,12 @@ func WriteFasta(iterator obiiter.IBioSequence,
log.Debugf("Formating fasta chunk %d", batch.Order()) log.Debugf("Formating fasta chunk %d", batch.Order())
chunkchan <- FileChunck{ chunkchan <- SeqFileChunk{
FormatFastaBatch(batch, header_format, opt.SkipEmptySequence()), Source: batch.Source(),
batch.Order(), Raw: FormatFastaBatch(batch, header_format, opt.SkipEmptySequence()),
Order: batch.Order(),
} }
log.Debugf("Fasta chunk %d formated", batch.Order()) log.Debugf("Fasta chunk %d formated", batch.Order())
newIter.Push(batch) newIter.Push(batch)
@ -176,39 +171,6 @@ func WriteFasta(iterator obiiter.IBioSequence,
go ff(iterator.Split()) go ff(iterator.Split())
} }
next_to_send := 0
received := make(map[int]FileChunck, 100)
waitWriter.Add(1)
go func() {
for chunk := range chunkchan {
if chunk.order == next_to_send {
file.Write(chunk.text)
log.Debugf("Fasta chunk %d written", chunk.order)
next_to_send++
chunk, ok := received[next_to_send]
for ok {
file.Write(chunk.text)
log.Debugf("Fasta chunk %d written", chunk.order)
delete(received, next_to_send)
next_to_send++
chunk, ok = received[next_to_send]
}
} else {
log.Debugf("Store Fasta chunk %d", chunk.order)
received[chunk.order] = chunk
}
}
file.Close()
log.Debugln("End of the fasta file writing")
obiiter.UnregisterPipe()
waitWriter.Done()
}()
return newIter, nil return newIter, nil
} }

View File

@ -14,6 +14,8 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
type FormatSeqBatch func(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer
func _formatFastq(buff *bytes.Buffer, seq *obiseq.BioSequence, formater FormatHeader) { func _formatFastq(buff *bytes.Buffer, seq *obiseq.BioSequence, formater FormatHeader) {
info := "" info := ""
@ -49,7 +51,7 @@ func FormatFastq(seq *obiseq.BioSequence, formater FormatHeader) string {
} }
func FormatFastqBatch(batch obiiter.BioSequenceBatch, func FormatFastqBatch(batch obiiter.BioSequenceBatch,
formater FormatHeader, skipEmpty bool) []byte { formater FormatHeader, skipEmpty bool) *bytes.Buffer {
var bs bytes.Buffer var bs bytes.Buffer
lt := 0 lt := 0
@ -82,12 +84,10 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch,
} }
chunk := bs.Bytes() return &bs
return chunk
} }
type FileChunck struct { type FileChunk struct {
text []byte text []byte
order int order int
} }
@ -105,8 +105,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
nwriters := opt.ParallelWorkers() nwriters := opt.ParallelWorkers()
obiiter.RegisterAPipe() chunkchan := WriteSeqFileChunk(file, opt.CloseFile())
chunkchan := make(chan FileChunck)
header_format := opt.FormatFastSeqHeader() header_format := opt.FormatFastSeqHeader()
@ -126,9 +125,10 @@ func WriteFastq(iterator obiiter.IBioSequence,
ff := func(iterator obiiter.IBioSequence) { ff := func(iterator obiiter.IBioSequence) {
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
chunk := FileChunck{ chunk := SeqFileChunk{
FormatFastqBatch(batch, header_format, opt.SkipEmptySequence()), Source: batch.Source(),
batch.Order(), Raw: FormatFastqBatch(batch, header_format, opt.SkipEmptySequence()),
Order: batch.Order(),
} }
chunkchan <- chunk chunkchan <- chunk
newIter.Push(batch) newIter.Push(batch)
@ -142,44 +142,6 @@ func WriteFastq(iterator obiiter.IBioSequence,
go ff(iterator.Split()) go ff(iterator.Split())
} }
next_to_send := 0
received := make(map[int]FileChunck, 100)
waitWriter.Add(1)
go func() {
for chunk := range chunkchan {
if chunk.order == next_to_send {
if chunk.text[0] != '@' {
log.Panicln("WriteFastq: FASTQ format error")
}
file.Write(chunk.text)
next_to_send++
chunk, ok := received[next_to_send]
for ok {
if chunk.text[0] != '@' {
log.Panicln("WriteFastq: FASTQ format error")
}
file.Write(chunk.text)
delete(received, next_to_send)
next_to_send++
chunk, ok = received[next_to_send]
}
} else {
if _, ok := received[chunk.order]; ok {
log.Panicln("WriteFastq: Two chunks with the same number")
}
received[chunk.order] = chunk
}
}
file.Close()
log.Debugln("End of the fastq file writing")
obiiter.UnregisterPipe()
waitWriter.Done()
}()
return newIter, nil return newIter, nil
} }

View File

@ -29,27 +29,11 @@ const (
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp") var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
func _ParseGenbankFile(source string, func GenbankChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
input ChannelSeqFileChunk, return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
out obiiter.IBioSequence, state := inHeader
chunck_order func() int, scanner := bufio.NewReader(input)
withFeatureTable bool, sequences := obiseq.MakeBioSequenceSlice(100)[:0]
batch_size int,
total_seq_size int) {
state := inHeader
previous_chunk := -1
for chunks := range input {
if state != inHeader {
log.Fatalf("Unexpected state %d starting new chunk (id = %d, previous_chunk = %d)",
state, chunks.order, previous_chunk)
}
previous_chunk = chunks.order
scanner := bufio.NewReader(chunks.raw)
sequences := make(obiseq.BioSequenceSlice, 0, 100)
sumlength := 0
id := "" id := ""
lseq := -1 lseq := -1
scientificName := "" scientificName := ""
@ -64,7 +48,7 @@ func _ParseGenbankFile(source string,
nl++ nl++
line = string(bline) line = string(bline)
if is_prefix || len(line) > 100 { if is_prefix || len(line) > 100 {
log.Fatalf("Chunk %d : Line too long: %s", chunks.order, line) log.Fatalf("From %s:Line too long: %s", source, line)
} }
processed := false processed := false
for !processed { for !processed {
@ -165,15 +149,6 @@ func _ParseGenbankFile(source string,
// sequence.Len(), seqBytes.Len()) // sequence.Len(), seqBytes.Len())
sequences = append(sequences, sequence) sequences = append(sequences, sequence)
sumlength += sequence.Len()
if len(sequences) == batch_size || sumlength > total_seq_size {
oo := chunck_order()
log.Debugln("Pushing sequence batch ", oo, " with ", len(sequences), " sequences")
out.Push(obiiter.MakeBioSequenceBatch(oo, sequences))
sequences = make(obiseq.BioSequenceSlice, 0, 100)
sumlength = 0
}
defBytes = bytes.NewBuffer(obiseq.GetSlice(200)) defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
featBytes = new(bytes.Buffer) featBytes = new(bytes.Buffer)
@ -219,11 +194,24 @@ func _ParseGenbankFile(source string,
} }
if len(sequences) > 0 { return sequences, nil
oo := chunck_order() }
log.Debugln("Pushing sequence batch ", oo, " with ", len(sequences), " sequences") }
out.Push(obiiter.MakeBioSequenceBatch(oo, sequences))
func _ParseGenbankFile(input ChannelSeqFileChunk,
out obiiter.IBioSequence,
withFeatureTable bool) {
parser := GenbankChunkParser(withFeatureTable)
for chunks := range input {
sequences, err := parser(chunks.Source, chunks.Raw)
if err != nil {
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
} }
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
} }
log.Debug("End of the Genbank thread") log.Debug("End of the Genbank thread")
@ -231,26 +219,31 @@ func _ParseGenbankFile(source string,
} }
func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence { func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
opt := MakeOptions(options) opt := MakeOptions(options)
// entry_channel := make(chan _FileChunk) // entry_channel := make(chan _FileChunk)
buff := make([]byte, 1024*1024*1024*256) buff := make([]byte, 1024*1024*1024*256)
entry_channel := ReadSeqFileChunk(reader, buff, _EndOfLastEntry) entry_channel := ReadSeqFileChunk(
opt.Source(),
reader,
buff,
EndOfLastFlatFileEntry,
)
newIter := obiiter.MakeIBioSequence() newIter := obiiter.MakeIBioSequence()
nworkers := opt.ParallelWorkers() nworkers := opt.ParallelWorkers()
chunck_order := obiutils.AtomicCounter()
// for j := 0; j < opt.ParallelWorkers(); j++ { // for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ { for j := 0; j < nworkers; j++ {
newIter.Add(1) newIter.Add(1)
go _ParseGenbankFile(opt.Source(), go _ParseGenbankFile(
entry_channel, newIter, chunck_order, entry_channel,
newIter,
opt.WithFeatureTable(), opt.WithFeatureTable(),
opt.BatchSize(), )
opt.TotalSeqSize())
} }
// go _ReadFlatFileChunk(reader, entry_channel) // go _ReadFlatFileChunk(reader, entry_channel)
@ -264,7 +257,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
newIter = newIter.CompleteFileIterator() newIter = newIter.CompleteFileIterator()
} }
return newIter return newIter, nil
} }
func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
@ -285,5 +278,5 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
return obiiter.NilIBioSequence, err return obiiter.NilIBioSequence, err
} }
return ReadGenbank(reader, options...), nil return ReadGenbank(reader, options...)
} }

View File

@ -3,7 +3,6 @@ package obiformats
import ( import (
"bufio" "bufio"
"bytes" "bytes"
"github.com/goccy/go-json"
"io" "io"
"os" "os"
"strconv" "strconv"
@ -11,6 +10,8 @@ import (
"sync" "sync"
"time" "time"
"github.com/goccy/go-json"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
@ -87,7 +88,7 @@ func WriteJSON(iterator obiiter.IBioSequence,
nwriters := opt.ParallelWorkers() nwriters := opt.ParallelWorkers()
obiiter.RegisterAPipe() obiiter.RegisterAPipe()
chunkchan := make(chan FileChunck) chunkchan := make(chan FileChunk)
newIter.Add(nwriters) newIter.Add(nwriters)
var waitWriter sync.WaitGroup var waitWriter sync.WaitGroup
@ -106,7 +107,7 @@ func WriteJSON(iterator obiiter.IBioSequence,
batch := iterator.Get() batch := iterator.Get()
chunkchan <- FileChunck{ chunkchan <- FileChunk{
FormatJSONBatch(batch), FormatJSONBatch(batch),
batch.Order(), batch.Order(),
} }
@ -116,7 +117,7 @@ func WriteJSON(iterator obiiter.IBioSequence,
} }
next_to_send := 0 next_to_send := 0
received := make(map[int]FileChunck, 100) received := make(map[int]FileChunk, 100)
waitWriter.Add(1) waitWriter.Add(1)
go func() { go func() {

View File

@ -7,7 +7,8 @@ import (
type __options__ struct { type __options__ struct {
fastseq_header_parser obiseq.SeqAnnotator fastseq_header_parser obiseq.SeqAnnotator
fastseq_header_writer func(*obiseq.BioSequence) string fastseq_header_writer BioSequenceFormater
seqBatchFormater FormatSeqBatch
with_progress_bar bool with_progress_bar bool
buffer_size int buffer_size int
batch_size int batch_size int
@ -44,6 +45,7 @@ func MakeOptions(setters []WithOption) Options {
o := __options__{ o := __options__{
fastseq_header_parser: ParseGuessedFastSeqHeader, fastseq_header_parser: ParseGuessedFastSeqHeader,
fastseq_header_writer: FormatFastSeqJsonHeader, fastseq_header_writer: FormatFastSeqJsonHeader,
seqBatchFormater: nil,
with_progress_bar: false, with_progress_bar: false,
buffer_size: 2, buffer_size: 2,
parallel_workers: obioptions.CLIReadParallelWorkers(), parallel_workers: obioptions.CLIReadParallelWorkers(),
@ -103,6 +105,10 @@ func (opt Options) FormatFastSeqHeader() func(*obiseq.BioSequence) string {
return opt.pointer.fastseq_header_writer return opt.pointer.fastseq_header_writer
} }
func (opt Options) SequenceFormater() FormatSeqBatch {
return opt.pointer.seqBatchFormater
}
func (opt Options) NoOrder() bool { func (opt Options) NoOrder() bool {
return opt.pointer.no_order return opt.pointer.no_order
} }
@ -219,8 +225,6 @@ func OptionNoOrder(no_order bool) WithOption {
return f return f
} }
func OptionsCompressed(compressed bool) WithOption { func OptionsCompressed(compressed bool) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
opt.pointer.compressed = compressed opt.pointer.compressed = compressed
@ -271,6 +275,14 @@ func OptionsFastSeqHeaderFormat(format func(*obiseq.BioSequence) string) WithOpt
return f return f
} }
func OptionsSequenceFormater(formater FormatSeqBatch) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.seqBatchFormater = formater
})
return f
}
func OptionsParallelWorkers(nworkers int) WithOption { func OptionsParallelWorkers(nworkers int) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
opt.pointer.parallel_workers = nworkers opt.pointer.parallel_workers = nworkers

View File

@ -5,14 +5,18 @@ import (
"io" "io"
"slices" "slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
var _FileChunkSize = 1 << 28 var _FileChunkSize = 1024 * 1024 * 10
type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
type SeqFileChunk struct { type SeqFileChunk struct {
raw io.Reader Source string
order int Raw *bytes.Buffer
Order int
} }
type ChannelSeqFileChunk chan SeqFileChunk type ChannelSeqFileChunk chan SeqFileChunk
@ -32,7 +36,9 @@ type LastSeqRecord func([]byte) int
// //
// Returns: // Returns:
// None // None
func ReadSeqFileChunk(reader io.Reader, func ReadSeqFileChunk(
source string,
reader io.Reader,
buff []byte, buff []byte,
splitter LastSeqRecord) ChannelSeqFileChunk { splitter LastSeqRecord) ChannelSeqFileChunk {
var err error var err error
@ -88,7 +94,7 @@ func ReadSeqFileChunk(reader io.Reader,
if len(buff) > 0 { if len(buff) > 0 {
io := bytes.NewBuffer(slices.Clone(buff)) io := bytes.NewBuffer(slices.Clone(buff))
chunk_channel <- SeqFileChunk{io, i} chunk_channel <- SeqFileChunk{source, io, i}
i++ i++
} }
@ -96,7 +102,7 @@ func ReadSeqFileChunk(reader io.Reader,
buff = fullbuff[0:lremain] buff = fullbuff[0:lremain]
lcp := copy(buff, fullbuff[pnext:]) lcp := copy(buff, fullbuff[pnext:])
if lcp < lremain { if lcp < lremain {
log.Fatalf("Error copying remaining data of chunck %d : %d < %d", i, lcp, lremain) log.Fatalf("Error copying remaining data of chunk %d : %d < %d", i, lcp, lremain)
} }
} else { } else {
buff = buff[:0] buff = buff[:0]
@ -112,7 +118,7 @@ func ReadSeqFileChunk(reader io.Reader,
// Send the last chunk to the channel // Send the last chunk to the channel
if len(buff) > 0 { if len(buff) > 0 {
io := bytes.NewBuffer(slices.Clone(buff)) io := bytes.NewBuffer(slices.Clone(buff))
chunk_channel <- SeqFileChunk{io, i} chunk_channel <- SeqFileChunk{source, io, i}
} }
// Close the readers channel when the end of the file is reached // Close the readers channel when the end of the file is reached

View File

@ -0,0 +1,51 @@
package obiformats
import (
"io"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
log "github.com/sirupsen/logrus"
)
// WriteSeqFileChunk returns a channel on which SeqFileChunk values can be
// pushed; a background goroutine writes the raw bytes of each chunk to the
// provided writer, strictly ordered by the chunk's Order field.
//
// Chunks arriving out of order are buffered until every predecessor has been
// written. When the channel is closed by the producer, the writer is closed
// too if toBeClosed is true.
//
// Parameters:
//   - writer: destination of the chunk bytes.
//   - toBeClosed: when true, writer.Close() is called once all chunks are written.
//
// Returns:
//   - ChannelSeqFileChunk: the channel on which chunks must be pushed.
func WriteSeqFileChunk(
	writer io.WriteCloser,
	toBeClosed bool) ChannelSeqFileChunk {
	obiiter.RegisterAPipe()
	chunk_channel := make(ChannelSeqFileChunk)

	go func() {
		nextToPrint := 0
		toBePrinted := make(map[int]SeqFileChunk)
		for chunk := range chunk_channel {
			if chunk.Order == nextToPrint {
				if _, err := writer.Write(chunk.Raw.Bytes()); err != nil {
					log.Fatalf("Cannot write chunk %d : %v", chunk.Order, err)
				}
				nextToPrint++

				// Flush every consecutive chunk already buffered.
				chunk, ok := toBePrinted[nextToPrint]
				for ok {
					if _, err := writer.Write(chunk.Raw.Bytes()); err != nil {
						log.Fatalf("Cannot write chunk %d : %v", chunk.Order, err)
					}
					delete(toBePrinted, nextToPrint)
					nextToPrint++
					chunk, ok = toBePrinted[nextToPrint]
				}
			} else {
				// Out-of-order chunk: keep it until its predecessors arrive.
				toBePrinted[chunk.Order] = chunk
			}
		}

		// Any chunk still buffered here means a hole in the order sequence:
		// its predecessors never arrived, so it could not be written.
		if len(toBePrinted) > 0 {
			log.Errorf("%d chunks could not be written because of missing predecessors", len(toBePrinted))
		}

		if toBeClosed {
			if err := writer.Close(); err != nil {
				log.Fatalf("Cannot close the writer : %v", err)
			}
		}

		obiiter.UnregisterPipe()
		log.Warn("The writer has been closed")
	}()

	return chunk_channel
}

View File

@ -15,6 +15,8 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error)
// OBIMimeTypeGuesser is a function that takes an io.Reader as input and guesses the MIME type of the data. // OBIMimeTypeGuesser is a function that takes an io.Reader as input and guesses the MIME type of the data.
// It uses several detectors to identify specific file formats, such as FASTA, FASTQ, ecoPCR2, GenBank, and EMBL. // It uses several detectors to identify specific file formats, such as FASTA, FASTQ, ecoPCR2, GenBank, and EMBL.
// The function reads data from the input stream and analyzes it using the mimetype library. // The function reads data from the input stream and analyzes it using the mimetype library.
@ -172,11 +174,11 @@ func ReadSequencesFromFile(filename string,
case "text/fasta": case "text/fasta":
return ReadFasta(reader, options...) return ReadFasta(reader, options...)
case "text/ecopcr2": case "text/ecopcr2":
return ReadEcoPCR(reader, options...), nil return ReadEcoPCR(reader, options...)
case "text/embl": case "text/embl":
return ReadEMBL(reader, options...), nil return ReadEMBL(reader, options...)
case "text/genbank": case "text/genbank":
return ReadGenbank(reader, options...), nil return ReadGenbank(reader, options...)
case "text/csv": case "text/csv":
return ReadCSV(reader, options...) return ReadCSV(reader, options...)
default: default:

View File

@ -3,50 +3,118 @@ package obiiter
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
type BioSequenceBatch struct { type BioSequenceBatch struct {
slice obiseq.BioSequenceSlice source string
order int slice obiseq.BioSequenceSlice
order int
} }
var NilBioSequenceBatch = BioSequenceBatch{nil, -1} var NilBioSequenceBatch = BioSequenceBatch{"", nil, -1}
func MakeBioSequenceBatch(order int, // MakeBioSequenceBatch creates a new BioSequenceBatch with the given source, order, and sequences.
//
// Parameters:
// - source: The source of the BioSequenceBatch.
// - order: The order of the BioSequenceBatch.
// - sequences: The slice of BioSequence.
//
// Returns:
// - BioSequenceBatch: The newly created BioSequenceBatch.
func MakeBioSequenceBatch(
source string,
order int,
sequences obiseq.BioSequenceSlice) BioSequenceBatch { sequences obiseq.BioSequenceSlice) BioSequenceBatch {
return BioSequenceBatch{ return BioSequenceBatch{
slice: sequences, source: source,
order: order, slice: sequences,
order: order,
} }
} }
// Order returns the order of the BioSequenceBatch.
//
// Returns:
// - int: The order of the BioSequenceBatch.
func (batch BioSequenceBatch) Order() int { func (batch BioSequenceBatch) Order() int {
return batch.order return batch.order
} }
// Source returns the source of the BioSequenceBatch.
//
// Returns:
// - string: The source of the BioSequenceBatch.
func (batch BioSequenceBatch) Source() string {
return batch.source
}
// Reorder updates the order of the BioSequenceBatch and returns the updated batch.
//
// Parameters:
// - newOrder: The new order value to assign to the BioSequenceBatch.
//
// Returns:
// - BioSequenceBatch: The updated BioSequenceBatch with the new order value.
func (batch BioSequenceBatch) Reorder(newOrder int) BioSequenceBatch { func (batch BioSequenceBatch) Reorder(newOrder int) BioSequenceBatch {
batch.order = newOrder batch.order = newOrder
return batch return batch
} }
// Slice returns the BioSequenceSlice contained within the BioSequenceBatch.
//
// Returns:
// - obiseq.BioSequenceSlice: The BioSequenceSlice contained within the BioSequenceBatch.
func (batch BioSequenceBatch) Slice() obiseq.BioSequenceSlice { func (batch BioSequenceBatch) Slice() obiseq.BioSequenceSlice {
return batch.slice return batch.slice
} }
// Len returns the number of BioSequence elements in the given BioSequenceBatch.
//
// Parameters:
// - batch: The BioSequenceBatch to get the length from.
//
// Return type:
// - int: The number of BioSequence elements in the BioSequenceBatch.
func (batch BioSequenceBatch) Len() int { func (batch BioSequenceBatch) Len() int {
return len(batch.slice) return len(batch.slice)
} }
// NotEmpty returns whether the BioSequenceBatch is empty or not.
//
// It checks if the BioSequenceSlice contained within the BioSequenceBatch is not empty.
//
// Returns:
// - bool: True if the BioSequenceBatch is not empty, false otherwise.
func (batch BioSequenceBatch) NotEmpty() bool { func (batch BioSequenceBatch) NotEmpty() bool {
return batch.slice.NotEmpty() return batch.slice.NotEmpty()
} }
// Pop0 returns and removes the first element of the BioSequenceBatch.
//
// It does not take any parameters.
// It returns a pointer to a BioSequence object.
func (batch BioSequenceBatch) Pop0() *obiseq.BioSequence { func (batch BioSequenceBatch) Pop0() *obiseq.BioSequence {
return batch.slice.Pop0() return batch.slice.Pop0()
} }
// IsNil checks if the BioSequenceBatch's slice is nil.
//
// This function takes a BioSequenceBatch as a parameter and returns a boolean value indicating whether the slice of the BioSequenceBatch is nil or not.
//
// Parameters:
// - batch: The BioSequenceBatch to check for nil slice.
//
// Returns:
// - bool: True if the BioSequenceBatch's slice is nil, false otherwise.
func (batch BioSequenceBatch) IsNil() bool { func (batch BioSequenceBatch) IsNil() bool {
return batch.slice == nil return batch.slice == nil
} }
// Recycle cleans up the BioSequenceBatch by recycling its elements and resetting its slice.
//
// If including_seq is true, each element of the BioSequenceBatch's slice is recycled using the Recycle method,
// and then set to nil. If including_seq is false, each element is simply set to nil.
//
// This function does not return anything.
func (batch BioSequenceBatch) Recycle(including_seq bool) { func (batch BioSequenceBatch) Recycle(including_seq bool) {
batch.slice.Recycle(including_seq) batch.slice.Recycle(including_seq)
batch.slice = nil batch.slice = nil

View File

@ -424,9 +424,11 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
order := 0 order := 0
iterator = iterator.SortBatches() iterator = iterator.SortBatches()
buffer := obiseq.MakeBioSequenceSlice() buffer := obiseq.MakeBioSequenceSlice()
source := ""
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
source = seqs.Source()
lc := seqs.Len() lc := seqs.Len()
remains := lc remains := lc
i := 0 i := 0
@ -436,7 +438,7 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
remains = lc - to_push - i remains = lc - to_push - i
buffer = append(buffer, seqs.Slice()[i:(i+to_push)]...) buffer = append(buffer, seqs.Slice()[i:(i+to_push)]...)
if len(buffer) == size { if len(buffer) == size {
newIter.Push(MakeBioSequenceBatch(order, buffer)) newIter.Push(MakeBioSequenceBatch(source, order, buffer))
log.Debugf("Rebatch #%d pushd", order) log.Debugf("Rebatch #%d pushd", order)
order++ order++
buffer = obiseq.MakeBioSequenceSlice() buffer = obiseq.MakeBioSequenceSlice()
@ -447,7 +449,7 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
} }
log.Debug("End of the rebatch loop") log.Debug("End of the rebatch loop")
if len(buffer) > 0 { if len(buffer) > 0 {
newIter.Push(MakeBioSequenceBatch(order, buffer)) newIter.Push(MakeBioSequenceBatch(source, order, buffer))
log.Debugf("Final Rebatch #%d pushd", order) log.Debugf("Final Rebatch #%d pushd", order)
} }
@ -526,12 +528,14 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
trueOrder := 0 trueOrder := 0
falseOrder := 0 falseOrder := 0
iterator = iterator.SortBatches() iterator = iterator.SortBatches()
source := ""
trueSlice := obiseq.MakeBioSequenceSlice() trueSlice := obiseq.MakeBioSequenceSlice()
falseSlice := obiseq.MakeBioSequenceSlice() falseSlice := obiseq.MakeBioSequenceSlice()
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
source = seqs.Source()
for _, s := range seqs.slice { for _, s := range seqs.slice {
if predicate(s) { if predicate(s) {
trueSlice = append(trueSlice, s) trueSlice = append(trueSlice, s)
@ -540,13 +544,13 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
} }
if len(trueSlice) == size { if len(trueSlice) == size {
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice)) trueIter.Push(MakeBioSequenceBatch(source, trueOrder, trueSlice))
trueOrder++ trueOrder++
trueSlice = obiseq.MakeBioSequenceSlice() trueSlice = obiseq.MakeBioSequenceSlice()
} }
if len(falseSlice) == size { if len(falseSlice) == size {
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice)) falseIter.Push(MakeBioSequenceBatch(source, falseOrder, falseSlice))
falseOrder++ falseOrder++
falseSlice = obiseq.MakeBioSequenceSlice() falseSlice = obiseq.MakeBioSequenceSlice()
} }
@ -555,11 +559,11 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
} }
if len(trueSlice) > 0 { if len(trueSlice) > 0 {
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice)) trueIter.Push(MakeBioSequenceBatch(source, trueOrder, trueSlice))
} }
if len(falseSlice) > 0 { if len(falseSlice) > 0 {
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice)) falseIter.Push(MakeBioSequenceBatch(source, falseOrder, falseSlice))
} }
trueIter.Done() trueIter.Done()
@ -686,17 +690,22 @@ func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
// Load all sequences availables from an IBioSequenceBatch iterator into // Load all sequences availables from an IBioSequenceBatch iterator into
// a large obiseq.BioSequenceSlice. // a large obiseq.BioSequenceSlice.
func (iterator IBioSequence) Load() obiseq.BioSequenceSlice { func (iterator IBioSequence) Load() (string, obiseq.BioSequenceSlice) {
chunk := obiseq.MakeBioSequenceSlice()
source := ""
chunck := obiseq.MakeBioSequenceSlice()
for iterator.Next() { for iterator.Next() {
b := iterator.Get() b := iterator.Get()
if source == "" {
source = b.Source()
}
log.Debugf("append %d sequences", b.Len()) log.Debugf("append %d sequences", b.Len())
chunck = append(chunck, b.Slice()...) chunk = append(chunk, b.Slice()...)
b.Recycle(false) b.Recycle(false)
} }
return chunck return source, chunk
} }
// CompleteFileIterator generates a new iterator for reading a complete file. // CompleteFileIterator generates a new iterator for reading a complete file.
@ -718,10 +727,10 @@ func (iterator IBioSequence) CompleteFileIterator() IBioSequence {
}() }()
go func() { go func() {
slice := iterator.Load() source, slice := iterator.Load()
log.Printf("A batch of %d sequence is read", len(slice)) log.Printf("A batch of %d sequence is read", len(slice))
if len(slice) > 0 { if len(slice) > 0 {
newIter.Push(MakeBioSequenceBatch(0, slice)) newIter.Push(MakeBioSequenceBatch(source, 0, slice))
} }
newIter.Done() newIter.Done()
}() }()
@ -735,7 +744,7 @@ func (iterator IBioSequence) CompleteFileIterator() IBioSequence {
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of // It takes a slice of BioSequence objects, and returns an iterator that will return batches of
// BioSequence objects // BioSequence objects
func IBatchOver(data obiseq.BioSequenceSlice, func IBatchOver(source string, data obiseq.BioSequenceSlice,
size int, sizes ...int) IBioSequence { size int, sizes ...int) IBioSequence {
newIter := MakeIBioSequence() newIter := MakeIBioSequence()
@ -755,7 +764,7 @@ func IBatchOver(data obiseq.BioSequenceSlice,
if next > ldata { if next > ldata {
next = ldata next = ldata
} }
newIter.Push(MakeBioSequenceBatch(batchid, data[i:next])) newIter.Push(MakeBioSequenceBatch(source, batchid, data[i:next]))
batchid++ batchid++
} }

View File

@ -61,9 +61,12 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
go func() { go func() {
iterator = iterator.SortBatches() iterator = iterator.SortBatches()
source := ""
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
source = seqs.Source()
for _, s := range seqs.Slice() { for _, s := range seqs.Slice() {
key := class.Code(s) key := class.Code(s)
slice, ok := slices[key] slice, ok := slices[key]
@ -84,7 +87,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
*slice = append(*slice, s) *slice = append(*slice, s)
if len(*slice) == batchsize { if len(*slice) == batchsize {
outputs[key].Push(MakeBioSequenceBatch(orders[key], *slice)) outputs[key].Push(MakeBioSequenceBatch(source, orders[key], *slice))
orders[key]++ orders[key]++
s := obiseq.MakeBioSequenceSlice() s := obiseq.MakeBioSequenceSlice()
slices[key] = &s slices[key] = &s
@ -95,7 +98,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
for key, slice := range slices { for key, slice := range slices {
if len(*slice) > 0 { if len(*slice) > 0 {
outputs[key].Push(MakeBioSequenceBatch(orders[key], *slice)) outputs[key].Push(MakeBioSequenceBatch(source, orders[key], *slice))
} }
} }

View File

@ -20,9 +20,11 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
}() }()
f := func(iterator IBioSequence, id int) { f := func(iterator IBioSequence, id int) {
source := ""
for iterator.Next() { for iterator.Next() {
news := obiseq.MakeBioSequenceSlice() news := obiseq.MakeBioSequenceSlice()
sl := iterator.Get() sl := iterator.Get()
source = sl.Source()
for _, s := range sl.Slice() { for _, s := range sl.Slice() {
if s.Len() <= minsize { if s.Len() <= minsize {
@ -52,7 +54,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
s.Recycle() s.Recycle()
} }
} // End of the slice loop } // End of the slice loop
newiter.Push(MakeBioSequenceBatch(sl.Order(), news)) newiter.Push(MakeBioSequenceBatch(source, sl.Order(), news))
sl.Recycle(false) sl.Recycle(false)
} // End of the iterator loop } // End of the iterator loop

View File

@ -9,9 +9,11 @@ func (b BioSequenceBatch) IsPaired() bool {
} }
func (b BioSequenceBatch) PairedWith() BioSequenceBatch { func (b BioSequenceBatch) PairedWith() BioSequenceBatch {
return MakeBioSequenceBatch(b.order, return MakeBioSequenceBatch(
*b.slice.PairedWith()) b.Source(),
b.order,
*b.slice.PairedWith(),
)
} }
func (b *BioSequenceBatch) PairTo(p *BioSequenceBatch) { func (b *BioSequenceBatch) PairTo(p *BioSequenceBatch) {

View File

@ -225,7 +225,7 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
} }
} }
newIter.Push(obiiter.MakeBioSequenceBatch(seqs.Order(), ns)) newIter.Push(obiiter.MakeBioSequenceBatch(seqs.Source(), seqs.Order(), ns))
seqs.Recycle(false) seqs.Recycle(false)
} }

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This induces that the version // TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// commited // commited
var _Commit = "2247c3b" var _Commit = "bc1aaaf"
var _Version = "" var _Version = ""
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@ -57,7 +57,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
return samples return samples
} }
func annotateOBIClean(dataset obiseq.BioSequenceSlice, func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice,
sample map[string]*([]*seqPCR), sample map[string]*([]*seqPCR),
tag, NAValue string) obiiter.IBioSequence { tag, NAValue string) obiiter.IBioSequence {
batchsize := 1000 batchsize := 1000
@ -91,7 +91,7 @@ func annotateOBIClean(dataset obiseq.BioSequenceSlice,
return data, nil return data, nil
} }
iter := obiiter.IBatchOver(dataset, batchsize) iter := obiiter.IBatchOver(source, dataset, batchsize)
riter := iter.MakeISliceWorker(annot, false) riter := iter.MakeISliceWorker(annot, false)
return riter return riter
@ -288,7 +288,7 @@ func Weight(sequence *obiseq.BioSequence) map[string]int {
func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence { func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
db := itertator.Load() source, db := itertator.Load()
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db)) log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
@ -365,7 +365,7 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
EmpiricalDistCsv(RatioTableFilename(), all_ratio) EmpiricalDistCsv(RatioTableFilename(), all_ratio)
} }
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA") iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA")
if OnlyHead() { if OnlyHead() {
iter = iter.FilterOn(IsHead, 1000) iter = iter.FilterOn(IsHead, 1000)

View File

@ -274,11 +274,11 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
// obioptions.CLIParallelWorkers(), // obioptions.CLIParallelWorkers(),
// ) // )
references := annotated.Load() source, references := annotated.Load()
mannwithney := MakeSequenceFamilyGenusWorker(references) mannwithney := MakeSequenceFamilyGenusWorker(references)
partof := obiiter.IBatchOver(references, partof := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize()) obioptions.CLIBatchSize())
// genera_iterator, err := obichunk.ISequenceChunk( // genera_iterator, err := obichunk.ISequenceChunk(

View File

@ -46,7 +46,12 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
if err == nil { if err == nil {
defer fasta.Close() defer fasta.Close()
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false)) fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(
fmt.Sprintf("%s_consensus", consensus_id),
0,
seqs,
),
obiformats.FormatFastSeqJsonHeader, false).Bytes())
fasta.Close() fasta.Close()
} }
@ -333,7 +338,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
dirname := CLIGraphFilesDirectory() dirname := CLIGraphFilesDirectory()
newIter := obiiter.MakeIBioSequence() newIter := obiiter.MakeIBioSequence()
db := itertator.Load() source, db := itertator.Load()
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db)) log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
@ -394,7 +399,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
CLISampleAttribute(), CLISampleAttribute(),
CLIKmerSize()) CLIKmerSize())
newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised)) newIter.Push(obiiter.MakeBioSequenceBatch(source, sample_order, denoised))
sample_order++ sample_order++
} }

View File

@ -14,7 +14,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
) )
func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) { func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
var err error var err error
list_of_files := orderedset.NewOrderedSet() list_of_files := orderedset.NewOrderedSet()
for _, fn := range filenames { for _, fn := range filenames {
@ -39,7 +39,7 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
if info.IsDir() { if info.IsDir() {
if path != fn { if path != fn {
subdir, e := _ExpandListOfFiles(true, path) subdir, e := ExpandListOfFiles(true, path)
if e != nil { if e != nil {
return e return e
} }
@ -113,19 +113,26 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat()) log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
opts = append(opts, obiformats.OptionsSource("stdin")) opts = append(opts, obiformats.OptionsSource("stdin"))
var err error
switch CLIInputFormat() { switch CLIInputFormat() {
case "ecopcr": case "ecopcr":
iterator = obiformats.ReadEcoPCR(os.Stdin, opts...) iterator, err = obiformats.ReadEcoPCR(os.Stdin, opts...)
case "embl": case "embl":
iterator = obiformats.ReadEMBL(os.Stdin, opts...) iterator, err = obiformats.ReadEMBL(os.Stdin, opts...)
case "genbank": case "genbank":
iterator = obiformats.ReadGenbank(os.Stdin, opts...) iterator, err = obiformats.ReadGenbank(os.Stdin, opts...)
default: default:
iterator = obiformats.ReadFastSeqFromStdin(opts...) iterator = obiformats.ReadFastSeqFromStdin(opts...)
} }
if err != nil {
return obiiter.NilIBioSequence, err
}
} else { } else {
list_of_files, err := _ExpandListOfFiles(false, filenames...) list_of_files, err := ExpandListOfFiles(false, filenames...)
if err != nil { if err != nil {
return obiiter.NilIBioSequence, err return obiiter.NilIBioSequence, err
} }

View File

@ -129,7 +129,7 @@ func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err) log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err)
} }
data := data_iter.Load() _, data := data_iter.Load()
keys := CLIBy() keys := CLIBy()

View File

@ -103,7 +103,7 @@ func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int,
// which landmark it corresponds. // which landmark it corresponds.
func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence { func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
library := iterator.Load() source, library := iterator.Load()
library_size := len(library) library_size := len(library)
n_landmark := CLINCenter() n_landmark := CLINCenter()
@ -191,6 +191,6 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
} }
} }
return obiiter.IBatchOver(library, obioptions.CLIBatchSize()) return obiiter.IBatchOver(source, library, obioptions.CLIBatchSize())
} }

View File

@ -255,6 +255,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena, &shifts) delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena, &shifts)
} }
newIter.Push(obiiter.MakeBioSequenceBatch( newIter.Push(obiiter.MakeBioSequenceBatch(
batch.Source(),
batch.Order(), batch.Order(),
cons, cons,
)) ))

View File

@ -130,7 +130,7 @@ func MakeIndexingSliceWorker(indexslot, idslot string,
func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Infoln("Family level reference database indexing...") log.Infoln("Family level reference database indexing...")
log.Infoln("Loading database...") log.Infoln("Loading database...")
references := iterator.Load() source, references := iterator.Load()
nref := len(references) nref := len(references)
log.Infof("Done. Database contains %d sequences", nref) log.Infof("Done. Database contains %d sequences", nref)
@ -154,7 +154,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Info("done") log.Info("done")
partof := obiiter.IBatchOver(references, partof := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize()).MakeIWorker(taxonomy.MakeSetSpeciesWorker(), obioptions.CLIBatchSize()).MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
false, false,
obioptions.CLIParallelWorkers(), obioptions.CLIParallelWorkers(),
@ -243,7 +243,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
waiting.Wait() waiting.Wait()
results := obiiter.IBatchOver(references, results := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize()).Speed("Writing db", nref) obioptions.CLIBatchSize()).Speed("Writing db", nref)
return results return results

View File

@ -125,7 +125,7 @@ func IndexSequence(seqidx int,
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Infoln("Loading database...") log.Infoln("Loading database...")
references := iterator.Load() source, references := iterator.Load()
log.Infof("Done. Database contains %d sequences", len(references)) log.Infof("Done. Database contains %d sequences", len(references))
taxo, error := obifind.CLILoadSelectedTaxonomy() taxo, error := obifind.CLILoadSelectedTaxonomy()
@ -204,7 +204,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
sl = append(sl, iref) sl = append(sl, iref)
bar.Add(1) bar.Add(1)
} }
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl)) indexed.Push(obiiter.MakeBioSequenceBatch(source, l[0]/10, sl))
} }
indexed.Done() indexed.Done()

View File

@ -57,7 +57,9 @@ func CLIRefDB() obiseq.BioSequenceSlice {
log.Panicf("Cannot open the reference library file : %s\n", _RefDB) log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
} }
return refdb.Load() _, db := refdb.Load()
return db
} }
func CLIGeometricMode() bool { func CLIGeometricMode() bool {
@ -70,7 +72,7 @@ func CLIShouldISaveRefDB() bool {
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) { func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
if CLIShouldISaveRefDB() { if CLIShouldISaveRefDB() {
idb := obiiter.IBatchOver(db, 1000) idb := obiiter.IBatchOver("", db, 1000)
var newIter obiiter.IBioSequence var newIter obiiter.IBioSequence

View File

@ -57,7 +57,9 @@ func CLIRefDB() obiseq.BioSequenceSlice {
log.Panicf("Cannot open the reference library file : %s\n", _RefDB) log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
} }
return refdb.Load() _, db := refdb.Load()
return db
} }
func CLIGeometricMode() bool { func CLIGeometricMode() bool {
@ -70,7 +72,7 @@ func CLIShouldISaveRefDB() bool {
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) { func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
if CLIShouldISaveRefDB() { if CLIShouldISaveRefDB() {
idb := obiiter.IBatchOver(db, 1000) idb := obiiter.IBatchOver("", db, 1000)
var newIter obiiter.IBioSequence var newIter obiiter.IBioSequence