From 98b3bc2a8c1a4bbb55125510fb9d565c675d05e8 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 27 May 2024 10:17:17 +0200 Subject: [PATCH] Patch a bug in the reading of the last sequence of each chunk in the fasta reader Former-commit-id: eacf64112582befa4751f66352999a28abf349f7 --- pkg/obiformats/fastaseq_read.go | 6 ++++++ pkg/obiformats/seqfile_chunck_read.go | 1 - pkg/obitools/obiconvert/sequence_reader.go | 19 ++++++++++++++----- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 65d6940..10038b5 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -153,6 +153,12 @@ func _ParseFastaFile(source string, } } + if state == 6 { + s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) + s.SetSource(source) + sequences = append(sequences, s) + } + if len(sequences) > 0 { if no_order { out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) diff --git a/pkg/obiformats/seqfile_chunck_read.go b/pkg/obiformats/seqfile_chunck_read.go index cc4c864..eb98424 100644 --- a/pkg/obiformats/seqfile_chunck_read.go +++ b/pkg/obiformats/seqfile_chunck_read.go @@ -80,7 +80,6 @@ func ReadSeqFileChunk(reader io.Reader, end = len(buff) } - pnext := end lremain := len(buff) - pnext buff = buff[:end] diff --git a/pkg/obitools/obiconvert/sequence_reader.go b/pkg/obitools/obiconvert/sequence_reader.go index 2d91cf3..22e7272 100644 --- a/pkg/obitools/obiconvert/sequence_reader.go +++ b/pkg/obitools/obiconvert/sequence_reader.go @@ -7,6 +7,7 @@ import ( "strings" log "github.com/sirupsen/logrus" + "github.com/goombaio/orderedset" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" @@ -15,7 +16,7 @@ import ( func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) { var err error - list_of_files := make([]string, 0, 100) + 
list_of_files := orderedset.NewOrderedSet() for _, fn := range filenames { err = filepath.Walk(fn, @@ -42,7 +43,9 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) { if e != nil { return e } - list_of_files = append(list_of_files, subdir...) + for _, f := range subdir { + list_of_files.Add(f) + } } else { check_ext = true } @@ -60,8 +63,8 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) { strings.HasSuffix(path, "dat.gz") || strings.HasSuffix(path, "ecopcr") || strings.HasSuffix(path, "ecopcr.gz") { - log.Printf("Appending %s file\n", path) - list_of_files = append(list_of_files, path) + log.Debugf("Appending %s file\n", path) + list_of_files.Add(path) } } return nil @@ -72,7 +75,13 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) { } } - return list_of_files, nil + res := make([]string, 0, list_of_files.Size()) + for _, v := range list_of_files.Values() { + res = append(res, v.(string)) + } + + log.Infof("Found %d files to process", len(res)) + return res, nil } func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {