2023-09-01 09:30:12 +02:00
|
|
|
package obiformats
|
|
|
|
|
|
|
|
import (
|
2024-05-01 00:50:23 +02:00
|
|
|
"bufio"
|
2023-09-01 09:30:12 +02:00
|
|
|
"bytes"
|
|
|
|
"io"
|
|
|
|
"os"
|
|
|
|
"path"
|
2024-05-01 00:50:23 +02:00
|
|
|
"slices"
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2023-11-29 12:14:37 +01:00
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
2023-09-01 09:30:12 +02:00
|
|
|
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
)
|
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
// _EndOfLastFastaEntry scans buffer from its end towards its beginning and
// returns the index of the first '>' it meets whose preceding byte is a line
// break ('\n' or '\r').
//
// It returns -1 when:
//   - no such '>' exists (including when the only candidate is the very first
//     byte of buffer, which has no predecessor);
//   - exactly one byte precedes the line break (the scan index stops on 0).
//
// A byte inspected as the predecessor of a '>' is never itself considered as
// a candidate, so in ">>" the left-hand '>' is skipped.
func _EndOfLastFastaEntry(buffer []byte) int {
	const (
		searching = iota // looking for a '>'
		sawMarker        // the byte to the right of the cursor is a '>'
		found            // a '>' preceded by a line break has been located
	)

	phase := searching
	start := 0
	pos := len(buffer) - 1

	for pos >= 0 && phase != found {
		switch b := buffer[pos]; {
		case phase == searching && b == '>':
			phase, start = sawMarker, pos
		case phase == sawMarker && (b == '\n' || b == '\r'):
			phase = found
		default:
			phase = searching
		}
		pos--
	}

	// pos now sits one byte to the left of the last inspected byte.
	if phase != found || pos == 0 {
		return -1
	}

	return start
}
|
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
func _ParseFastaFile(source string,
|
|
|
|
input ChannelSeqFileChunk,
|
2024-05-16 15:18:30 +02:00
|
|
|
out obiiter.IBioSequence,
|
|
|
|
no_order bool,
|
|
|
|
batch_size int,
|
|
|
|
chunck_order func() int,
|
|
|
|
) {
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
var identifier string
|
|
|
|
var definition string
|
2023-10-13 21:52:57 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
state := 0
|
2023-10-13 21:52:57 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
idBytes := new(bytes.Buffer)
|
|
|
|
defBytes := new(bytes.Buffer)
|
|
|
|
seqBytes := new(bytes.Buffer)
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
for chunks := range input {
|
|
|
|
scanner := bufio.NewReader(chunks.raw)
|
2024-05-16 15:18:30 +02:00
|
|
|
sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
|
2024-05-01 00:50:23 +02:00
|
|
|
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
is_end_of_line := C == '\r' || C == '\n'
|
|
|
|
is_space := C == ' ' || C == '\t'
|
|
|
|
is_sep := is_space || is_end_of_line
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
switch state {
|
|
|
|
case 0:
|
|
|
|
if C == '>' {
|
|
|
|
// Beginning of sequence
|
|
|
|
state = 1
|
|
|
|
}
|
|
|
|
case 1:
|
|
|
|
if is_sep {
|
|
|
|
// No identifier -> ERROR
|
|
|
|
log.Errorf("%s : sequence entry does not have an identifier", source)
|
2023-10-13 21:52:57 +02:00
|
|
|
} else {
|
2024-05-01 00:50:23 +02:00
|
|
|
// Beginning of identifier
|
|
|
|
idBytes.Reset()
|
|
|
|
state = 2
|
|
|
|
idBytes.WriteByte(C)
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
case 2:
|
|
|
|
if is_sep {
|
|
|
|
// End of identifier
|
|
|
|
identifier = idBytes.String()
|
|
|
|
idBytes.Reset()
|
|
|
|
state = 3
|
|
|
|
} else {
|
|
|
|
idBytes.WriteByte(C)
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
if is_end_of_line {
|
|
|
|
// Definition empty
|
|
|
|
definition = ""
|
|
|
|
state = 5
|
2023-10-13 21:52:57 +02:00
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
case 3:
|
|
|
|
if is_end_of_line {
|
|
|
|
// Definition empty
|
|
|
|
definition = ""
|
|
|
|
state = 5
|
|
|
|
} else if !is_space {
|
|
|
|
// Beginning of definition
|
|
|
|
defBytes.Reset()
|
|
|
|
defBytes.WriteByte(C)
|
|
|
|
state = 4
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
case 4:
|
|
|
|
if is_end_of_line {
|
|
|
|
definition = defBytes.String()
|
|
|
|
state = 5
|
2024-05-07 10:54:12 +02:00
|
|
|
} else {
|
|
|
|
defBytes.WriteByte(C)
|
2023-09-01 09:40:02 +02:00
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
case 5:
|
|
|
|
if !is_end_of_line {
|
|
|
|
// Beginning of sequence
|
|
|
|
seqBytes.Reset()
|
|
|
|
if C >= 'A' && C <= 'Z' {
|
|
|
|
C = C + 'a' - 'A'
|
|
|
|
}
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
|
|
|
seqBytes.WriteByte(C)
|
|
|
|
}
|
|
|
|
state = 6
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
case 6:
|
|
|
|
if C == '>' {
|
|
|
|
// End of sequence
|
|
|
|
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
|
|
|
s.SetSource(source)
|
|
|
|
sequences = append(sequences, s)
|
2024-05-16 15:18:30 +02:00
|
|
|
if no_order {
|
|
|
|
if len(sequences) == batch_size {
|
|
|
|
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
|
|
|
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
|
|
|
|
}
|
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
state = 1
|
|
|
|
|
|
|
|
} else if !is_sep {
|
|
|
|
if C >= 'A' && C <= 'Z' {
|
|
|
|
C = C + 'a' - 'A'
|
|
|
|
}
|
|
|
|
// Removing white space from the sequence
|
|
|
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
|
|
|
seqBytes.WriteByte(C)
|
|
|
|
}
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
|
|
|
|
if len(sequences) > 0 {
|
2024-05-16 15:18:30 +02:00
|
|
|
if no_order {
|
|
|
|
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
|
|
|
} else {
|
|
|
|
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
|
|
|
}
|
2024-05-01 00:50:23 +02:00
|
|
|
}
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
out.Done()
|
|
|
|
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
|
|
|
opt := MakeOptions(options)
|
|
|
|
out := obiiter.MakeIBioSequence()
|
|
|
|
|
2024-05-16 15:18:30 +02:00
|
|
|
nworker := opt.ParallelWorkers()
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
chkchan := ReadSeqFileChunk(reader, _EndOfLastFastaEntry)
|
2024-05-16 15:18:30 +02:00
|
|
|
chunck_order := obiutils.AtomicCounter()
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2024-05-01 00:50:23 +02:00
|
|
|
for i := 0; i < nworker; i++ {
|
|
|
|
out.Add(1)
|
2024-05-16 15:18:30 +02:00
|
|
|
go _ParseFastaFile(opt.Source(),
|
|
|
|
chkchan,
|
|
|
|
out,
|
|
|
|
opt.NoOrder(),
|
|
|
|
opt.BatchSize(),
|
|
|
|
chunck_order)
|
2023-09-01 09:30:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
out.WaitAndClose()
|
|
|
|
}()
|
|
|
|
|
|
|
|
newIter := out.SortBatches().Rebatch(opt.BatchSize())
|
|
|
|
|
|
|
|
log.Debugln("Full file batch mode : ", opt.FullFileBatch())
|
2024-05-01 00:50:23 +02:00
|
|
|
|
2023-09-01 09:30:12 +02:00
|
|
|
if opt.FullFileBatch() {
|
|
|
|
newIter = newIter.CompleteFileIterator()
|
|
|
|
}
|
|
|
|
|
|
|
|
annotParser := opt.ParseFastSeqHeader()
|
|
|
|
|
|
|
|
if annotParser != nil {
|
|
|
|
return IParseFastSeqHeaderBatch(newIter, options...), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return newIter, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
|
|
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
|
|
|
|
|
|
|
file, err := Ropen(filename)
|
|
|
|
|
2023-10-05 07:21:12 +02:00
|
|
|
if err == ErrNoContent {
|
|
|
|
log.Infof("file %s is empty", filename)
|
|
|
|
return ReadEmptyFile(options...)
|
|
|
|
}
|
|
|
|
|
2023-09-01 09:30:12 +02:00
|
|
|
if err != nil {
|
|
|
|
return obiiter.NilIBioSequence, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return ReadFasta(file, options...)
|
|
|
|
}
|
|
|
|
|
|
|
|
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
|
|
|
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
|
|
|
input, err := Buf(os.Stdin)
|
|
|
|
|
2023-10-05 07:21:12 +02:00
|
|
|
if err == ErrNoContent {
|
|
|
|
log.Infof("stdin is empty")
|
|
|
|
return ReadEmptyFile(options...)
|
|
|
|
}
|
|
|
|
|
2023-09-01 09:30:12 +02:00
|
|
|
if err != nil {
|
|
|
|
log.Fatalf("open file error: %v", err)
|
|
|
|
return obiiter.NilIBioSequence, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return ReadFasta(input, options...)
|
|
|
|
}
|