Files
obitools4/pkg/obiblackboard/read_sequences.go

535 lines
13 KiB
Go

package obiblackboard
import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
	"github.com/gabriel-vasile/mimetype"
	"github.com/goombaio/orderedset"
	log "github.com/sirupsen/logrus"
)
// ExpandListOfFiles expands the given file and directory names into a flat
// list of file names by delegating to _ExpandListOfFiles, and reports the
// number of files found at info level when the expansion succeeds.
//
// Parameters:
//   - check_ext: when true, files are kept only if their name ends with one of
//     the recognised sequence file extensions.
//   - filenames: the files and directories to expand; "-" designates stdin.
//
// Returns:
//   - []string: the expanded list of file names.
//   - error: any error encountered while walking the file system.
func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
	res, err := _ExpandListOfFiles(check_ext, filenames...)

	// The count is only meaningful when the expansion succeeded; on failure
	// res is nil and the caller is responsible for reporting err.
	if err == nil {
		log.Infof("Found %d files to process", len(res))
	}

	return res, err
}
// sequenceFileSuffixes lists the file name endings accepted when extension
// checking is enabled. Each ending is also accepted when followed by ".gz".
var sequenceFileSuffixes = []string{
	"csv", "fasta", "fastq", "seq", "gb", "dat", "ecopcr",
}

// hasSequenceFileSuffix reports whether name ends with one of the recognised
// sequence file suffixes, optionally followed by ".gz".
func hasSequenceFileSuffix(name string) bool {
	for _, suffix := range sequenceFileSuffixes {
		if strings.HasSuffix(name, suffix) || strings.HasSuffix(name, suffix+".gz") {
			return true
		}
	}
	return false
}

// _ExpandListOfFiles expands every entry of filenames into the files it
// designates and returns them, without duplicates, in discovery order.
//
// The entry "-" (stdin) is kept as is. A regular file is kept when check_ext is
// false or when its name carries a recognised suffix. A directory is walked
// recursively and its files are always filtered on their suffix. Symbolic links
// are resolved; a link to a directory is expanded through a recursive call
// because filepath.Walk does not follow links.
//
// NOTE(review): a symbolic link pointing to one of its own ancestors is not
// detected and would recurse without end.
//
// Parameters:
//   - check_ext: whether explicitly listed files are filtered on their suffix.
//   - filenames: the files and directories to expand.
//
// Returns:
//   - []string: the expanded list of file names.
//   - error: the first error that prevented a path from being examined.
func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
	found := orderedset.NewOrderedSet()

	for _, fn := range filenames {
		// Special case for stdin
		if fn == "-" {
			found.Add(fn)
			continue
		}

		// The filtering mode is tracked per entry so that walking a directory
		// does not switch on suffix filtering for the explicit files that
		// follow it in the list.
		checkExt := check_ext

		err := filepath.Walk(fn,
			func(name string, info os.FileInfo, walkErr error) error {
				if info == nil {
					if walkErr != nil {
						return fmt.Errorf("cannot open path %s: %w", name, walkErr)
					}
					return fmt.Errorf("cannot open path %s", name)
				}

				if walkErr != nil {
					// The directory could not be listed; Walk skips it.
					// Keep going with the other paths but leave a trace.
					log.Warnf("Cannot read %s: %v", name, walkErr)
					return nil
				}

				followedLink := false
				for info.Mode()&os.ModeSymlink == os.ModeSymlink {
					var e error
					name, e = filepath.EvalSymlinks(name)
					if e != nil {
						return e
					}
					info, e = os.Stat(name)
					if e != nil {
						return e
					}
					followedLink = true
				}

				if info.IsDir() {
					if !followedLink {
						// Walk descends into real directories by itself;
						// everything below is filtered on its suffix.
						checkExt = true
						return nil
					}

					// Walk does not follow symbolic links: expand the
					// target directory explicitly.
					linked, e := _ExpandListOfFiles(true, name)
					if e != nil {
						return e
					}
					for _, f := range linked {
						found.Add(f)
					}
					return nil
				}

				if !checkExt || hasSequenceFileSuffix(name) {
					log.Debugf("Appending %s file\n", name)
					found.Add(name)
				}

				return nil
			})

		if err != nil {
			return nil, err
		}
	}

	res := make([]string, 0, found.Size())
	for _, v := range found.Values() {
		res = append(res, v.(string))
	}

	return res, nil
}
// Patterns used by the sequence format detectors, compiled once at start-up.
var (
	obiMimeFastaPattern          = regexp.MustCompile("^>[^ ]")
	obiMimeFastqPattern          = regexp.MustCompile("^@[^ ].*\n[^ ]+\n\\+")
	obiMimeGenbankReleasePattern = regexp.MustCompile("^[^ ]* +Genetic Sequence Data Bank *\n")
)

// obiMimeRegisterOnce guards the registration of the sequence format detectors
// so that the mimetype detection tree is extended a single time.
var obiMimeRegisterOnce sync.Once

// registerOBIMimeDetectors attaches the sequence format detectors to both the
// "text/plain" and the "application/octet-stream" nodes of the mimetype tree.
func registerOBIMimeDetectors() {
	detectors := []struct {
		detect    func(raw []byte, limit uint32) bool
		mime, ext string
	}{
		{
			detect: func(raw []byte, limit uint32) bool {
				return obiMimeFastaPattern.Match(raw)
			},
			mime: "text/fasta", ext: ".fasta",
		},
		{
			detect: func(raw []byte, limit uint32) bool {
				return obiMimeFastqPattern.Match(raw)
			},
			mime: "text/fastq", ext: ".fastq",
		},
		{
			detect: func(raw []byte, limit uint32) bool {
				return bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
			},
			mime: "text/ecopcr2", ext: ".ecopcr",
		},
		{
			detect: func(raw []byte, limit uint32) bool {
				return bytes.HasPrefix(raw, []byte("LOCUS ")) ||
					obiMimeGenbankReleasePattern.Match(raw)
			},
			mime: "text/genbank", ext: ".seq",
		},
		{
			detect: func(raw []byte, limit uint32) bool {
				return bytes.HasPrefix(raw, []byte("ID "))
			},
			mime: "text/embl", ext: ".dat",
		},
	}

	for _, parent := range []string{"text/plain", "application/octet-stream"} {
		for _, d := range detectors {
			mimetype.Lookup(parent).Extend(d.detect, d.mime, d.ext)
		}
	}
}

// OBIMimeTypeGuesser reads the beginning of stream and guesses the MIME type of
// the data using the mimetype library extended with sequence format detectors.
//
// The following file types are recognized by the added detectors:
//   - "text/ecopcr2": the data starts with "#@ecopcr-v2".
//   - "text/fasta": the data starts with ">" followed by a non-space character.
//   - "text/fastq": the data starts with "@" followed by a non-space character,
//     then a line without spaces, then a line starting with "+".
//   - "text/embl": the data starts with "ID ".
//   - "text/genbank": the data starts with "LOCUS ".
//   - "text/genbank" (special case): the first line ends with
//     "Genetic Sequence Data Bank" (for genbank release files).
//
// Other types, such as "text/csv", are left to the detectors built into the
// mimetype library.
//
// Up to 128 KiB are consumed from stream for the detection. The returned reader
// replays those bytes and then continues with the rest of stream, so no data is
// lost for the caller.
//
// Parameters:
//   - stream: An io.Reader representing the input stream to read data from.
//
// Returns:
//   - *mimetype.MIME: The detected MIME type of the data.
//   - io.Reader: A reader delivering the whole content of stream from its start.
//   - error: Any error encountered while reading; an empty stream yields io.EOF.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
	obiMimeRegisterOnce.Do(registerOBIMimeDetectors)

	// Create a buffer to store the read data
	buf := make([]byte, 1024*128)
	n, err := io.ReadFull(stream, buf)

	// io.ErrUnexpectedEOF only means that the input is shorter than the buffer.
	if err != nil && err != io.ErrUnexpectedEOF {
		return nil, nil, err
	}

	// Only the bytes actually read take part in the detection: the zero
	// padding of a partially filled buffer would otherwise be inspected as if
	// it belonged to the input.
	mimeType := mimetype.Detect(buf[:n])
	if mimeType == nil {
		return nil, nil, fmt.Errorf("cannot detect the mime type of the data")
	}

	// Create a new reader based on the read data
	newReader := io.Reader(bytes.NewReader(buf[:n]))

	// A completely filled buffer means that stream may hold more data.
	if err == nil {
		newReader = io.MultiReader(newReader, stream)
	}

	return mimeType, newReader, nil
}
// TextChunkParser builds a task runner that parses a raw text chunk into a
// batch of sequences.
//
// The runner expects the task body to be an obiformats.SeqFileChunk. The chunk
// is handed to parser and the resulting sequences are wrapped in a
// BioSequenceBatch carrying the source and order of the chunk.
//
// Parameters:
//   - parser: the function turning the raw chunk into sequences.
//   - target: the name of the runner that receives the produced batch.
//
// Returns:
//   - DoTask: the runner; it returns nil, after logging the error, when the
//     chunk cannot be parsed.
func TextChunkParser(parser obiformats.SeqFileChunkParser, target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		chunk := task.Body.(obiformats.SeqFileChunk)

		sequences, err := parser(chunk.Source, chunk.Raw)
		if err != nil {
			// Report the failure instead of dropping the chunk silently,
			// as SeqAnnotParser does for annotation errors.
			log.Errorf("TextChunkParser on %s[%d]: %v", chunk.Source, chunk.Order, err)
			return nil
		}

		nt := task.GetNext(target, false, false)
		nt.Body = obiiter.MakeBioSequenceBatch(
			chunk.Source,
			chunk.Order,
			sequences)

		return nt
	}
}
// SeqAnnotParser builds a task runner that applies the annotator parser to
// every sequence of a batch.
//
// The runner expects the task body to be an obiiter.BioSequenceBatch and emits
// a new batch, with the same source and order, holding the sequences returned
// by the worker derived from parser.
//
// Parameters:
//   - parser: the annotator applied to the sequences.
//   - target: the name of the runner that receives the produced batch.
//
// Returns:
//   - DoTask: the runner; it returns nil, after logging the error, when the
//     worker fails.
func SeqAnnotParser(parser obiseq.SeqAnnotator, target string) DoTask {
	// The slice worker is built once and shared by every invocation.
	annotate := obiseq.SeqToSliceWorker(obiseq.AnnotatorToSeqWorker(parser), false)

	return func(bb *Blackboard, task *Task) *Task {
		input := task.Body.(obiiter.BioSequenceBatch)

		annotated, err := annotate(input.Slice())
		if err != nil {
			log.Errorf("SeqAnnotParser on %s[%d]: %v", input.Source(), input.Order(), err)
			return nil
		}

		next := task.GetNext(target, false, false)
		next.Body = obiiter.MakeBioSequenceBatch(input.Source(), input.Order(), annotated)

		return next
	}
}
// OpenStream opens a file specified by the given filename and returns a reader for the file,
// the detected MIME type of the file, and any error encountered during the process.
//
// The MIME type is guessed by OBIMimeTypeGuesser and logged at info level. When
// the detection fails, the stream opened for a named file is closed, if it can
// be, before the error is returned.
//
// Parameters:
//   - filename: A string representing the path to the file to be opened. If the filename is "-",
//     the function opens the standard input stream.
//
// Returns:
//   - io.Reader: A buffered reader delivering the content of the file from its start.
//   - *mimetype.MIME: The detected MIME type of the file.
//   - error: Any error encountered during the process.
func OpenStream(filename string) (io.Reader, *mimetype.MIME, error) {
	var stream io.Reader
	var err error

	if filename == "-" {
		stream, err = obiformats.Buf(os.Stdin)
	} else {
		stream, err = obiformats.Ropen(filename)
	}

	if err != nil {
		return nil, nil, err
	}

	// Detect the MIME type using the mimetype library
	mimeType, newReader, err := OBIMimeTypeGuesser(stream)
	if err != nil {
		// Nothing will read from the stream any more: release the underlying
		// file. Standard input is left open on purpose.
		if closer, ok := stream.(io.Closer); ok && filename != "-" {
			// The detection error is the one worth reporting; a failure to
			// close a stream that is being abandoned is ignored.
			_ = closer.Close()
		}
		return nil, nil, err
	}

	log.Infof("%s mime type: %s", filename, mimeType.String())

	return bufio.NewReader(newReader), mimeType, nil
}
// OpenedStreamBody is the task body produced by FilenameToStream: an opened
// input stream together with the information describing where it comes from.
type OpenedStreamBody struct {
// Stream is the reader delivering the content of the input.
Stream io.Reader
// Filename is the name the input was opened from ("-" for stdin).
Filename string
// Source is the label attached to the data read from the stream;
// FilenameToStream sets it to the base name of Filename without extensions.
Source string
// Mime is the MIME type detected for the content of the stream.
Mime *mimetype.MIME
// ToBeClosed is set by FilenameToStream to true for every input but stdin.
ToBeClosed bool
}
// FilenameToStream builds a task runner that opens the file named by the task
// body and routes the opened stream according to its detected MIME type.
//
// The runner expects the task body to be an Iteration[string] whose value is a
// file name. For a supported format the next task is addressed to the runner
// named target + ":" + MIME type and carries an OpenedStreamBody.
//
// Parameters:
//   - target: the prefix of the name of the runner receiving the stream.
//
// Returns:
//   - DoTask: the runner; it returns nil, after logging an error, when the file
//     cannot be opened or when its format is not supported.
func FilenameToStream(target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		filename := task.Body.(Iteration[string]).Value

		stream, mime, err := OpenStream(filename)
		if err != nil {
			log.Errorf("Error opening %s: %v", filename, err)
			return nil
		}

		format := mime.String()

		// Reject everything that no registered reader is able to handle.
		switch format {
		case "text/fasta", "text/fastq", "text/ecopcr2", "text/genbank", "text/embl", "text/csv":
		default:
			log.Errorf("File %s (mime type %s) is an unsupported format", filename, format)
			return nil
		}

		next := task.GetNext(target+":"+format, false, false)
		next.Body = OpenedStreamBody{
			Stream:   stream,
			Filename: filename,
			Source:   obiutils.RemoveAllExt(path.Base(filename)),
			Mime:     mime,
			// Standard input is the only stream not flagged for closing.
			ToBeClosed: filename != "-",
		}

		return next
	}
}
// TextChunkIteratorBody is the task body produced by StreamToTextChunkReader:
// a channel of raw text chunks together with the stream they are read from.
type TextChunkIteratorBody struct {
// Chunks is the channel returned by obiformats.ReadSeqFileChunk.
Chunks obiformats.ChannelSeqFileChunk
// Stream is the reader the chunks are extracted from.
Stream io.Reader
// Source is the label of the data, copied from the OpenedStreamBody.
Source string
// ToBeClosed is copied from the OpenedStreamBody.
ToBeClosed bool
}
// StreamToTextChunkReader builds a task runner that starts cutting an opened
// stream into raw text chunks.
//
// The runner expects the task body to be an OpenedStreamBody. It hands the
// stream to obiformats.ReadSeqFileChunk with a 64 MiB buffer and forwards the
// resulting chunk channel in a TextChunkIteratorBody.
//
// Parameters:
//   - lastEntry: the function passed to ReadSeqFileChunk to delimit chunks.
//   - target: the name of the runner that receives the chunk channel.
//
// Returns:
//   - DoTask: the runner.
func StreamToTextChunkReader(lastEntry obiformats.LastSeqRecord, target string) DoTask {
	// Size of the buffer given to the chunk reader for each stream.
	const chunkBufferSize = 64 * 1024 * 1024

	return func(bb *Blackboard, task *Task) *Task {
		opened := task.Body.(OpenedStreamBody)

		buffer := make([]byte, chunkBufferSize)
		chunks := obiformats.ReadSeqFileChunk(opened.Source, opened.Stream, buffer, lastEntry)

		next := task.GetNext(target, false, false)
		next.Body = TextChunkIteratorBody{
			Chunks:     chunks,
			Stream:     opened.Stream,
			Source:     opened.Source,
			ToBeClosed: opened.ToBeClosed,
		}

		return next
	}
}
// TextChuckIterator builds a task runner that delivers, one per invocation,
// the chunks of a TextChunkIteratorBody.
//
// Each invocation receives one chunk from the channel and sends it to the
// runner named target. While the blackboard holds no more than bb.TargetSize
// tasks, the iterating task is pushed back on the blackboard so that the next
// chunk gets delivered too; otherwise the next task is requested with the last
// argument of GetNext set to true and nothing is pushed back.
// NOTE(review): the exact meaning of the two boolean arguments of GetNext is
// not visible from here.
//
// Parameters:
//   - endTask: the task returned once the chunk channel is closed.
//   - target: the name of the runner that receives each chunk.
//
// Returns:
//   - DoTask: the runner.
func TextChuckIterator(endTask *Task, target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		source := task.Body.(TextChunkIteratorBody)

		chunk, more := <-source.Chunks
		if !more {
			return endTask
		}

		saturated := bb.Len() > bb.TargetSize

		next := task.GetNext(target, false, saturated)
		if !saturated {
			bb.PushTask(task)
		}
		next.Body = chunk

		return next
	}
}
// SequenceIteratorBody is the task body produced by StreamToSequenceReader:
// an iterator over sequence batches together with the stream it reads from.
type SequenceIteratorBody struct {
// Iterator is the sequence iterator returned by the format reader.
Iterator obiiter.IBioSequence
// Stream is the reader the sequences are read from.
Stream io.Reader
// Source is the label of the data, copied from the OpenedStreamBody.
Source string
// ToBeClosed is copied from the OpenedStreamBody.
ToBeClosed bool
}
// StreamToSequenceReader builds a task runner that opens a sequence iterator
// on an opened stream.
//
// The runner expects the task body to be an OpenedStreamBody. It calls reader
// on the stream with options and forwards the resulting iterator in a
// SequenceIteratorBody.
//
// Parameters:
//   - reader: the function building a sequence iterator from a stream.
//   - options: the options passed to reader.
//   - target: the name of the runner that receives the iterator.
//
// Returns:
//   - DoTask: the runner; it returns nil, after logging the error, when reader
//     fails.
func StreamToSequenceReader(
	reader obiformats.SequenceReader,
	options []obiformats.WithOption,
	target string) DoTask {

	return func(bb *Blackboard, task *Task) *Task {
		opened := task.Body.(OpenedStreamBody)

		sequences, err := reader(opened.Stream, options...)
		if err != nil {
			log.Errorf("Error opening %s: %v", opened.Filename, err)
			return nil
		}

		next := task.GetNext(target, false, false)
		next.Body = SequenceIteratorBody{
			Iterator:   sequences,
			Stream:     opened.Stream,
			Source:     opened.Source,
			ToBeClosed: opened.ToBeClosed,
		}

		return next
	}
}
// SequenceIterator builds a task runner that delivers, one per invocation, the
// batches of a SequenceIteratorBody.
//
// Each invocation advances the iterator and sends the current batch to the
// runner named target. While the blackboard holds no more than bb.TargetSize
// tasks, the iterating task is pushed back on the blackboard so that the next
// batch gets delivered too; otherwise the next task is requested with the last
// argument of GetNext set to true and nothing is pushed back.
// NOTE(review): the exact meaning of the two boolean arguments of GetNext is
// not visible from here.
//
// Parameters:
//   - endTask: the task returned once the iterator is exhausted.
//   - target: the name of the runner that receives each batch.
//
// Returns:
//   - DoTask: the runner.
func SequenceIterator(endTask *Task, target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		source := task.Body.(SequenceIteratorBody)

		if !source.Iterator.Next() {
			return endTask
		}

		batch := source.Iterator.Get()
		saturated := bb.Len() > bb.TargetSize

		next := task.GetNext(target, false, saturated)
		if !saturated {
			bb.PushTask(task)
		}
		next.Body = batch

		return next
	}
}
// ReadSequences registers on the blackboard the runners needed to read
// sequences from a list of files.
//
// The list is expanded with ExpandListOfFiles; an empty list stands for
// standard input ("-"). A failure of the expansion is reported with
// log.Fatalf. The registered runners chain as follows:
//
//   - "initial" iterates over the file names and feeds "filename".
//   - "filename" opens each file and dispatches it to "stream:<mime type>".
//   - fasta, fastq, embl and genbank streams are cut into text chunks which
//     are then parsed; fasta and fastq batches go through
//     "unannotated_sequences" before reaching "sequences", embl and genbank
//     batches are sent to "sequences" directly.
//   - csv and ecopcr2 streams are read through a sequence iterator feeding
//     "sequences".
//
// Parameters:
//   - filepath: the files and directories to read; empty means stdin.
//   - options: the reading options forwarded to the format readers.
func (bb *Blackboard) ReadSequences(filepath []string, options ...obiformats.WithOption) {
	opts := obiformats.MakeOptions(options)

	// No input given: read from standard input.
	if len(filepath) == 0 {
		filepath = []string{"-"}
	}

	files, err := ExpandListOfFiles(false, filepath...)
	if err != nil {
		log.Fatalf("Cannot expand list of files : %v", err)
	}

	// registerChunked registers the three runners shared by every format read
	// as text chunks: stream -> chunk reader -> chunk iterator -> chunk parser.
	// The parser is built lazily, right before its own registration.
	registerChunked := func(
		streamStage, readerStage, chunkStage string,
		lastEntry obiformats.LastSeqRecord,
		makeParser func() obiformats.SeqFileChunkParser,
		sink string,
	) {
		bb.RegisterRunner(streamStage, StreamToTextChunkReader(lastEntry, readerStage))
		bb.RegisterRunner(readerStage, TextChuckIterator(NewInitialTask(), chunkStage))
		bb.RegisterRunner(chunkStage, TextChunkParser(makeParser(), sink))
	}

	bb.RegisterRunner("initial", DoIterateSlice(files, "filename"))
	bb.RegisterRunner("filename", FilenameToStream("stream"))

	registerChunked(
		"stream:text/fasta", "fasta_text_reader", "fasta_text_chunk",
		obiformats.EndOfLastFastaEntry,
		func() obiformats.SeqFileChunkParser {
			return obiformats.FastaChunkParser()
		},
		"unannotated_sequences",
	)

	registerChunked(
		"stream:text/fastq", "fastq_text_reader", "fastq_text_chunk",
		obiformats.EndOfLastFastqEntry,
		func() obiformats.SeqFileChunkParser {
			return obiformats.FastqChunkParser(obioptions.InputQualityShift())
		},
		"unannotated_sequences",
	)

	registerChunked(
		"stream:text/embl", "embl_text_reader", "embl_text_chunk",
		obiformats.EndOfLastFlatFileEntry,
		func() obiformats.SeqFileChunkParser {
			return obiformats.EmblChunkParser(opts.WithFeatureTable())
		},
		"sequences",
	)

	registerChunked(
		"stream:text/genbank", "genbank_text_reader", "genbank_text_chunk",
		obiformats.EndOfLastFlatFileEntry,
		func() obiformats.SeqFileChunkParser {
			return obiformats.GenbankChunkParser(opts.WithFeatureTable())
		},
		"sequences",
	)

	// Fasta and fastq headers still have to be parsed into annotations.
	bb.RegisterRunner(
		"unannotated_sequences",
		SeqAnnotParser(opts.ParseFastSeqHeader(), "sequences"),
	)

	// Formats read directly through a sequence iterator.
	bb.RegisterRunner(
		"stream:text/csv",
		StreamToSequenceReader(obiformats.ReadCSV, options, "sequence_reader"),
	)
	bb.RegisterRunner(
		"stream:text/ecopcr2",
		StreamToSequenceReader(obiformats.ReadEcoPCR, options, "sequence_reader"),
	)
	bb.RegisterRunner(
		"sequence_reader",
		SequenceIterator(NewInitialTask(), "sequences"),
	)
}