// obitools4/pkg/obiblackboard/read_sequences.go

package obiblackboard

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

	"github.com/gabriel-vasile/mimetype"
	"github.com/goombaio/orderedset"
	log "github.com/sirupsen/logrus"
)

// ExpandListOfFiles expands the given file and directory names into the list
// of files to process and reports how many files were found.
//
// The expansion itself is delegated to _ExpandListOfFiles; on success the
// number of files found is logged at Info level.
//
// Parameters:
//   - check_ext: forwarded unchanged to _ExpandListOfFiles.
//   - filenames: the names to expand.
//
// Returns:
//   - []string: the expanded list of files.
//   - error: the error returned by _ExpandListOfFiles, if any.
func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
	res, err := _ExpandListOfFiles(check_ext, filenames...)

	// Report the count only when the expansion succeeded: on failure the
	// result is nil and a "Found 0 files" message would be misleading.
	if err == nil {
		log.Infof("Found %d files to process", len(res))
	}

	return res, err
}

// _ExpandListOfFiles expands a list of file and directory names into an
// ordered, duplicate-free list of file paths.
//
// Each name is handled as follows:
//   - "-" is kept verbatim.
//   - a regular file is kept if extension filtering is off, or if its path
//     ends with one of the known suffixes (csv, fasta, fastq, seq, gb, dat,
//     ecopcr), optionally followed by ".gz".
//   - a directory is walked recursively; extension filtering is always active
//     for the files found inside it.
//   - a symbolic link is resolved with filepath.EvalSymlinks before being
//     classified; a link resolving to a directory is expanded recursively
//     because filepath.Walk does not follow links.
//
// Parameters:
//   - check_ext: when true, explicitly named files are also filtered by suffix.
//   - filenames: the names to expand.
//
// Returns:
//   - []string: the paths in the order they were first encountered.
//   - error: the first error met while walking or resolving a link.
func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
	// Suffixes accepted when extension filtering is active. Matching is a
	// plain suffix test, with or without a trailing ".gz".
	suffixes := []string{"csv", "fasta", "fastq", "seq", "gb", "dat", "ecopcr"}

	hasKnownSuffix := func(name string) bool {
		for _, suffix := range suffixes {
			if strings.HasSuffix(name, suffix) || strings.HasSuffix(name, suffix+".gz") {
				return true
			}
		}
		return false
	}

	list_of_files := orderedset.NewOrderedSet()
	for _, fn := range filenames {
		// Special case for stdin
		if fn == "-" {
			list_of_files.Add(fn)
			continue
		}

		// The filtering mode is local to this name: walking a directory must
		// not switch filtering on for the names that follow it.
		filterByExt := check_ext

		err := filepath.Walk(fn,
			func(path string, info os.FileInfo, walkErr error) error {
				if info == nil {
					if walkErr != nil {
						return fmt.Errorf("cannot open path %s: %w", path, walkErr)
					}
					return fmt.Errorf("cannot open path %s", path)
				}

				// Walk reports an unreadable directory with both its info and
				// an error; its content is skipped, the walk goes on.
				if walkErr != nil {
					log.Warnf("Cannot read directory %s: %v", path, walkErr)
					return nil
				}

				viaSymlink := info.Mode()&os.ModeSymlink == os.ModeSymlink
				if viaSymlink {
					resolved, e := filepath.EvalSymlinks(path)
					if e != nil {
						return e
					}
					path = resolved

					// os.Stat follows links, so info now describes the target.
					info, e = os.Stat(path)
					if e != nil {
						return e
					}
				}

				if info.IsDir() {
					switch {
					case path == fn:
						// Files found below a directory are always filtered.
						filterByExt = true
					case viaSymlink:
						// Walk does not descend into linked directories.
						subdir, e := _ExpandListOfFiles(true, path)
						if e != nil {
							return e
						}
						for _, f := range subdir {
							list_of_files.Add(f)
						}
					}
					// A real sub-directory needs nothing here: Walk descends
					// into it on its own.
					return nil
				}

				if !filterByExt || hasKnownSuffix(path) {
					log.Debugf("Appending %s file\n", path)
					list_of_files.Add(path)
				}
				return nil
			})

		if err != nil {
			return nil, err
		}
	}

	res := make([]string, 0, list_of_files.Size())
	for _, v := range list_of_files.Values() {
		res = append(res, v.(string))
	}

	return res, nil
}

// Patterns used by the OBITools-specific MIME detectors. They are compiled
// once instead of on every detector invocation.
var (
	obiFastaStartRegexp    = regexp.MustCompile("^>[^ ]")
	obiFastqStartRegexp    = regexp.MustCompile("^@[^ ].*\n[^ ]+\n\\+")
	obiGenbankBannerRegexp = regexp.MustCompile("^[^ ]* +Genetic Sequence Data Bank *\n")
)

// obiMimeTypesOnce guards the registration of the OBITools-specific
// detectors so that it happens a single time per process.
var obiMimeTypesOnce sync.Once

// registerOBIMimeTypes extends the "text/plain" and "application/octet-stream"
// nodes of the mimetype library with the sequence file detectors.
func registerOBIMimeTypes() {
	fastaDetector := func(raw []byte, limit uint32) bool {
		return obiFastaStartRegexp.Match(raw)
	}

	fastqDetector := func(raw []byte, limit uint32) bool {
		return obiFastqStartRegexp.Match(raw)
	}

	ecoPCR2Detector := func(raw []byte, limit uint32) bool {
		return bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
	}

	genbankDetector := func(raw []byte, limit uint32) bool {
		return bytes.HasPrefix(raw, []byte("LOCUS       ")) ||
			obiGenbankBannerRegexp.Match(raw)
	}

	emblDetector := func(raw []byte, limit uint32) bool {
		return bytes.HasPrefix(raw, []byte("ID   "))
	}

	for _, parent := range []string{"text/plain", "application/octet-stream"} {
		node := mimetype.Lookup(parent)
		node.Extend(fastaDetector, "text/fasta", ".fasta")
		node.Extend(fastqDetector, "text/fastq", ".fastq")
		node.Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
		node.Extend(genbankDetector, "text/genbank", ".seq")
		node.Extend(emblDetector, "text/embl", ".dat")
	}
}

// OBIMimeTypeGuesser reads the beginning of a stream and guesses the MIME type
// of its content.
//
// Up to 128 KiB are read from the stream and submitted to the mimetype
// library, extended with detectors for sequence file formats. The bytes
// consumed are not lost: the returned reader yields them again, followed by
// the remainder of the stream.
//
// The following additional types are recognized:
// - "text/ecopcr2": if the data starts with "#@ecopcr-v2".
// - "text/fasta": if the data starts with ">" followed by a non-space character.
// - "text/fastq": if the data starts with "@", and the third line starts with "+".
// - "text/embl": if the data starts with "ID   ".
// - "text/genbank": if the data starts with "LOCUS       ".
// - "text/genbank" (special case): if the first line ends with
// "Genetic Sequence Data Bank" (for genbank release files).
//
// Parameters:
// - stream: An io.Reader representing the input stream to read data from.
//
// Returns:
// - *mimetype.MIME: The detected MIME type of the data.
// - io.Reader: A reader yielding the whole content of the original stream.
// - error: Any error encountered while reading. An empty stream yields io.EOF.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
	// Registering on every call would add the same detectors to the global
	// mimetype tree again and again.
	obiMimeTypesOnce.Do(registerOBIMimeTypes)

	// Create a buffer to store the read data
	buf := make([]byte, 1024*128)
	n, err := io.ReadFull(stream, buf)

	// io.ErrUnexpectedEOF only means that the stream is shorter than the
	// buffer, which is fine here.
	if err != nil && err != io.ErrUnexpectedEOF {
		return nil, nil, err
	}

	// Only the bytes actually read are meaningful: the rest of the buffer is
	// zero padding that must not take part in the detection.
	head := buf[:n]

	// Detect the MIME type using the mimetype library
	mimeType := mimetype.Detect(head)
	if mimeType == nil {
		return nil, nil, fmt.Errorf("cannot detect mime type")
	}

	// Create a new reader based on the read data
	newReader := io.Reader(bytes.NewReader(head))

	// A completely filled buffer means that the stream may hold more data.
	if err == nil {
		newReader = io.MultiReader(newReader, stream)
	}

	return mimeType, newReader, nil
}

// TextChunkParser builds the task that converts a text chunk into a batch of
// sequences.
//
// The task body must be an obiformats.SeqFileChunk. Its raw content is handed
// to parser together with the chunk source; the resulting sequences are
// wrapped in a batch carrying the source and order of the chunk.
//
// Parameters:
//   - parser: the function parsing the raw chunk.
//   - target: the role given to the task carrying the batch.
//
// Returns a DoTask producing the next task, or nil when parsing fails.
func TextChunkParser(parser obiformats.SeqFileChunkParser, target string) DoTask {

	return func(bb *Blackboard, task *Task) *Task {
		chunk := task.Body.(obiformats.SeqFileChunk)
		sequences, err := parser(chunk.Source, chunk.Raw)

		if err != nil {
			// Report the failure, as SeqAnnotParser does, instead of
			// dropping the chunk silently.
			log.Errorf("TextChunkParser on %s[%v]: %v", chunk.Source, chunk.Order, err)
			return nil
		}

		nt := task.GetNext(target, false, false)
		nt.Body = obiiter.MakeBioSequenceBatch(
			chunk.Source,
			chunk.Order,
			sequences)

		return nt
	}
}

// SeqAnnotParser builds the task that applies an annotator to every sequence
// of a batch.
//
// The task body must be an obiiter.BioSequenceBatch. The annotator is turned
// once into a worker operating on slices; each task then runs that worker on
// the slice of its batch and forwards the result in a new batch having the
// same source and order.
//
// Parameters:
//   - parser: the annotator applied to the sequences.
//   - target: the role given to the task carrying the resulting batch.
//
// Returns a DoTask producing the next task, or nil (after logging the error)
// when the worker fails.
func SeqAnnotParser(parser obiseq.SeqAnnotator, target string) DoTask {
	seqWorker := obiseq.AnnotatorToSeqWorker(parser)
	annotate := obiseq.SeqToSliceWorker(seqWorker, false)

	return func(bb *Blackboard, task *Task) *Task {
		input := task.Body.(obiiter.BioSequenceBatch)

		annotated, err := annotate(input.Slice())
		if err != nil {
			log.Errorf("SeqAnnotParser on %s[%d]: %v", input.Source(), input.Order(), err)
			return nil
		}

		next := task.GetNext(target, false, false)
		next.Body = obiiter.MakeBioSequenceBatch(input.Source(), input.Order(), annotated)

		return next
	}
}

// OpenStream opens a file specified by the given filename and returns a reader for the file,
// the detected MIME type of the file, and any error encountered during the process.
//
// The MIME type is guessed by OBIMimeTypeGuesser from the beginning of the
// data; the returned reader is buffered and still delivers the data from its
// first byte. The detected type is logged at Info level.
//
// Parameters:
//   - filename: A string representing the path to the file to be opened. If the filename is "-",
//     the function opens the standard input stream.
//
// Returns:
// - io.Reader: A reader for the file.
// - *mimetype.MIME: The detected MIME type of the file.
// - error: Any error encountered during the process.
func OpenStream(filename string) (io.Reader, *mimetype.MIME, error) {
	var stream io.Reader
	var err error
	if filename == "-" {
		stream, err = obiformats.Buf(os.Stdin)
	} else {
		stream, err = obiformats.Ropen(filename)
	}

	if err != nil {
		return nil, nil, err
	}

	// Detect the MIME type using the mimetype library
	mimeType, newReader, err := OBIMimeTypeGuesser(stream)
	if err != nil {
		// The stream is dropped on this path: release the file if the reader
		// can be closed. Standard input is left untouched.
		if closer, ok := stream.(io.Closer); ok && filename != "-" {
			// The detection error is the one worth reporting.
			_ = closer.Close()
		}
		return nil, nil, err
	}

	log.Infof("%s mime type: %s", filename, mimeType.String())

	return bufio.NewReader(newReader), mimeType, nil
}

// OpenedStreamBody is the task body built by FilenameToStream once a file has
// been opened and its MIME type recognized as a supported format.
type OpenedStreamBody struct {
	// Stream is the reader returned by OpenStream.
	Stream     io.Reader
	// Filename is the name the stream was opened from ("-" for standard input).
	Filename   string
	// Source is obiutils.RemoveAllExt applied to the base name of Filename.
	Source     string
	// Mime is the MIME type detected by OpenStream.
	Mime       *mimetype.MIME
	// ToBeClosed is true for every filename except "-".
	ToBeClosed bool
}

// FilenameToStream builds the task that opens a file and dispatches it
// according to its MIME type.
//
// The task body must be an Iteration[string] whose value is the filename.
// When the detected type is one of the supported formats, the next task gets
// the role target + ":" + MIME type and an OpenedStreamBody as body.
//
// Parameters:
//   - target: the prefix of the role given to the next task.
//
// Returns a DoTask producing the next task, or nil (after logging the error)
// when the file cannot be opened or its format is not supported.
func FilenameToStream(target string) DoTask {

	return func(bb *Blackboard, task *Task) *Task {
		name := task.Body.(Iteration[string]).Value

		reader, mime, err := OpenStream(name)
		if err != nil {
			log.Errorf("Error opening %s: %v", name, err)
			return nil
		}

		kind := mime.String()

		switch kind {
		case "text/fasta", "text/fastq", "text/ecopcr2", "text/genbank", "text/embl", "text/csv":
			// Supported format: handled below.
		default:
			log.Errorf("File %s (mime type %s) is an unsupported format", name, kind)
			return nil
		}

		next := task.GetNext(target+":"+kind, false, false)
		next.Body = OpenedStreamBody{
			Stream:   reader,
			Mime:     mime,
			Filename: name,
			Source:   obiutils.RemoveAllExt(path.Base(name)),
			// Everything but standard input is flagged as to be closed.
			ToBeClosed: name != "-",
		}

		return next
	}
}

// TextChunkIteratorBody is the task body built by StreamToTextChunkReader and
// consumed by TextChuckIterator.
type TextChunkIteratorBody struct {
	// Chunks is the channel returned by obiformats.ReadSeqFileChunk.
	Chunks     obiformats.ChannelSeqFileChunk
	// Stream, Source and ToBeClosed are copied from the OpenedStreamBody the
	// chunk reader was created from.
	Stream     io.Reader
	Source     string
	ToBeClosed bool
}

// StreamToTextChunkReader builds the task that starts the chunked reading of
// an opened text stream.
//
// The task body must be an OpenedStreamBody. Its stream is handed to
// obiformats.ReadSeqFileChunk with a 64 MiB buffer and the lastEntry function;
// the resulting chunk channel is stored, together with the stream description,
// in a TextChunkIteratorBody.
//
// Parameters:
//   - lastEntry: passed to obiformats.ReadSeqFileChunk.
//   - target: the role given to the task carrying the chunk channel.
func StreamToTextChunkReader(lastEntry obiformats.LastSeqRecord, target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		opened := task.Body.(OpenedStreamBody)

		buffer := make([]byte, 64*1024*1024)
		chunks := obiformats.ReadSeqFileChunk(opened.Source, opened.Stream, buffer, lastEntry)

		next := task.GetNext(target, false, false)
		next.Body = TextChunkIteratorBody{
			Chunks:     chunks,
			Stream:     opened.Stream,
			Source:     opened.Source,
			ToBeClosed: opened.ToBeClosed,
		}

		return next
	}
}

// TextChuckIterator builds the task that delivers the chunks of a text file
// one at a time.
//
// The task body must be a TextChunkIteratorBody. Each run receives one chunk
// from its channel and returns a task of role target carrying that chunk.
// Once the channel is closed, endTask is returned instead.
//
// While the blackboard length does not exceed bb.TargetSize, the iterator
// task itself is pushed back onto the blackboard; otherwise it is not, and
// the next task is requested with the last GetNext flag set to true.
//
// Parameters:
//   - endTask: the task returned when no chunk is left.
//   - target: the role given to the tasks carrying the chunks.
func TextChuckIterator(endTask *Task, target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		source := task.Body.(TextChunkIteratorBody)

		piece, open := <-source.Chunks
		if !open {
			return endTask
		}

		overloaded := bb.Len() > bb.TargetSize

		next := task.GetNext(target, false, overloaded)
		if !overloaded {
			bb.PushTask(task)
		}

		next.Body = piece
		return next
	}
}

// SequenceIteratorBody is the task body built by StreamToSequenceReader and
// consumed by SequenceIterator.
type SequenceIteratorBody struct {
	// Iterator is the sequence iterator returned by the format reader.
	Iterator   obiiter.IBioSequence
	// Stream, Source and ToBeClosed are copied from the OpenedStreamBody the
	// iterator was created from.
	Stream     io.Reader
	Source     string
	ToBeClosed bool
}

// StreamToSequenceReader builds the task that starts reading an opened stream
// with a sequence reader.
//
// The task body must be an OpenedStreamBody. Its stream is handed to reader
// along with options; the resulting iterator is stored, together with the
// stream description, in a SequenceIteratorBody.
//
// Parameters:
//   - reader: the function creating the sequence iterator from the stream.
//   - options: the options forwarded to reader.
//   - target: the role given to the task carrying the iterator.
//
// Returns a DoTask producing the next task, or nil (after logging the error)
// when reader fails.
func StreamToSequenceReader(
	reader obiformats.SequenceReader,
	options []obiformats.WithOption,
	target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		opened := task.Body.(OpenedStreamBody)

		sequences, err := reader(opened.Stream, options...)
		if err != nil {
			log.Errorf("Error opening %s: %v", opened.Filename, err)
			return nil
		}

		next := task.GetNext(target, false, false)
		next.Body = SequenceIteratorBody{
			Iterator:   sequences,
			Stream:     opened.Stream,
			Source:     opened.Source,
			ToBeClosed: opened.ToBeClosed,
		}

		return next
	}
}

// SequenceIterator builds the task that delivers the batches of a sequence
// iterator one at a time.
//
// The task body must be a SequenceIteratorBody. Each run advances the
// iterator and returns a task of role target carrying the current batch.
// Once the iterator is exhausted, endTask is returned instead.
//
// While the blackboard length does not exceed bb.TargetSize, the iterator
// task itself is pushed back onto the blackboard; otherwise it is not, and
// the next task is requested with the last GetNext flag set to true.
//
// Parameters:
//   - endTask: the task returned when no batch is left.
//   - target: the role given to the tasks carrying the batches.
func SequenceIterator(endTask *Task, target string) DoTask {
	return func(bb *Blackboard, task *Task) *Task {
		source := task.Body.(SequenceIteratorBody)

		if !source.Iterator.Next() {
			return endTask
		}

		current := source.Iterator.Get()

		overloaded := bb.Len() > bb.TargetSize

		next := task.GetNext(target, false, overloaded)
		if !overloaded {
			bb.PushTask(task)
		}

		next.Body = current
		return next
	}
}

// ReadSequences registers on the blackboard the runners needed to read
// sequence files.
//
// The names in filepath are first expanded with ExpandListOfFiles; an empty
// list stands for "-" (standard input). A failure of the expansion is fatal.
//
// The registered runners form the following chains, keyed by task role:
//   - "initial" -> "filename" -> "stream:<mime type>"
//   - fasta and fastq: stream -> text reader -> text chunk ->
//     "unannotated_sequences" -> "sequences"
//   - embl and genbank: stream -> text reader -> text chunk -> "sequences"
//   - csv and ecopcr2: stream -> "sequence_reader" -> "sequences"
//
// Parameters:
//   - filepath: the files and directories to read.
//   - options: the reading options, used to build the parsers and forwarded
//     to the CSV and ecoPCR readers.
func (bb *Blackboard) ReadSequences(filepath []string, options ...obiformats.WithOption) {
	opts := obiformats.MakeOptions(options)

	// Without any input name, read from standard input.
	if len(filepath) == 0 {
		filepath = []string{"-"}
	}

	files, err := ExpandListOfFiles(false, filepath...)
	if err != nil {
		log.Fatalf("Cannot expand list of files : %v", err)
	}

	// Entry of the pipeline: one task per file, then one stream per file.
	bb.RegisterRunner("initial", DoIterateSlice(files, "filename"))
	bb.RegisterRunner("filename", FilenameToStream("stream"))

	// FASTA files.
	fastaReader := StreamToTextChunkReader(obiformats.EndOfLastFastaEntry, "fasta_text_reader")
	bb.RegisterRunner("stream:text/fasta", fastaReader)

	fastaChunks := TextChuckIterator(NewInitialTask(), "fasta_text_chunk")
	bb.RegisterRunner("fasta_text_reader", fastaChunks)

	fastaParser := TextChunkParser(obiformats.FastaChunkParser(), "unannotated_sequences")
	bb.RegisterRunner("fasta_text_chunk", fastaParser)

	// FASTQ files.
	fastqReader := StreamToTextChunkReader(obiformats.EndOfLastFastqEntry, "fastq_text_reader")
	bb.RegisterRunner("stream:text/fastq", fastqReader)

	fastqChunks := TextChuckIterator(NewInitialTask(), "fastq_text_chunk")
	bb.RegisterRunner("fastq_text_reader", fastqChunks)

	fastqParser := TextChunkParser(
		obiformats.FastqChunkParser(obioptions.InputQualityShift()),
		"unannotated_sequences",
	)
	bb.RegisterRunner("fastq_text_chunk", fastqParser)

	// EMBL flat files.
	emblReader := StreamToTextChunkReader(obiformats.EndOfLastFlatFileEntry, "embl_text_reader")
	bb.RegisterRunner("stream:text/embl", emblReader)

	emblChunks := TextChuckIterator(NewInitialTask(), "embl_text_chunk")
	bb.RegisterRunner("embl_text_reader", emblChunks)

	emblParser := TextChunkParser(obiformats.EmblChunkParser(opts.WithFeatureTable()), "sequences")
	bb.RegisterRunner("embl_text_chunk", emblParser)

	// GenBank flat files.
	genbankReader := StreamToTextChunkReader(obiformats.EndOfLastFlatFileEntry, "genbank_text_reader")
	bb.RegisterRunner("stream:text/genbank", genbankReader)

	genbankChunks := TextChuckIterator(NewInitialTask(), "genbank_text_chunk")
	bb.RegisterRunner("genbank_text_reader", genbankChunks)

	genbankParser := TextChunkParser(obiformats.GenbankChunkParser(opts.WithFeatureTable()), "sequences")
	bb.RegisterRunner("genbank_text_chunk", genbankParser)

	// Header annotation of the sequences read from fasta and fastq files.
	headerParser := SeqAnnotParser(opts.ParseFastSeqHeader(), "sequences")
	bb.RegisterRunner("unannotated_sequences", headerParser)

	// Formats read through a sequence iterator rather than by text chunks.
	csvReader := StreamToSequenceReader(obiformats.ReadCSV, options, "sequence_reader")
	bb.RegisterRunner("stream:text/csv", csvReader)

	ecopcrReader := StreamToSequenceReader(obiformats.ReadEcoPCR, options, "sequence_reader")
	bb.RegisterRunner("stream:text/ecopcr2", ecopcrReader)

	batches := SequenceIterator(NewInitialTask(), "sequences")
	bb.RegisterRunner("sequence_reader", batches)
}