2022-01-13 23:27:39 +01:00
|
|
|
package obiformats
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
2023-08-30 19:59:46 +02:00
|
|
|
"bytes"
|
2022-01-13 23:27:39 +01:00
|
|
|
"io"
|
2023-03-27 19:51:10 +07:00
|
|
|
"path"
|
2023-08-30 19:59:46 +02:00
|
|
|
|
|
|
|
"github.com/gabriel-vasile/mimetype"
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-02-24 12:14:52 +01:00
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
|
2023-11-29 12:14:37 +01:00
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
2022-01-13 23:27:39 +01:00
|
|
|
)
|
|
|
|
|
2024-08-02 12:35:46 +02:00
|
|
|
// SequenceReader is the signature shared by functions that read sequences
// from an io.Reader. Such a function accepts optional WithOption values and
// returns an obiiter.IBioSequence iterator together with an error.
type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error)
|
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
// OBIMimeTypeGuesser is a function that takes an io.Reader as input and guesses the MIME type of the data.
|
|
|
|
// It uses several detectors to identify specific file formats, such as FASTA, FASTQ, ecoPCR2, GenBank, and EMBL.
|
|
|
|
// The function reads data from the input stream and analyzes it using the mimetype library.
|
|
|
|
// It then returns the detected MIME type, a modified reader with the read data, and any error encountered during the process.
|
|
|
|
//
|
|
|
|
// The following file types are recognized:
|
|
|
|
// - "text/ecopcr": if the first line starts with "#@ecopcr-v2".
|
|
|
|
// - "text/fasta": if the first line starts with ">".
|
|
|
|
// - "text/fastq": if the first line starts with "@".
|
|
|
|
// - "text/embl": if the first line starts with "ID ".
|
|
|
|
// - "text/genbank": if the first line starts with "LOCUS ".
|
|
|
|
// - "text/genbank" (special case): if the first line "Genetic Sequence Data Bank" (for genbank release files).
|
|
|
|
// - "text/csv"
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - stream: An io.Reader representing the input stream to read data from.
|
|
|
|
//
|
|
|
|
// Returns:
|
|
|
|
// - *mimetype.MIME: The detected MIME type of the data.
|
|
|
|
// - io.Reader: A modified reader with the read data.
|
|
|
|
// - error: Any error encountered during the process.
|
|
|
|
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
2025-03-14 14:22:22 +01:00
|
|
|
obiutils.RegisterOBIMimeType()
|
2023-10-13 21:52:57 +02:00
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
// Create a buffer to store the read data
|
2025-03-08 07:34:02 +01:00
|
|
|
mimetype.SetLimit(1024 * 1024)
|
2024-09-09 16:52:13 +02:00
|
|
|
buf := make([]byte, 1024*1024)
|
2023-10-13 21:52:57 +02:00
|
|
|
n, err := io.ReadFull(stream, buf)
|
2022-08-23 11:04:57 +02:00
|
|
|
|
2023-10-13 21:52:57 +02:00
|
|
|
if err != nil && err != io.ErrUnexpectedEOF {
|
2023-08-30 19:59:46 +02:00
|
|
|
return nil, nil, err
|
|
|
|
}
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
// Detect the MIME type using the mimetype library
|
|
|
|
mimeType := mimetype.Detect(buf)
|
2024-09-03 06:08:07 -03:00
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
if mimeType == nil {
|
|
|
|
return nil, nil, err
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
2023-08-30 19:59:46 +02:00
|
|
|
|
|
|
|
// Create a new reader based on the read data
|
2023-10-13 21:52:57 +02:00
|
|
|
newReader := io.Reader(bytes.NewReader(buf[:n]))
|
|
|
|
|
|
|
|
if err == nil {
|
|
|
|
newReader = io.MultiReader(newReader, stream)
|
|
|
|
}
|
2023-08-30 19:59:46 +02:00
|
|
|
|
|
|
|
return mimeType, newReader, nil
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
2023-09-01 09:30:12 +02:00
|
|
|
// func ReadSequences(reader io.Reader,
|
|
|
|
// options ...WithOption) (obiiter.IBioSequence, error) {
|
|
|
|
|
|
|
|
// mime, reader, err := OBIMimeTypeGuesser(reader)
|
|
|
|
|
|
|
|
// if err != nil {
|
|
|
|
// return obiiter.NilIBioSequence, err
|
|
|
|
// }
|
|
|
|
|
|
|
|
// reader = bufio.NewReader(reader)
|
|
|
|
|
|
|
|
// switch mime.String() {
|
|
|
|
// case "text/fasta", "text/fastq":
|
|
|
|
// file.Close()
|
|
|
|
// is, err := ReadFastSeqFromFile(filename, options...)
|
|
|
|
// return is, err
|
|
|
|
// case "text/ecopcr2":
|
|
|
|
// return ReadEcoPCR(reader, options...), nil
|
|
|
|
// case "text/embl":
|
|
|
|
// return ReadEMBL(reader, options...), nil
|
|
|
|
// case "text/genbank":
|
|
|
|
// return ReadGenbank(reader, options...), nil
|
|
|
|
// default:
|
|
|
|
// log.Fatalf("File %s has guessed format %s which is not yet implemented",
|
|
|
|
// filename, mime.String())
|
|
|
|
// }
|
|
|
|
|
|
|
|
// return obiiter.NilIBioSequence, nil
|
|
|
|
// }
|
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
// ReadSequencesFromFile reads sequences from a file and returns an iterator of bio sequences and an error.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - filename: The name of the file to read the sequences from.
|
|
|
|
// - options: Optional parameters to customize the reading process.
|
|
|
|
//
|
|
|
|
// Returns:
|
|
|
|
// - obiiter.IBioSequence: An iterator of bio sequences.
|
|
|
|
// - error: An error if any occurred during the reading process.
|
2022-11-16 17:13:03 +01:00
|
|
|
func ReadSequencesFromFile(filename string,
|
2023-01-22 22:04:17 +01:00
|
|
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
2025-01-24 11:47:59 +01:00
|
|
|
var file *obiutils.Reader
|
2022-01-13 23:27:39 +01:00
|
|
|
var reader io.Reader
|
|
|
|
var err error
|
|
|
|
|
2023-03-27 19:51:10 +07:00
|
|
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
2023-08-25 14:36:38 +02:00
|
|
|
|
2025-01-24 11:47:59 +01:00
|
|
|
file, err = obiutils.Ropen(filename)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2025-01-24 11:47:59 +01:00
|
|
|
if err == obiutils.ErrNoContent {
|
2023-10-16 15:34:06 +02:00
|
|
|
log.Infof("file %s is empty", filename)
|
|
|
|
return ReadEmptyFile(options...)
|
|
|
|
}
|
|
|
|
|
2022-01-13 23:27:39 +01:00
|
|
|
if err != nil {
|
|
|
|
log.Fatalf("open file error: %v", err)
|
2023-01-22 22:04:17 +01:00
|
|
|
return obiiter.NilIBioSequence, err
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
2023-09-01 09:30:12 +02:00
|
|
|
mime, reader, err := OBIMimeTypeGuesser(file)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
if err != nil {
|
|
|
|
return obiiter.NilIBioSequence, err
|
2022-02-15 00:47:02 +01:00
|
|
|
}
|
2023-09-03 19:16:37 +02:00
|
|
|
log.Infof("%s mime type: %s", filename, mime.String())
|
2023-08-30 19:59:46 +02:00
|
|
|
reader = bufio.NewReader(reader)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
switch mime.String() {
|
2023-09-01 09:30:12 +02:00
|
|
|
case "text/fastq":
|
2023-09-03 19:16:37 +02:00
|
|
|
return ReadFastq(reader, options...)
|
2023-09-01 09:30:12 +02:00
|
|
|
case "text/fasta":
|
|
|
|
return ReadFasta(reader, options...)
|
2023-08-30 19:59:46 +02:00
|
|
|
case "text/ecopcr2":
|
2024-08-02 12:35:46 +02:00
|
|
|
return ReadEcoPCR(reader, options...)
|
2023-08-30 19:59:46 +02:00
|
|
|
case "text/embl":
|
2024-08-02 12:35:46 +02:00
|
|
|
return ReadEMBL(reader, options...)
|
2023-08-30 19:59:46 +02:00
|
|
|
case "text/genbank":
|
2024-08-02 12:35:46 +02:00
|
|
|
return ReadGenbank(reader, options...)
|
2024-07-03 21:04:27 +02:00
|
|
|
case "text/csv":
|
|
|
|
return ReadCSV(reader, options...)
|
2022-01-13 23:27:39 +01:00
|
|
|
default:
|
|
|
|
log.Fatalf("File %s has guessed format %s which is not yet implemented",
|
2023-08-30 19:59:46 +02:00
|
|
|
filename, mime.String())
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
2023-01-22 22:04:17 +01:00
|
|
|
return obiiter.NilIBioSequence, nil
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
2023-09-01 09:30:12 +02:00
|
|
|
|
2025-06-04 09:48:10 +02:00
|
|
|
func ReadSequencesFromStdin(options ...WithOption) (obiiter.IBioSequence, error) {
|
|
|
|
options = append(options, OptionCloseFile())
|
|
|
|
return ReadSequencesFromFile("-", options...)
|
|
|
|
}
|