Files
obitools4/pkg/obiformats/universal_read.go

156 lines
4.7 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obiformats
import (
	"bufio"
	"bytes"
	"io"
	"os"
	"path"
	"regexp"
	"sync"

	"github.com/gabriel-vasile/mimetype"
	gzip "github.com/klauspost/pgzip"
	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
// OBIMimeTypeGuesser is a function that takes an io.Reader as input and guesses the MIME type of the data.
// It uses several detectors to identify specific file formats, such as FASTA, FASTQ, ecoPCR2, GenBank, and EMBL.
// The function reads data from the input stream and analyzes it using the mimetype library.
// It then returns the detected MIME type, a modified reader with the read data, and any error encountered during the process.
//
// The following file types are recognized:
// - "text/ecopcr": if the first line starts with "#@ecopcr-v2".
// - "text/fasta": if the first line starts with ">".
// - "text/fastq": if the first line starts with "@".
// - "text/embl": if the first line starts with "ID ".
// - "text/genbank": if the first line starts with "LOCUS ".
// - "text/genbank" (special case): if the first line "Genetic Sequence Data Bank" (for genbank release files).
// - "text/csv"
//
// Parameters:
// - stream: An io.Reader representing the input stream to read data from.
//
// Returns:
// - *mimetype.MIME: The detected MIME type of the data.
// - io.Reader: A modified reader with the read data.
// - error: Any error encountered during the process.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
fastaDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^>[^ ]", raw)
return ok && err == nil
}
fastqDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^@[^ ]", raw)
return ok && err == nil
}
2022-01-13 23:27:39 +01:00
ecoPCR2Detector := func(raw []byte, limit uint32) bool {
ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
return ok
}
2022-01-13 23:27:39 +01:00
genbankDetector := func(raw []byte, limit uint32) bool {
ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
return ok2 || (ok1 && err == nil)
}
2022-01-13 23:27:39 +01:00
emblDetector := func(raw []byte, limit uint32) bool {
ok := bytes.HasPrefix(raw, []byte("ID "))
return ok
}
2022-01-13 23:27:39 +01:00
mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
2022-01-13 23:27:39 +01:00
// Create a buffer to store the read data
buf := make([]byte, 1024*128)
n, err := stream.Read(buf)
2022-08-23 11:04:57 +02:00
if err != nil && err != io.EOF {
return nil, nil, err
}
2022-01-13 23:27:39 +01:00
// Detect the MIME type using the mimetype library
mimeType := mimetype.Detect(buf)
if mimeType == nil {
return nil, nil, err
2022-01-13 23:27:39 +01:00
}
// Create a new reader based on the read data
newReader := io.MultiReader(bytes.NewReader(buf[:n]), stream)
return mimeType, newReader, nil
2022-01-13 23:27:39 +01:00
}
// ReadSequencesFromFile reads sequences from a file and returns an iterator of bio sequences and an error.
//
// Parameters:
// - filename: The name of the file to read the sequences from.
// - options: Optional parameters to customize the reading process.
//
// Returns:
// - obiiter.IBioSequence: An iterator of bio sequences.
// - error: An error if any occurred during the reading process.
2022-11-16 17:13:03 +01:00
func ReadSequencesFromFile(filename string,
2023-01-22 22:04:17 +01:00
options ...WithOption) (obiiter.IBioSequence, error) {
2022-01-13 23:27:39 +01:00
var file *os.File
var reader io.Reader
var greader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
2022-01-13 23:27:39 +01:00
file, err = os.Open(filename)
if err != nil {
log.Fatalf("open file error: %v", err)
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-01-13 23:27:39 +01:00
}
reader = file
// Test if the flux is compressed by gzip
greader, err = gzip.NewReader(reader)
if err != nil {
file.Seek(0, 0)
} else {
2022-02-24 12:14:52 +01:00
log.Debugf("File %s is gz compressed ", filename)
2022-01-13 23:27:39 +01:00
reader = greader
}
mime, reader, err := OBIMimeTypeGuesser(reader)
2022-01-13 23:27:39 +01:00
if err != nil {
return obiiter.NilIBioSequence, err
}
reader = bufio.NewReader(reader)
2022-01-13 23:27:39 +01:00
switch mime.String() {
case "text/fasta", "text/fastq":
2022-01-13 23:27:39 +01:00
file.Close()
is, err := ReadFastSeqFromFile(filename, options...)
return is, err
case "text/ecopcr2":
2022-11-16 17:13:03 +01:00
return ReadEcoPCR(reader, options...), nil
case "text/embl":
2022-11-16 17:13:03 +01:00
return ReadEMBL(reader, options...), nil
case "text/genbank":
2022-11-16 17:13:03 +01:00
return ReadGenbank(reader, options...), nil
2022-01-13 23:27:39 +01:00
default:
log.Fatalf("File %s has guessed format %s which is not yet implemented",
filename, mime.String())
2022-01-13 23:27:39 +01:00
}
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, nil
2022-01-13 23:27:39 +01:00
}