mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00

modified: pkg/obiformats/ngsfilter_read.go modified: pkg/obioptions/version.go modified: pkg/obiutils/mimetypes.go
117 lines
3.3 KiB
Go
117 lines
3.3 KiB
Go
package obiutils
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/csv"
|
|
"errors"
|
|
"io"
|
|
"regexp"
|
|
|
|
"github.com/gabriel-vasile/mimetype"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
func HasBOM(data []byte) bool {
|
|
switch {
|
|
case bytes.HasPrefix(data, []byte{0xEF, 0xBB, 0xBF}):
|
|
log.Infoln("BOM detected: UTF-8 (EF BB BF)")
|
|
return true
|
|
case bytes.HasPrefix(data, []byte{0xFE, 0xFF}):
|
|
log.Infoln("BOM detected: UTF-16 Big Endian (FE FF)")
|
|
return true
|
|
case bytes.HasPrefix(data, []byte{0xFF, 0xFE}):
|
|
log.Infoln("BOM detected: UTF-16 Little Endian (FF FE)")
|
|
return true
|
|
case bytes.HasPrefix(data, []byte{0x00, 0x00, 0xFE, 0xFF}):
|
|
log.Infoln("BOM detected: UTF-32 Big Endian (00 00 FE FF)")
|
|
return true
|
|
case bytes.HasPrefix(data, []byte{0xFF, 0xFE, 0x00, 0x00}):
|
|
log.Infoln("BOM detected: UTF-32 Little Endian (FF FE 00 00)")
|
|
return true
|
|
default:
|
|
log.Infoln("No BOM detected")
|
|
return false
|
|
}
|
|
}
|
|
|
|
func DropLastLine(b []byte) []byte {
|
|
for i := len(b) - 1; i > 0; i-- {
|
|
if b[i] == '\n' {
|
|
return b[:i]
|
|
}
|
|
}
|
|
return b
|
|
}
|
|
|
|
var __obimimetype_registred__ = false
|
|
|
|
func RegisterOBIMimeType() {
|
|
if !__obimimetype_registred__ {
|
|
csv := func(in []byte, limit uint32) bool {
|
|
in = DropLastLine(in)
|
|
|
|
br := bytes.NewReader(in)
|
|
r := csv.NewReader(br)
|
|
r.Comma = ','
|
|
r.ReuseRecord = true
|
|
r.LazyQuotes = true
|
|
r.Comment = '#'
|
|
|
|
lines := 0
|
|
for {
|
|
_, err := r.Read()
|
|
if errors.Is(err, io.EOF) {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return false
|
|
}
|
|
lines++
|
|
}
|
|
|
|
return r.FieldsPerRecord > 1 && lines > 1
|
|
}
|
|
|
|
fastaDetector := func(raw []byte, limit uint32) bool {
|
|
ok, err := regexp.Match("^>[^ ]", raw)
|
|
return ok && err == nil
|
|
}
|
|
|
|
fastqDetector := func(raw []byte, limit uint32) bool {
|
|
ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw)
|
|
return ok && err == nil
|
|
}
|
|
|
|
ecoPCR2Detector := func(raw []byte, limit uint32) bool {
|
|
ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
|
|
return ok
|
|
}
|
|
|
|
genbankDetector := func(raw []byte, limit uint32) bool {
|
|
ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
|
|
ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
|
|
return ok2 || (ok1 && err == nil)
|
|
}
|
|
|
|
emblDetector := func(raw []byte, limit uint32) bool {
|
|
ok := bytes.HasPrefix(raw, []byte("ID "))
|
|
return ok
|
|
}
|
|
|
|
mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
|
|
mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
|
|
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
|
|
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
|
|
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
|
|
mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv")
|
|
|
|
mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta")
|
|
mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq")
|
|
mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
|
|
mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq")
|
|
mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat")
|
|
mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv")
|
|
}
|
|
__obimimetype_registred__ = true
|
|
}
|