2022-01-13 23:27:39 +01:00
|
|
|
package obiformats
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"compress/gzip"
|
|
|
|
"io"
|
|
|
|
"os"
|
2023-03-27 19:51:10 +07:00
|
|
|
"path"
|
2022-01-13 23:27:39 +01:00
|
|
|
"strings"
|
|
|
|
|
2022-02-24 12:14:52 +01:00
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
|
2022-02-24 07:08:40 +01:00
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
2023-03-27 19:51:10 +07:00
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
2022-01-13 23:27:39 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
// GuessSeqFileType infers a sequence file format from the beginning of its
// content.
//
// firstline is matched against a list of known leading markers, in order;
// the first marker that is a prefix of firstline decides the result. When no
// prefix matches, a string ending in " Genetic Se" is reported as "genbank".
//
// The returned value is one of "ecopcr", "fasta", "fastq", "embl", "genbank"
// or "unknown".
func GuessSeqFileType(firstline string) string {
	// Leading markers and the format each one identifies, in the order
	// they are tested.
	markers := [...]struct {
		prefix string
		format string
	}{
		{"#@ecopcr-v2", "ecopcr"},
		{"#", "ecopcr"},
		{">", "fasta"},
		{"@", "fastq"},
		{"ID ", "embl"},
		{"LOCUS ", "genbank"},
	}

	for _, m := range markers {
		if strings.HasPrefix(firstline, m.prefix) {
			return m.format
		}
	}

	// Special case for GenBank release files, which are recognised by how
	// the supplied text ends rather than how it starts.
	// Hopefully this test is stringent enough.
	if strings.HasSuffix(firstline, " Genetic Se") {
		return "genbank"
	}

	return "unknown"
}
|
|
|
|
|
2022-11-16 17:13:03 +01:00
|
|
|
func ReadSequencesFromFile(filename string,
|
2023-01-22 22:04:17 +01:00
|
|
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
2022-01-13 23:27:39 +01:00
|
|
|
var file *os.File
|
|
|
|
var reader io.Reader
|
|
|
|
var greader io.Reader
|
|
|
|
var err error
|
|
|
|
|
2023-03-27 19:51:10 +07:00
|
|
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
|
|
|
|
2022-01-13 23:27:39 +01:00
|
|
|
file, err = os.Open(filename)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
log.Fatalf("open file error: %v", err)
|
2023-01-22 22:04:17 +01:00
|
|
|
return obiiter.NilIBioSequence, err
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
reader = file
|
|
|
|
|
|
|
|
// Test if the flux is compressed by gzip
|
|
|
|
greader, err = gzip.NewReader(reader)
|
|
|
|
if err != nil {
|
|
|
|
file.Seek(0, 0)
|
|
|
|
} else {
|
2022-02-24 12:14:52 +01:00
|
|
|
log.Debugf("File %s is gz compressed ", filename)
|
2022-01-13 23:27:39 +01:00
|
|
|
reader = greader
|
|
|
|
}
|
|
|
|
|
|
|
|
breader := bufio.NewReader(reader)
|
|
|
|
|
|
|
|
tag, _ := breader.Peek(30)
|
|
|
|
|
2022-02-15 00:47:02 +01:00
|
|
|
if len(tag) < 30 {
|
2023-01-22 22:04:17 +01:00
|
|
|
newIter := obiiter.MakeIBioSequence()
|
2022-02-21 19:00:23 +01:00
|
|
|
newIter.Close()
|
2022-02-15 00:47:02 +01:00
|
|
|
return newIter, nil
|
|
|
|
}
|
|
|
|
|
2022-01-13 23:27:39 +01:00
|
|
|
filetype := GuessSeqFileType(string(tag))
|
2023-02-17 22:52:53 +01:00
|
|
|
log.Debugf("File guessed format : %s (tag: %s)",
|
2022-01-13 23:27:39 +01:00
|
|
|
filetype, (strings.Split(string(tag), "\n"))[0])
|
|
|
|
reader = breader
|
|
|
|
|
|
|
|
switch filetype {
|
|
|
|
case "fastq", "fasta":
|
|
|
|
file.Close()
|
2023-03-07 17:34:25 +07:00
|
|
|
is, err := ReadFastSeqFromFile(filename, options...)
|
|
|
|
return is, err
|
2022-01-13 23:27:39 +01:00
|
|
|
case "ecopcr":
|
2022-11-16 17:13:03 +01:00
|
|
|
return ReadEcoPCR(reader, options...), nil
|
2022-01-13 23:27:39 +01:00
|
|
|
case "embl":
|
2022-11-16 17:13:03 +01:00
|
|
|
return ReadEMBL(reader, options...), nil
|
2022-08-23 11:04:57 +02:00
|
|
|
case "genbank":
|
2022-11-16 17:13:03 +01:00
|
|
|
return ReadGenbank(reader, options...), nil
|
2022-01-13 23:27:39 +01:00
|
|
|
default:
|
|
|
|
log.Fatalf("File %s has guessed format %s which is not yet implemented",
|
|
|
|
filename, filetype)
|
|
|
|
}
|
|
|
|
|
2023-01-22 22:04:17 +01:00
|
|
|
return obiiter.NilIBioSequence, nil
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|