Files
obitools4/pkg/obiformats/universal_read.go

103 lines
2.1 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obiformats
import (
"bufio"
"compress/gzip"
"io"
"os"
"strings"
2022-02-24 12:14:52 +01:00
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
2022-01-13 23:27:39 +01:00
)
// GuessSeqFileType infers a sequence file format from the first bytes of its
// content.
//
// firstline is matched against a set of known leading markers and the
// function returns one of "ecopcr", "fasta", "fastq", "embl", "genbank" or
// "unknown" when nothing matches.
func GuessSeqFileType(firstline string) string {
	switch {
	case strings.HasPrefix(firstline, "#"):
		// Covers the explicit "#@ecopcr-v2" header as well as any other
		// header starting with '#'.
		return "ecopcr"
	case strings.HasPrefix(firstline, ">"):
		return "fasta"
	case strings.HasPrefix(firstline, "@"):
		return "fastq"
	case strings.HasPrefix(firstline, "ID "):
		return "embl"
	case strings.HasPrefix(firstline, "LOCUS "):
		return "genbank"
	// Special case for GenBank release files; hopefully this is stringent
	// enough. The marker is matched at the END of firstline, so it is only
	// detected when the caller passes a prefix truncated exactly after it
	// (presumably the start of a "Genetic Sequence Data Bank" banner —
	// TODO confirm against a release file).
	case strings.HasSuffix(firstline, " Genetic Se"):
		return "genbank"
	default:
		return "unknown"
	}
}
2022-11-16 17:13:03 +01:00
func ReadSequencesFromFile(filename string,
2023-01-22 22:04:17 +01:00
options ...WithOption) (obiiter.IBioSequence, error) {
2022-01-13 23:27:39 +01:00
var file *os.File
var reader io.Reader
var greader io.Reader
var err error
file, err = os.Open(filename)
if err != nil {
log.Fatalf("open file error: %v", err)
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-01-13 23:27:39 +01:00
}
reader = file
// Test if the flux is compressed by gzip
greader, err = gzip.NewReader(reader)
if err != nil {
file.Seek(0, 0)
} else {
2022-02-24 12:14:52 +01:00
log.Debugf("File %s is gz compressed ", filename)
2022-01-13 23:27:39 +01:00
reader = greader
}
breader := bufio.NewReader(reader)
tag, _ := breader.Peek(30)
if len(tag) < 30 {
2023-01-22 22:04:17 +01:00
newIter := obiiter.MakeIBioSequence()
newIter.Close()
return newIter, nil
}
2022-01-13 23:27:39 +01:00
filetype := GuessSeqFileType(string(tag))
2022-02-24 12:14:52 +01:00
log.Debug("File guessed format : %s (tag: %s)",
2022-01-13 23:27:39 +01:00
filetype, (strings.Split(string(tag), "\n"))[0])
reader = breader
switch filetype {
case "fastq", "fasta":
file.Close()
2022-11-16 17:13:03 +01:00
is, _ := ReadFastSeqFromFile(filename, options...)
2022-01-13 23:27:39 +01:00
return is, nil
case "ecopcr":
2022-11-16 17:13:03 +01:00
return ReadEcoPCR(reader, options...), nil
2022-01-13 23:27:39 +01:00
case "embl":
2022-11-16 17:13:03 +01:00
return ReadEMBL(reader, options...), nil
2022-08-23 11:04:57 +02:00
case "genbank":
2022-11-16 17:13:03 +01:00
return ReadGenbank(reader, options...), nil
2022-01-13 23:27:39 +01:00
default:
log.Fatalf("File %s has guessed format %s which is not yet implemented",
filename, filetype)
}
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, nil
2022-01-13 23:27:39 +01:00
}