mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 08:40:26 +00:00
Adds the ability to read a gzipped tar file for the taxonomy dump
This commit is contained in:
@@ -157,9 +157,9 @@ func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, err
|
||||
func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
file, err := Ropen(filename)
|
||||
file, err := obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
@@ -173,9 +173,9 @@ func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequen
|
||||
|
||||
func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
||||
input, err := Buf(os.Stdin)
|
||||
input, err := obiutils.Buf(os.Stdin)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("stdin is empty")
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
||||
@@ -227,9 +227,9 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
reader, err = Ropen(filename)
|
||||
reader, err = obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
||||
@@ -271,9 +271,9 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
file, err := Ropen(filename)
|
||||
file, err := obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
@@ -287,9 +287,9 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
|
||||
|
||||
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource("stdin"))
|
||||
input, err := Buf(os.Stdin)
|
||||
input, err := obiutils.Buf(os.Stdin)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("stdin is empty")
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
||||
@@ -370,9 +370,9 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
file, err := Ropen(filename)
|
||||
file, err := obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
@@ -386,9 +386,9 @@ func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
|
||||
|
||||
func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
||||
input, err := Buf(os.Stdin)
|
||||
input, err := obiutils.Buf(os.Stdin)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("stdin is empty")
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
||||
@@ -266,9 +266,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
reader, err = Ropen(filename)
|
||||
reader, err = obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
||||
@@ -1,204 +0,0 @@
|
||||
package ncbitaxdump
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
// loadNodeTable reads a node table from the provided reader and populates the given taxonomy.
|
||||
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
|
||||
// The node table is expected to be in CSV format with a custom delimiter ('|') and comments
|
||||
// starting with '#'. Each record in the table represents a taxon with its taxid, parent taxid,
|
||||
// and rank.
|
||||
//
|
||||
// Parameters:
|
||||
// - reader: An io.Reader from which the node table is read.
|
||||
// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon data will be added.
|
||||
//
|
||||
// The function reads each record from the input, trims whitespace from the taxid, parent, and rank,
|
||||
// and adds the taxon to the taxonomy. If an error occurs while adding a taxon, the function logs
|
||||
// a fatal error and terminates the program.
|
||||
func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) {
|
||||
file := csv.NewReader(reader)
|
||||
file.Comma = '|'
|
||||
file.Comment = '#'
|
||||
file.TrimLeadingSpace = true
|
||||
file.ReuseRecord = true
|
||||
|
||||
n := 0
|
||||
|
||||
for record, err := file.Read(); err == nil; record, err = file.Read() {
|
||||
n++
|
||||
taxid := strings.TrimSpace(record[0])
|
||||
parent := strings.TrimSpace(record[1])
|
||||
rank := strings.TrimSpace(record[2])
|
||||
|
||||
_, err := taxonomy.AddTaxon(taxid, parent, rank, taxid == "1", false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error adding taxon %s: %v\n", taxid, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// loadNameTable reads a name table from the provided reader and populates the given taxonomy.
|
||||
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
|
||||
// The name table is expected to be in a custom format with fields separated by the '|' character.
|
||||
// Each record in the table represents a taxon with its taxid, name, and class name.
|
||||
//
|
||||
// Parameters:
|
||||
// - reader: An io.Reader from which the name table is read.
|
||||
// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon names will be set.
|
||||
// - onlysn: A boolean flag indicating whether to only process records with the class name "scientific name".
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// The number of taxon names successfully loaded into the taxonomy. If a line is too long, -1 is returned.
|
||||
// The function processes each line, trims whitespace from the taxid, name, and class name, and sets
|
||||
// the name in the taxonomy if the conditions are met.
|
||||
func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int {
|
||||
// file := csv.NewReader(reader)
|
||||
// file.Comma = '|'
|
||||
// file.Comment = '#'
|
||||
// file.TrimLeadingSpace = true
|
||||
// file.ReuseRecord = true
|
||||
// file.LazyQuotes = true
|
||||
file := bufio.NewReader(reader)
|
||||
|
||||
n := 0
|
||||
l := 0
|
||||
|
||||
for line, prefix, err := file.ReadLine(); err == nil; line, prefix, err = file.ReadLine() {
|
||||
l++
|
||||
if prefix {
|
||||
return -1
|
||||
}
|
||||
|
||||
record := strings.Split(string(line), "|")
|
||||
taxid := strings.TrimSpace(record[0])
|
||||
|
||||
name := strings.TrimSpace(record[1])
|
||||
classname := strings.TrimSpace(record[3])
|
||||
|
||||
if !onlysn || classname == "scientific name" {
|
||||
n++
|
||||
taxonomy.Taxon(taxid).SetName(name, classname)
|
||||
}
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// loadMergedTable reads a merged table from the provided reader and populates the given taxonomy.
|
||||
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
|
||||
// The merged table is expected to be in CSV format with a custom delimiter ('|') and comments
|
||||
// starting with '#'. Each record in the table represents a mapping between an old taxid and a new taxid.
|
||||
//
|
||||
// Parameters:
|
||||
// - reader: An io.Reader from which the merged table is read.
|
||||
// - taxonomy: A pointer to an obitax.Taxonomy instance where the alias mappings will be added.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// The number of alias mappings successfully loaded into the taxonomy. The function processes
|
||||
// each record, trims whitespace from the old and new taxid, and adds the alias to the taxonomy.
|
||||
func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int {
|
||||
file := csv.NewReader(reader)
|
||||
file.Comma = '|'
|
||||
file.Comment = '#'
|
||||
file.TrimLeadingSpace = true
|
||||
file.ReuseRecord = true
|
||||
|
||||
n := 0
|
||||
|
||||
for record, err := file.Read(); err == nil; record, err = file.Read() {
|
||||
n++
|
||||
oldtaxid := strings.TrimSpace(record[0])
|
||||
newtaxid := strings.TrimSpace(record[1])
|
||||
|
||||
taxonomy.AddAlias(newtaxid, oldtaxid, false)
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// LoadNCBITaxDump loads the NCBI taxonomy data from the specified directory.
|
||||
// It reads the taxonomy nodes, taxon names, and merged taxa from the corresponding files
|
||||
// and constructs a Taxonomy object.
|
||||
//
|
||||
// Parameters:
|
||||
// - directory: A string representing the path to the directory containing the NCBI taxonomy dump files.
|
||||
// - onlysn: A boolean indicating whether to load only scientific names (true) or all names (false).
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to the obitax.Taxonomy object containing the loaded taxonomy data, or an error
|
||||
// if any of the files cannot be opened or read.
|
||||
func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
|
||||
taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxonomy nodes\n")
|
||||
|
||||
nodefile, err := os.Open(path.Join(directory, "nodes.dmp"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open nodes file from '%s'",
|
||||
directory)
|
||||
}
|
||||
defer nodefile.Close()
|
||||
|
||||
buffered := bufio.NewReader(nodefile)
|
||||
loadNodeTable(buffered, taxonomy)
|
||||
log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxon names\n")
|
||||
|
||||
namefile, nerr := os.Open(path.Join(directory, "names.dmp"))
|
||||
if nerr != nil {
|
||||
return nil, fmt.Errorf("cannot open names file from '%s'",
|
||||
directory)
|
||||
}
|
||||
defer namefile.Close()
|
||||
|
||||
n := loadNameTable(namefile, taxonomy, onlysn)
|
||||
log.Printf("%d taxon names read\n", n)
|
||||
|
||||
//
|
||||
// Load the merged taxa
|
||||
//
|
||||
|
||||
log.Printf("Loading Merged taxa\n")
|
||||
|
||||
aliasfile, aerr := os.Open(path.Join(directory, "merged.dmp"))
|
||||
if aerr != nil {
|
||||
return nil, fmt.Errorf("cannot open merged file from '%s'",
|
||||
directory)
|
||||
}
|
||||
defer aliasfile.Close()
|
||||
|
||||
buffered = bufio.NewReader(aliasfile)
|
||||
n = loadMergedTable(buffered, taxonomy)
|
||||
log.Printf("%d merged taxa read\n", n)
|
||||
|
||||
root := taxonomy.Taxon("1")
|
||||
taxonomy.SetRoot(root)
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
||||
@@ -172,15 +172,15 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||
// - error: An error if any occurred during the reading process.
|
||||
func ReadSequencesFromFile(filename string,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
var file *Reader
|
||||
var file *obiutils.Reader
|
||||
var reader io.Reader
|
||||
var err error
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
file, err = Ropen(filename)
|
||||
file, err = obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
||||
@@ -1,437 +0,0 @@
|
||||
// This is an integration of the xopen package originally written by Brent Pedersen
|
||||
// (https://github.com/brentp/xopen).
|
||||
//
|
||||
// This copy can be considered a fork of the version maintained by [Wei Shen](http://shenwei.me):
|
||||
//
|
||||
// https://github.com/shenwei356/xopen
|
||||
//
|
||||
// Package xopen makes it easy to get buffered readers and writers.
|
||||
// Ropen opens a (possibly gzipped) file/process/http site for buffered reading.
|
||||
// Wopen opens a (possibly gzipped) file for buffered writing.
|
||||
// Both will use gzip when appropriate and will use buffered IO.
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/dsnet/compress/bzip2"
|
||||
"github.com/klauspost/compress/zstd"
|
||||
gzip "github.com/klauspost/pgzip"
|
||||
"github.com/ulikunitz/xz"
|
||||
)
|
||||
|
||||
// Level is the default compression level of gzip.
|
||||
// This value will be automatically adjusted to the default value of zstd or bzip2.
|
||||
var Level = gzip.DefaultCompression
|
||||
|
||||
// ErrNoContent means nothing in the stream/file.
|
||||
var ErrNoContent = errors.New("xopen: no content")
|
||||
|
||||
// ErrDirNotSupported means the path is a directory.
|
||||
var ErrDirNotSupported = errors.New("xopen: input is a directory")
|
||||
|
||||
// IsGzip returns true buffered Reader has the gzip magic.
|
||||
func IsGzip(b *bufio.Reader) (bool, error) {
|
||||
return CheckBytes(b, []byte{0x1f, 0x8b})
|
||||
}
|
||||
|
||||
// IsXz returns true buffered Reader has the xz magic.
|
||||
func IsXz(b *bufio.Reader) (bool, error) {
|
||||
return CheckBytes(b, []byte{0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00})
|
||||
}
|
||||
|
||||
// IsZst returns true buffered Reader has the zstd magic.
|
||||
func IsZst(b *bufio.Reader) (bool, error) {
|
||||
return CheckBytes(b, []byte{0x28, 0xB5, 0x2f, 0xfd})
|
||||
}
|
||||
|
||||
// IsBzip2 returns true buffered Reader has the bzip2 magic.
|
||||
func IsBzip2(b *bufio.Reader) (bool, error) {
|
||||
return CheckBytes(b, []byte{0x42, 0x5a, 0x68})
|
||||
}
|
||||
|
||||
// IsStdin reports whether standard input is something other than a character
// device (for instance a pipe or a redirected file), which is taken to mean
// that data is being fed to the program. It returns false when stdin cannot
// be stat'ed.
//
// See http://stackoverflow.com/a/26567513
func IsStdin() bool {
	info, err := os.Stdin.Stat()
	if err == nil {
		return info.Mode()&os.ModeCharDevice == 0
	}
	return false
}
|
||||
|
||||
// ExpandUser expands a leading "~" or "~user" in path to the corresponding
// home directory.
//
//   - "~" and "~/rest" use the home directory of the current user.
//   - "~name" and "~name/rest" use the home directory of user "name".
//   - Any path that does not start with '~' (including "") is returned as is.
//
// An error is returned when the user cannot be resolved.
//
// The user name is removed from the result: "~name/rest" used to expand to
// "<home of name>/name/rest".
func ExpandUser(path string) (string, error) {
	if len(path) == 0 || path[0] != '~' {
		return path, nil
	}

	// Split "~name/rest" into the user name and the remainder ("/rest").
	name, rest := path[1:], ""
	if i := strings.IndexByte(path, '/'); i >= 0 {
		name, rest = path[1:i], path[i:]
	}

	var (
		u   *user.User
		err error
	)
	if name == "" {
		u, err = user.Current()
	} else {
		u, err = user.Lookup(name)
	}
	if err != nil {
		return "", err
	}

	// filepath.Join also collapses the doubled separator the previous string
	// concatenation produced for "~/rest".
	return filepath.Join(u.HomeDir, rest), nil
}
|
||||
|
||||
// Exists checks if a local file exits
|
||||
func Exists(path string) bool {
|
||||
path, perr := ExpandUser(path)
|
||||
if perr != nil {
|
||||
return false
|
||||
}
|
||||
_, err := os.Stat(path)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// CheckBytes reports whether the next len(buf) bytes available from b are
// equal to buf. The stream is only peeked at, so nothing is consumed.
//
// When fewer than len(buf) bytes can be peeked, the error returned by Peek
// (typically io.EOF) is passed through together with false.
func CheckBytes(b *bufio.Reader, buf []byte) (bool, error) {
	head, err := b.Peek(len(buf))
	if err != nil {
		return false, err // EOF
	}

	for i, want := range buf {
		if head[i] != want {
			return false, nil
		}
	}

	return true, nil
}
|
||||
|
||||
// Reader is returned by Ropen and Buf. It is a buffered reader over an input
// stream, with an optional decompression layer in between.
type Reader struct {
	// Reader is the buffered reader callers read the (decoded) data from.
	*bufio.Reader
	// rdr is the underlying source; it is closed by Close when it implements
	// io.Closer.
	rdr io.Reader
	// gz is the decompression layer to close before rdr, or nil when there is
	// none that needs closing.
	gz io.ReadCloser
}

// Close closes the decompression layer (if any) and then the underlying
// source when it is closable.
//
// Both are always closed, even when the first Close fails, so that the
// underlying file or connection is never leaked. The first error encountered
// is returned.
func (r *Reader) Close() error {
	var first error

	if r.gz != nil {
		first = r.gz.Close()
	}

	if c, ok := r.rdr.(io.Closer); ok {
		if err := c.Close(); err != nil && first == nil {
			first = err
		}
	}

	return first
}
|
||||
|
||||
// Writer is returned by Wopen
|
||||
type Writer struct {
|
||||
*bufio.Writer
|
||||
wtr *os.File
|
||||
gz *gzip.Writer
|
||||
xw *xz.Writer
|
||||
zw *zstd.Encoder
|
||||
bz2 *bzip2.Writer
|
||||
}
|
||||
|
||||
// Close the associated files.
|
||||
func (w *Writer) Close() error {
|
||||
var err error
|
||||
err = w.Flush()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if w.gz != nil {
|
||||
err = w.gz.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if w.xw != nil {
|
||||
err = w.xw.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if w.zw != nil {
|
||||
err = w.zw.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if w.bz2 != nil {
|
||||
err = w.bz2.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return w.wtr.Close()
|
||||
}
|
||||
|
||||
// Flush the writer.
|
||||
func (w *Writer) Flush() error {
|
||||
var err error
|
||||
err = w.Writer.Flush()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if w.gz != nil {
|
||||
err = w.gz.Flush()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if w.zw != nil {
|
||||
err = w.zw.Flush()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
var bufSize = 65536
|
||||
|
||||
// Buf returns a buffered reader from an io.Reader
|
||||
// If f == "-", then it will attempt to read from os.Stdin.
|
||||
// If the file is gzipped, it will be read as such.
|
||||
func Buf(r io.Reader) (*Reader, error) {
|
||||
b := bufio.NewReaderSize(r, bufSize)
|
||||
var rd io.Reader
|
||||
var rdr io.ReadCloser
|
||||
|
||||
if is, err := IsGzip(b); err != nil {
|
||||
// check BOM
|
||||
t, _, err := b.ReadRune() // no content
|
||||
if err != nil {
|
||||
return nil, ErrNoContent
|
||||
}
|
||||
if t != '\uFEFF' {
|
||||
b.UnreadRune()
|
||||
}
|
||||
return &Reader{b, r, rdr}, nil // non-gzip file with content less than 2 bytes
|
||||
} else if is {
|
||||
rdr, err = gzip.NewReader(b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b = bufio.NewReaderSize(rdr, bufSize)
|
||||
} else if is, err := IsZst(b); err != nil {
|
||||
// check BOM
|
||||
t, _, err := b.ReadRune() // no content
|
||||
if err != nil {
|
||||
return nil, ErrNoContent
|
||||
}
|
||||
if t != '\uFEFF' {
|
||||
b.UnreadRune()
|
||||
}
|
||||
return &Reader{b, r, rdr}, nil // non-gzip/zst file with content less than 4 bytes
|
||||
} else if is {
|
||||
rd, err = zstd.NewReader(b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b = bufio.NewReaderSize(rd, bufSize)
|
||||
} else if is, err := IsXz(b); err != nil {
|
||||
// check BOM
|
||||
t, _, err := b.ReadRune() // no content
|
||||
if err != nil {
|
||||
return nil, ErrNoContent
|
||||
}
|
||||
if t != '\uFEFF' {
|
||||
b.UnreadRune()
|
||||
}
|
||||
return &Reader{b, r, rdr}, nil // non-gzip/zst/xz file with content less than 6 bytes
|
||||
} else if is {
|
||||
rd, err = xz.NewReader(b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b = bufio.NewReaderSize(rd, bufSize)
|
||||
} else if is, err := IsBzip2(b); err != nil {
|
||||
// check BOM
|
||||
t, _, err := b.ReadRune() // no content
|
||||
if err != nil {
|
||||
return nil, ErrNoContent
|
||||
}
|
||||
if t != '\uFEFF' {
|
||||
b.UnreadRune()
|
||||
}
|
||||
return &Reader{b, r, rdr}, nil // non-gzip/zst/xz file with content less than 6 bytes
|
||||
} else if is {
|
||||
rd, err = bzip2.NewReader(b, &bzip2.ReaderConfig{})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b = bufio.NewReaderSize(rd, bufSize)
|
||||
}
|
||||
|
||||
// other files with content >= 6 bytes
|
||||
|
||||
// check BOM
|
||||
t, _, err := b.ReadRune()
|
||||
if err != nil {
|
||||
return nil, ErrNoContent
|
||||
}
|
||||
if t != '\uFEFF' {
|
||||
b.UnreadRune()
|
||||
}
|
||||
return &Reader{b, r, rdr}, nil
|
||||
}
|
||||
|
||||
// XReader returns a reader from a url string or a file.
|
||||
func XReader(f string) (io.Reader, error) {
|
||||
if strings.HasPrefix(f, "http://") || strings.HasPrefix(f, "https://") {
|
||||
var rsp *http.Response
|
||||
rsp, err := http.Get(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if rsp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("http error downloading %s. status: %s", f, rsp.Status)
|
||||
}
|
||||
rdr := rsp.Body
|
||||
return rdr, nil
|
||||
}
|
||||
f, err := ExpandUser(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fi, err := os.Stat(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if fi.IsDir() {
|
||||
return nil, ErrDirNotSupported
|
||||
}
|
||||
|
||||
return os.Open(f)
|
||||
}
|
||||
|
||||
// Ropen opens a buffered reader.
|
||||
func Ropen(f string) (*Reader, error) {
|
||||
var err error
|
||||
var rdr io.Reader
|
||||
if f == "-" {
|
||||
if !IsStdin() {
|
||||
return nil, errors.New("stdin not detected")
|
||||
}
|
||||
b, err := Buf(os.Stdin)
|
||||
return b, err
|
||||
} else if f[0] == '|' {
|
||||
// TODO: use csv to handle quoted file names.
|
||||
cmdStrs := strings.Split(f[1:], " ")
|
||||
var cmd *exec.Cmd
|
||||
if len(cmdStrs) == 2 {
|
||||
cmd = exec.Command(cmdStrs[0], cmdStrs[1:]...)
|
||||
} else {
|
||||
cmd = exec.Command(cmdStrs[0])
|
||||
}
|
||||
rdr, err = cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
err = cmd.Start()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
rdr, err = XReader(f)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b, err := Buf(rdr)
|
||||
return b, err
|
||||
}
|
||||
|
||||
// Wopen opens a buffered reader.
|
||||
// If f == "-", then stdout will be used.
|
||||
// If f endswith ".gz", then the output will be gzipped.
|
||||
// If f endswith ".xz", then the output will be zx-compressed.
|
||||
// If f endswith ".zst", then the output will be zstd-compressed.
|
||||
// If f endswith ".bz2", then the output will be bzip2-compressed.
|
||||
func Wopen(f string) (*Writer, error) {
|
||||
return WopenFile(f, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0666)
|
||||
}
|
||||
|
||||
// WopenFile opens a buffered reader.
|
||||
// If f == "-", then stdout will be used.
|
||||
// If f endswith ".gz", then the output will be gzipped.
|
||||
// If f endswith ".xz", then the output will be zx-compressed.
|
||||
// If f endswith ".bz2", then the output will be bzip2-compressed.
|
||||
func WopenFile(f string, flag int, perm os.FileMode) (*Writer, error) {
|
||||
var wtr *os.File
|
||||
if f == "-" {
|
||||
wtr = os.Stdout
|
||||
} else {
|
||||
dir := filepath.Dir(f)
|
||||
fi, err := os.Stat(dir)
|
||||
if err == nil && !fi.IsDir() {
|
||||
return nil, fmt.Errorf("can not write file into a non-directory path: %s", dir)
|
||||
}
|
||||
if os.IsNotExist(err) {
|
||||
os.MkdirAll(dir, 0755)
|
||||
}
|
||||
wtr, err = os.OpenFile(f, flag, perm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
f2 := strings.ToLower(f)
|
||||
if strings.HasSuffix(f2, ".gz") {
|
||||
gz, err := gzip.NewWriterLevel(wtr, Level)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("xopen: %s", err)
|
||||
}
|
||||
return &Writer{bufio.NewWriterSize(gz, bufSize), wtr, gz, nil, nil, nil}, err
|
||||
}
|
||||
if strings.HasSuffix(f2, ".xz") {
|
||||
xw, err := xz.NewWriter(wtr)
|
||||
return &Writer{bufio.NewWriterSize(xw, bufSize), wtr, nil, xw, nil, nil}, err
|
||||
}
|
||||
if strings.HasSuffix(f2, ".zst") {
|
||||
level := Level
|
||||
if level == gzip.DefaultCompression {
|
||||
level = 2
|
||||
}
|
||||
zw, err := zstd.NewWriter(wtr, zstd.WithEncoderLevel(zstd.EncoderLevel(level)))
|
||||
if err != nil {
|
||||
err = fmt.Errorf("xopen: zstd: %s", err)
|
||||
}
|
||||
return &Writer{bufio.NewWriterSize(zw, bufSize), wtr, nil, nil, zw, nil}, err
|
||||
}
|
||||
if strings.HasSuffix(f2, ".bz2") {
|
||||
level := Level
|
||||
if level == gzip.DefaultCompression {
|
||||
level = 6
|
||||
}
|
||||
bz2, err := bzip2.NewWriter(wtr, &bzip2.WriterConfig{Level: level})
|
||||
if err != nil {
|
||||
err = fmt.Errorf("xopen: %s", err)
|
||||
}
|
||||
return &Writer{bufio.NewWriterSize(bz2, bufSize), wtr, nil, nil, nil, bz2}, err
|
||||
}
|
||||
return &Writer{bufio.NewWriterSize(wtr, bufSize), wtr, nil, nil, nil, nil}, nil
|
||||
}
|
||||
@@ -1,148 +0,0 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
. "gopkg.in/check.v1"
|
||||
)
|
||||
|
||||
// Test is the "go test" entry point: it hands control to gocheck's TestingT.
func Test(t *testing.T) {
	TestingT(t)
}
|
||||
|
||||
type XopenTest struct{}
|
||||
|
||||
var _ = Suite(&XopenTest{})
|
||||
|
||||
// gzFromString returns s compressed as a complete gzip stream.
//
// The gzip writer is closed before the buffer is read so that the compressed
// data and the trailer are actually emitted; without it the result was a
// truncated stream. It panics if compression fails, which cannot happen with
// an in-memory buffer.
func gzFromString(s string) string {
	var c bytes.Buffer
	gz := gzip.NewWriter(&c)
	if _, err := gz.Write([]byte(s)); err != nil {
		panic(err)
	}
	if err := gz.Close(); err != nil {
		panic(err)
	}
	return c.String()
}
|
||||
|
||||
var gzTests = []struct {
|
||||
isGz bool
|
||||
data string
|
||||
}{
|
||||
{false, "asdf"},
|
||||
{true, gzFromString("asdf")},
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestIsGzip(c *C) {
|
||||
for _, t := range gzTests {
|
||||
isGz, err := IsGzip(bufio.NewReader(strings.NewReader(t.data)))
|
||||
c.Assert(err, IsNil)
|
||||
c.Assert(t.isGz, Equals, isGz)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestIsStdin(c *C) {
|
||||
r := IsStdin()
|
||||
c.Assert(r, Equals, false)
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestRopen(c *C) {
|
||||
rdr, err := Ropen("-")
|
||||
c.Assert(err, ErrorMatches, "stdin not detected")
|
||||
c.Assert(rdr, IsNil)
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestWopen(c *C) {
|
||||
for _, f := range []string{"t.gz", "t"} {
|
||||
testString := "ASDF1234"
|
||||
wtr, err := Wopen(f)
|
||||
c.Assert(err, IsNil)
|
||||
_, err = os.Stat(f)
|
||||
c.Assert(err, IsNil)
|
||||
c.Assert(wtr.wtr, NotNil)
|
||||
fmt.Fprint(wtr, testString)
|
||||
wtr.Close()
|
||||
|
||||
rdr, err := Ropen(f)
|
||||
c.Assert(err, IsNil)
|
||||
|
||||
str, err := rdr.ReadString(99)
|
||||
c.Assert(str, Equals, testString)
|
||||
c.Assert(err, Equals, io.EOF)
|
||||
str, _ = rdr.ReadString(99)
|
||||
c.Assert(str, Equals, "")
|
||||
|
||||
rdr.Close()
|
||||
os.Remove(f)
|
||||
}
|
||||
}
|
||||
|
||||
var httpTests = []struct {
|
||||
url string
|
||||
expectError bool
|
||||
}{
|
||||
{"https://raw.githubusercontent.com/brentp/xopen/master/README.md", false},
|
||||
{"http://raw.githubusercontent.com/brentp/xopen/master/README.md", false},
|
||||
{"http://raw.githubusercontent.com/brentp/xopen/master/BAD.md", true},
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestReadHttp(c *C) {
|
||||
for _, t := range httpTests {
|
||||
rdr, err := Ropen(t.url)
|
||||
if !t.expectError {
|
||||
c.Assert(err, IsNil)
|
||||
v, err := rdr.ReadString(byte('\n'))
|
||||
c.Assert(err, IsNil)
|
||||
c.Assert(len(v), Not(Equals), 0)
|
||||
} else {
|
||||
c.Assert(err, ErrorMatches, ".* 404 Not Found")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// func (s *XopenTest) TestReadProcess(c *C) {
|
||||
// for _, cmd := range []string{"|ls -lh", "|ls", "|ls -lh xopen_test.go"} {
|
||||
// rdr, err := Ropen(cmd)
|
||||
// c.Assert(err, IsNil)
|
||||
// b := make([]byte, 1000)
|
||||
// _, err = rdr.Read(b)
|
||||
// if err != io.EOF {
|
||||
// c.Assert(err, IsNil)
|
||||
// }
|
||||
// lines := strings.Split(string(b), "\n")
|
||||
// has := false
|
||||
// for _, line := range lines {
|
||||
// if strings.Contains(line, "xopen_test.go") {
|
||||
// has = true
|
||||
// }
|
||||
// }
|
||||
// c.Assert(has, Equals, true)
|
||||
// }
|
||||
// }
|
||||
|
||||
func (s *XopenTest) TestOpenStdout(c *C) {
|
||||
w, err := Wopen("-")
|
||||
c.Assert(err, IsNil)
|
||||
c.Assert(w.wtr, Equals, os.Stdout)
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestOpenBadFile(c *C) {
|
||||
r, err := Ropen("XXXXXXXXXXXXXXXXXXXXXXX")
|
||||
c.Assert(r, IsNil)
|
||||
c.Assert(err, ErrorMatches, ".*no such file.*")
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestExists(c *C) {
|
||||
c.Assert(Exists("xopen.go"), Equals, true)
|
||||
c.Assert(Exists("____xx"), Equals, false)
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestUser(c *C) {
|
||||
c.Assert(Exists("~"), Equals, true)
|
||||
}
|
||||
|
||||
func (s *XopenTest) TestExpand(c *C) {
|
||||
_, err := ExpandUser("~baduser66")
|
||||
c.Assert(err, Not(IsNil))
|
||||
}
|
||||
Reference in New Issue
Block a user