mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds the ability to read gzip-tar file for the taxonomy dump
This commit is contained in:
@ -2,6 +2,14 @@
|
||||
|
||||
## Latest changes
|
||||
|
||||
### Breaking changes
|
||||
|
||||
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
|
||||
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
|
||||
|
||||
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
|
||||
has been renamed to **--taxonomy**.
|
||||
|
||||
### Bug fixes
|
||||
|
||||
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
|
||||
@ -13,6 +21,10 @@
|
||||
|
||||
### New features
|
||||
|
||||
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
|
||||
path of the tar and gzipped dump file can be directly specified using the
|
||||
**--taxonomy** option.
|
||||
|
||||
- Most of the time obitools automatically identify the sequence file format. But
|
||||
it fails sometimes. Two new options **--fasta** and **--fastq** are added to
|
||||
allow the processing of the rare fasta and fastq files not recognized.
|
||||
|
@ -1,36 +1,12 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"os"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
one := obifp.OneUint[obifp.Uint128]()
|
||||
a, b := obifp.OneUint[obifp.Uint64]().LeftShift64(66, 0)
|
||||
log.Infof("one: %v, %v", a, b)
|
||||
shift := one.LeftShift(66)
|
||||
log.Infof("one: %v", shift)
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("atcgggttccaacc"), "")
|
||||
|
||||
kmermap := obikmer.NewKmerMap[obifp.Uint128](
|
||||
obiseq.BioSequenceSlice{
|
||||
seq,
|
||||
},
|
||||
7,
|
||||
true,
|
||||
10,
|
||||
)
|
||||
|
||||
kmers := kmermap.NormalizedKmerSlice(seq, nil)
|
||||
|
||||
for _, kmer := range kmers {
|
||||
println(kmermap.KmerAsString(kmer))
|
||||
}
|
||||
|
||||
obitaxformat.DetectTaxonomyFormat(os.Args[1])
|
||||
}
|
||||
|
@ -157,9 +157,9 @@ func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, err
|
||||
func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
file, err := Ropen(filename)
|
||||
file, err := obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
@ -173,9 +173,9 @@ func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequen
|
||||
|
||||
func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
||||
input, err := Buf(os.Stdin)
|
||||
input, err := obiutils.Buf(os.Stdin)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("stdin is empty")
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
@ -227,9 +227,9 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
reader, err = Ropen(filename)
|
||||
reader, err = obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
@ -271,9 +271,9 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
file, err := Ropen(filename)
|
||||
file, err := obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
@ -287,9 +287,9 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
|
||||
|
||||
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource("stdin"))
|
||||
input, err := Buf(os.Stdin)
|
||||
input, err := obiutils.Buf(os.Stdin)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("stdin is empty")
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
@ -370,9 +370,9 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
file, err := Ropen(filename)
|
||||
file, err := obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
@ -386,9 +386,9 @@ func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
|
||||
|
||||
func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
||||
input, err := Buf(os.Stdin)
|
||||
input, err := obiutils.Buf(os.Stdin)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("stdin is empty")
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
@ -266,9 +266,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
reader, err = Ropen(filename)
|
||||
reader, err = obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
@ -172,15 +172,15 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||
// - error: An error if any occurred during the reading process.
|
||||
func ReadSequencesFromFile(filename string,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
var file *Reader
|
||||
var file *obiutils.Reader
|
||||
var reader io.Reader
|
||||
var err error
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
file, err = Ropen(filename)
|
||||
file, err = obiutils.Ropen(filename)
|
||||
|
||||
if err == ErrNoContent {
|
||||
if err == obiutils.ErrNoContent {
|
||||
log.Infof("file %s is empty", filename)
|
||||
return ReadEmptyFile(options...)
|
||||
}
|
||||
|
@ -6,8 +6,8 @@ import (
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
@ -32,7 +32,7 @@ var _Quality_Shift_Input = byte(33)
|
||||
var _Quality_Shift_Output = byte(33)
|
||||
var _Read_Qualities = true
|
||||
|
||||
var __taxdump__ = ""
|
||||
var __taxonomy__ = ""
|
||||
var __alternative_name__ = false
|
||||
|
||||
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
||||
@ -131,8 +131,8 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
||||
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
|
||||
}
|
||||
|
||||
if options.Called("taxdump") {
|
||||
taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(),
|
||||
if options.Called("taxonomy") {
|
||||
taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(),
|
||||
!CLIAreAlternativeNamesSelected())
|
||||
if err != nil {
|
||||
log.Fatalf("Loading taxonomy error: %v", err)
|
||||
@ -186,14 +186,14 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
||||
|
||||
func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) {
|
||||
if required {
|
||||
options.StringVar(&__taxdump__, "taxdump", "",
|
||||
options.StringVar(&__taxonomy__, "taxonomy", "",
|
||||
options.Alias("t"),
|
||||
options.Required(),
|
||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
||||
options.Description("Path to the taxonomy database."))
|
||||
} else {
|
||||
options.StringVar(&__taxdump__, "taxdump", "",
|
||||
options.StringVar(&__taxonomy__, "taxonomy", "",
|
||||
options.Alias("t"),
|
||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
||||
options.Description("Path to the taxonomy database."))
|
||||
}
|
||||
if alternatiive {
|
||||
options.BoolVar(&__alternative_name__, "alternative-names", false,
|
||||
@ -462,12 +462,12 @@ func SetParallelFilesRead(n int) {
|
||||
_ParallelFilesRead = n
|
||||
}
|
||||
|
||||
func CLISelectedNCBITaxDump() string {
|
||||
return __taxdump__
|
||||
func CLISelectedTaxonomy() string {
|
||||
return __taxonomy__
|
||||
}
|
||||
|
||||
func CLIHasSelectedTaxonomy() bool {
|
||||
return __taxdump__ != ""
|
||||
return __taxonomy__ != ""
|
||||
}
|
||||
|
||||
func CLIAreAlternativeNamesSelected() bool {
|
||||
@ -479,8 +479,8 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
|
||||
return obitax.DefaultTaxonomy(), nil
|
||||
}
|
||||
|
||||
if CLISelectedNCBITaxDump() != "" {
|
||||
taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(),
|
||||
if CLISelectedTaxonomy() != "" {
|
||||
taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(),
|
||||
!CLIAreAlternativeNamesSelected())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -489,5 +489,5 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
|
||||
return taxonomy, nil
|
||||
}
|
||||
|
||||
return nil, errors.New("no NCBI taxdump selected using option -t|--taxdump")
|
||||
return nil, errors.New("no taxonomy selected using option -t|--taxonomy")
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// committed
|
||||
|
||||
var _Commit = "757448c"
|
||||
var _Commit = "ffd6725"
|
||||
var _Version = "Release 4.2.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
142
pkg/obitaxformat/ncbitaxdump/readtar.go
Normal file
142
pkg/obitaxformat/ncbitaxdump/readtar.go
Normal file
@ -0,0 +1,142 @@
|
||||
package ncbitaxdump
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bufio"
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func IsNCBITarTaxDump(path string) bool {
|
||||
|
||||
file, err := obiutils.Ropen(path)
|
||||
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
defer file.Close()
|
||||
|
||||
citations := false
|
||||
division := false
|
||||
gencode := false
|
||||
names := false
|
||||
delnodes := false
|
||||
gc := false
|
||||
merged := false
|
||||
nodes := false
|
||||
|
||||
tarfile := tar.NewReader(file)
|
||||
|
||||
header, err := tarfile.Next()
|
||||
|
||||
for err == nil {
|
||||
name := header.Name
|
||||
|
||||
if header.Typeflag == tar.TypeReg {
|
||||
switch name {
|
||||
case "citations.dmp":
|
||||
citations = true
|
||||
case "division.dmp":
|
||||
division = true
|
||||
case "gencode.dmp":
|
||||
gencode = true
|
||||
case "names.dmp":
|
||||
names = true
|
||||
case "delnodes.dmp":
|
||||
delnodes = true
|
||||
case "gc.prt":
|
||||
gc = true
|
||||
case "merged.dmp":
|
||||
merged = true
|
||||
case "nodes.dmp":
|
||||
nodes = true
|
||||
}
|
||||
}
|
||||
header, err = tarfile.Next()
|
||||
}
|
||||
|
||||
return citations && division && gencode && names && delnodes && gc && merged && nodes
|
||||
}
|
||||
|
||||
func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
|
||||
taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxonomy nodes\n")
|
||||
|
||||
file, err := obiutils.Ropen(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
nodefile, err := obiutils.TarFileReader(file, "nodes.dmp")
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("cannot open nodes file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
buffered := bufio.NewReader(nodefile)
|
||||
loadNodeTable(buffered, taxonomy)
|
||||
log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
|
||||
file.Close()
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxon names\n")
|
||||
|
||||
file, err = obiutils.Ropen(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
namefile, nerr := obiutils.TarFileReader(file, "names.dmp")
|
||||
if nerr != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("cannot open names file from '%s'",
|
||||
path)
|
||||
}
|
||||
n := loadNameTable(namefile, taxonomy, onlysn)
|
||||
log.Printf("%d taxon names read\n", n)
|
||||
file.Close()
|
||||
|
||||
//
|
||||
// Load the merged taxa
|
||||
//
|
||||
|
||||
log.Printf("Loading Merged taxa\n")
|
||||
file, err = obiutils.Ropen(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp")
|
||||
if aerr != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("cannot open merged file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
buffered = bufio.NewReader(aliasfile)
|
||||
n = loadMergedTable(buffered, taxonomy)
|
||||
log.Printf("%d merged taxa read\n", n)
|
||||
|
||||
root := taxonomy.Taxon("1")
|
||||
taxonomy.SetRoot(root)
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
91
pkg/obitaxformat/taxonomy_read.go
Normal file
91
pkg/obitaxformat/taxonomy_read.go
Normal file
@ -0,0 +1,91 @@
|
||||
package obitaxformat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat/ncbitaxdump"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// TaxonomyLoader is the signature shared by the taxonomy loading functions:
// given the path of a taxonomy and the onlysn flag, it returns the loaded
// taxonomy or an error.
type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error)
|
||||
|
||||
func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {
|
||||
|
||||
switch {
|
||||
case ncbitaxdump.IsNCBITarTaxDump(path):
|
||||
log.Infof("NCBI Taxdump Tar Archive detected: %s", path)
|
||||
return ncbitaxdump.LoadNCBITarTaxDump, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unknown taxonomy format: %s", path)
|
||||
}
|
||||
|
||||
func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {
|
||||
|
||||
file, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fileInfo, err := file.Stat()
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
file.Close()
|
||||
|
||||
if fileInfo.IsDir() {
|
||||
// For the moment, we only support NCBI Taxdump directory format
|
||||
log.Infof("NCBI Taxdump detected: %s", path)
|
||||
return ncbitaxdump.LoadNCBITaxDump, nil
|
||||
} else {
|
||||
file, err := obiutils.Ropen(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mimetype, err := mimetype.DetectReader(file)
|
||||
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
file.Close()
|
||||
|
||||
switch mimetype.String() {
|
||||
case "text/csv":
|
||||
return LoadCSVTaxonomy, nil
|
||||
case "application/x-tar":
|
||||
return DetectTaxonomyTarFormat(path)
|
||||
}
|
||||
|
||||
log.Fatalf("Detected file format: %s", mimetype.String())
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
loader, err := DetectTaxonomyFormat(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
taxonomy, err := loader(path, onlysn)
|
||||
|
||||
return taxonomy, err
|
||||
}
|
@ -53,7 +53,7 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obicsv.ICSVRecord {
|
||||
OptionsWithScientificName(CLIWithScientificName()),
|
||||
OptionsWithPath(CLIWithPath()),
|
||||
OptionsRawTaxid(CLIRawTaxid()),
|
||||
OptionsSource(obioptions.CLISelectedNCBITaxDump()),
|
||||
OptionsSource(obioptions.CLISelectedTaxonomy()),
|
||||
)
|
||||
|
||||
return NewCSVTaxaIterator(iterator, options...)
|
||||
|
20
pkg/obiutils/tar.go
Normal file
20
pkg/obiutils/tar.go
Normal file
@ -0,0 +1,20 @@
|
||||
package obiutils
|
||||
|
||||
import (
	"archive/tar"
	"fmt"
	"io"
)
|
||||
|
||||
func TarFileReader(file *Reader, path string) (*tar.Reader, error) {
|
||||
tarfile := tar.NewReader(file)
|
||||
header, err := tarfile.Next()
|
||||
|
||||
for err == nil {
|
||||
if header.Name == path {
|
||||
return tarfile, nil
|
||||
}
|
||||
header, err = tarfile.Next()
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("file not found: %s", path)
|
||||
}
|
@ -9,7 +9,7 @@
|
||||
// Ropen opens a (possibly gzipped) file/process/http site for buffered reading.
|
||||
// Wopen opens a (possibly gzipped) file for buffered writing.
|
||||
// Both will use gzip when appropriate and will use buffered IO.
|
||||
package obiformats
|
||||
package obiutils
|
||||
|
||||
import (
|
||||
"bufio"
|
@ -1,4 +1,4 @@
|
||||
package obiformats
|
||||
package obiutils
|
||||
|
||||
import (
|
||||
"bufio"
|
Reference in New Issue
Block a user