Adds the ability to read gzip-tar file for the taxonomy dump

This commit is contained in:
Eric Coissac
2025-01-24 11:47:59 +01:00
parent ffd67252c3
commit 3137c1f841
17 changed files with 305 additions and 64 deletions

View File

@ -2,6 +2,14 @@
## Latest changes
### Breaking changes
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
has been renamed to **--taxonomy**.
### Bug fixes
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
@ -13,6 +21,10 @@
### New features
- The NCBI Taxonomy dump no longer needs to be uncompressed and unarchived. The
path of the gzipped tar dump file can be specified directly using the
**--taxonomy** option.
- Most of the time, obitools automatically identifies the sequence file format,
but detection sometimes fails. Two new options, **--fasta** and **--fastq**, have
been added to allow the processing of the rare fasta and fastq files that are not recognized.

View File

@ -1,36 +1,12 @@
package main
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"os"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat"
)
// main is a scratch driver. It logs the result of a few obifp left shifts,
// prints the normalized k-mers computed by obikmer for a short test sequence,
// and finally runs taxonomy format detection on the path given as the first
// command-line argument.
func main() {
	one := obifp.OneUint[obifp.Uint128]()
	a, b := obifp.OneUint[obifp.Uint64]().LeftShift64(66, 0)
	log.Infof("one: %v, %v", a, b)
	shift := one.LeftShift(66)
	log.Infof("one: %v", shift)

	seq := obiseq.NewBioSequence("test", []byte("atcgggttccaacc"), "")
	kmermap := obikmer.NewKmerMap[obifp.Uint128](
		obiseq.BioSequenceSlice{
			seq,
		},
		7,
		true,
		10,
	)

	kmers := kmermap.NormalizedKmerSlice(seq, nil)
	for _, kmer := range kmers {
		println(kmermap.KmerAsString(kmer))
	}

	// Indexing os.Args[1] without this guard panics when the program is
	// started without any argument.
	if len(os.Args) < 2 {
		log.Fatalf("usage: %s <taxonomy path>", os.Args[0])
	}

	// Only the outcome of the detection matters here; the returned loader is
	// not used, but a detection failure must not go unnoticed.
	if _, err := obitaxformat.DetectTaxonomyFormat(os.Args[1]); err != nil {
		log.Fatalf("cannot detect taxonomy format of %s: %v", os.Args[1], err)
	}
}

View File

@ -157,9 +157,9 @@ func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, err
func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
file, err := Ropen(filename)
file, err := obiutils.Ropen(filename)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}
@ -173,9 +173,9 @@ func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequen
func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
input, err := Buf(os.Stdin)
input, err := obiutils.Buf(os.Stdin)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("stdin is empty")
return ReadEmptyFile(options...)
}

View File

@ -227,9 +227,9 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = Ropen(filename)
reader, err = obiutils.Ropen(filename)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}

View File

@ -271,9 +271,9 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
file, err := Ropen(filename)
file, err := obiutils.Ropen(filename)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}
@ -287,9 +287,9 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource("stdin"))
input, err := Buf(os.Stdin)
input, err := obiutils.Buf(os.Stdin)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("stdin is empty")
return ReadEmptyFile(options...)
}

View File

@ -370,9 +370,9 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
file, err := Ropen(filename)
file, err := obiutils.Ropen(filename)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}
@ -386,9 +386,9 @@ func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
input, err := Buf(os.Stdin)
input, err := obiutils.Buf(os.Stdin)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("stdin is empty")
return ReadEmptyFile(options...)
}

View File

@ -266,9 +266,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = Ropen(filename)
reader, err = obiutils.Ropen(filename)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}

View File

@ -172,15 +172,15 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
// - error: An error if any occurred during the reading process.
func ReadSequencesFromFile(filename string,
options ...WithOption) (obiiter.IBioSequence, error) {
var file *Reader
var file *obiutils.Reader
var reader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
file, err = Ropen(filename)
file, err = obiutils.Ropen(filename)
if err == ErrNoContent {
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}

View File

@ -6,8 +6,8 @@ import (
"os"
"runtime"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat"
log "github.com/sirupsen/logrus"
"github.com/DavidGamba/go-getoptions"
@ -32,7 +32,7 @@ var _Quality_Shift_Input = byte(33)
var _Quality_Shift_Output = byte(33)
var _Read_Qualities = true
var __taxdump__ = ""
var __taxonomy__ = ""
var __alternative_name__ = false
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
@ -131,8 +131,8 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
}
if options.Called("taxdump") {
taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(),
if options.Called("taxonomy") {
taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(),
!CLIAreAlternativeNamesSelected())
if err != nil {
log.Fatalf("Loading taxonomy error: %v", err)
@ -186,14 +186,14 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) {
if required {
options.StringVar(&__taxdump__, "taxdump", "",
options.StringVar(&__taxonomy__, "taxonomy", "",
options.Alias("t"),
options.Required(),
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
options.Description("Path to the taxonomy database."))
} else {
options.StringVar(&__taxdump__, "taxdump", "",
options.StringVar(&__taxonomy__, "taxonomy", "",
options.Alias("t"),
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
options.Description("Path to the taxonomy database."))
}
if alternatiive {
options.BoolVar(&__alternative_name__, "alternative-names", false,
@ -462,12 +462,12 @@ func SetParallelFilesRead(n int) {
_ParallelFilesRead = n
}
func CLISelectedNCBITaxDump() string {
return __taxdump__
func CLISelectedTaxonomy() string {
return __taxonomy__
}
func CLIHasSelectedTaxonomy() bool {
return __taxdump__ != ""
return __taxonomy__ != ""
}
func CLIAreAlternativeNamesSelected() bool {
@ -479,8 +479,8 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
return obitax.DefaultTaxonomy(), nil
}
if CLISelectedNCBITaxDump() != "" {
taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(),
if CLISelectedTaxonomy() != "" {
taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(),
!CLIAreAlternativeNamesSelected())
if err != nil {
return nil, err
@ -489,5 +489,5 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
return taxonomy, nil
}
return nil, errors.New("no NCBI taxdump selected using option -t|--taxdump")
return nil, errors.New("no taxonomy selected using option -t|--taxonomy")
}

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "757448c"
var _Commit = "ffd6725"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -0,0 +1,142 @@
package ncbitaxdump
import (
"archive/tar"
"bufio"
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// IsNCBITarTaxDump reports whether the file at path is a tar archive that
// contains, as regular files, every member expected in an NCBI taxonomy dump:
// citations.dmp, division.dmp, gencode.dmp, names.dmp, delnodes.dmp, gc.prt,
// merged.dmp and nodes.dmp.
//
// It returns false when the file cannot be opened. Any error met while
// walking the archive simply ends the scan; the answer is then based on the
// entries seen so far.
func IsNCBITarTaxDump(path string) bool {
	archive, err := obiutils.Ropen(path)
	if err != nil {
		return false
	}
	defer archive.Close()

	// Members still to be found; an entry is removed as soon as it is seen.
	missing := map[string]struct{}{
		"citations.dmp": {},
		"division.dmp":  {},
		"gencode.dmp":   {},
		"names.dmp":     {},
		"delnodes.dmp":  {},
		"gc.prt":        {},
		"merged.dmp":    {},
		"nodes.dmp":     {},
	}

	entries := tar.NewReader(archive)
	for {
		entry, err := entries.Next()
		if err != nil {
			// End of archive or unreadable archive: stop scanning.
			break
		}

		// Only regular files count; deleting an unknown name is a no-op.
		if entry.Typeflag == tar.TypeReg {
			delete(missing, entry.Name)
		}
	}

	return len(missing) == 0
}
// LoadNCBITarTaxDump builds a taxonomy from the NCBI taxdump tar archive
// located at path.
//
// Three members of the archive are read in turn: nodes.dmp, names.dmp and
// merged.dmp. The archive is reopened for each of them because a member is
// located by scanning the archive from its beginning. The onlysn flag is
// forwarded unchanged to the name table loader. Once the three tables are
// loaded, the taxon with taxid "1" is set as the root of the taxonomy.
//
// An error is returned when the archive cannot be opened or when one of the
// three members cannot be located; the underlying cause is wrapped.
func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) {
	taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

	//
	// Load the Taxonomy nodes
	//
	log.Printf("Loading Taxonomy nodes\n")

	err := withNCBITarMember(path, "nodes.dmp", "nodes", func(member *tar.Reader) {
		loadNodeTable(bufio.NewReader(member), taxonomy)
		log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
	})
	if err != nil {
		return nil, err
	}

	//
	// Load the Taxon names
	//
	log.Printf("Loading Taxon names\n")

	err = withNCBITarMember(path, "names.dmp", "names", func(member *tar.Reader) {
		n := loadNameTable(member, taxonomy, onlysn)
		log.Printf("%d taxon names read\n", n)
	})
	if err != nil {
		return nil, err
	}

	//
	// Load the merged taxa
	//
	log.Printf("Loading Merged taxa\n")

	err = withNCBITarMember(path, "merged.dmp", "merged", func(member *tar.Reader) {
		n := loadMergedTable(bufio.NewReader(member), taxonomy)
		log.Printf("%d merged taxa read\n", n)
	})
	if err != nil {
		return nil, err
	}

	root := taxonomy.Taxon("1")
	taxonomy.SetRoot(root)

	return taxonomy, nil
}

// withNCBITarMember opens the archive at path, positions a tar reader on the
// member called member, passes that reader to load, and closes the archive
// before returning, whatever the outcome. label is the short name of the
// member used in the error message.
func withNCBITarMember(path, member, label string, load func(*tar.Reader)) error {
	file, err := obiutils.Ropen(path)
	if err != nil {
		return fmt.Errorf("cannot open taxonomy file from '%s': %w", path, err)
	}
	// The deferred close guarantees the handle is released on every path,
	// including the last member read.
	defer file.Close()

	reader, err := obiutils.TarFileReader(file, member)
	if err != nil {
		return fmt.Errorf("cannot open %s file from '%s': %w", label, path, err)
	}

	load(reader)

	return nil
}

View File

@ -0,0 +1,91 @@
package obitaxformat
import (
"fmt"
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat/ncbitaxdump"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/gabriel-vasile/mimetype"
log "github.com/sirupsen/logrus"
)
// TaxonomyLoader is the signature shared by the functions able to load a
// taxonomy from a path. It returns the loaded taxonomy, or an error.
// NOTE(review): onlysn presumably restricts loading to scientific names
// (callers pass the negation of the "alternative names" option) — confirm.
type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error)
// DetectTaxonomyTarFormat selects the loader matching the tar archive located
// at path. The NCBI taxdump layout is the only one recognised; any other
// archive yields an "unknown taxonomy format" error.
func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {
	if !ncbitaxdump.IsNCBITarTaxDump(path) {
		return nil, fmt.Errorf("unknown taxonomy format: %s", path)
	}

	log.Infof("NCBI Taxdump Tar Archive detected: %s", path)

	return ncbitaxdump.LoadNCBITarTaxDump, nil
}
// DetectTaxonomyFormat inspects path and returns the loader able to read the
// taxonomy stored there.
//
//   - A directory is assumed to be an uncompressed NCBI taxdump.
//   - A file detected as "text/csv" is handled by LoadCSVTaxonomy.
//   - A file detected as "application/x-tar" is delegated to
//     DetectTaxonomyTarFormat.
//
// An error is returned when path cannot be examined or opened, when the MIME
// type detection fails, or when the detected type is none of the above.
func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {
	fileInfo, err := os.Stat(path)
	if err != nil {
		return nil, err
	}

	if fileInfo.IsDir() {
		// For the moment, we only support NCBI Taxdump directory format
		log.Infof("NCBI Taxdump detected: %s", path)
		return ncbitaxdump.LoadNCBITaxDump, nil
	}

	file, err := obiutils.Ropen(path)
	if err != nil {
		return nil, err
	}

	detected, err := mimetype.DetectReader(file)
	// The handle is only needed for the detection; release it before any
	// loader reopens the same path.
	file.Close()
	if err != nil {
		return nil, err
	}

	// Is compares the type/subtype only, so optional MIME parameters
	// (e.g. a charset) do not defeat the match.
	switch {
	case detected.Is("text/csv"):
		return LoadCSVTaxonomy, nil
	case detected.Is("application/x-tar"):
		return DetectTaxonomyTarFormat(path)
	}

	// Report the problem to the caller instead of terminating the process
	// from library code.
	return nil, fmt.Errorf("unsupported taxonomy file format %s: %s", detected.String(), path)
}
// LoadCSVTaxonomy is the placeholder loader for taxonomies stored as CSV.
// Reading this format is not implemented: the function always fails with an
// explicit error rather than returning a nil taxonomy together with a nil
// error, which callers would otherwise dereference.
func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
	return nil, fmt.Errorf("CSV taxonomy format is not supported yet: %s", path)
}
// LoadTaxonomy detects the format of the taxonomy stored at path and loads it
// with the matching loader. The onlysn flag is forwarded unchanged to that
// loader.
//
// It returns the detection error if any, an error when no loader could be
// selected, or whatever the selected loader returns.
func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
	loader, err := DetectTaxonomyFormat(path)
	if err != nil {
		return nil, err
	}

	// Calling a nil function value panics; turn that case into an error.
	if loader == nil {
		return nil, fmt.Errorf("no taxonomy loader available for: %s", path)
	}

	return loader(path, onlysn)
}

View File

@ -53,7 +53,7 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obicsv.ICSVRecord {
OptionsWithScientificName(CLIWithScientificName()),
OptionsWithPath(CLIWithPath()),
OptionsRawTaxid(CLIRawTaxid()),
OptionsSource(obioptions.CLISelectedNCBITaxDump()),
OptionsSource(obioptions.CLISelectedTaxonomy()),
)
return NewCSVTaxaIterator(iterator, options...)

20
pkg/obiutils/tar.go Normal file
View File

@ -0,0 +1,20 @@
package obiutils
import (
	"archive/tar"
	"fmt"
	"io"
)
// TarFileReader scans the tar archive read from file, from its current
// position, until it meets an entry whose name is exactly path. The returned
// tar reader is positioned on that entry, so reading from it yields the
// content of the member.
//
// A "file not found" error is returned when the end of the archive is reached
// without a match. Any other read error (for instance a corrupted archive) is
// reported as such, wrapping the underlying cause, instead of being disguised
// as a missing file.
func TarFileReader(file *Reader, path string) (*tar.Reader, error) {
	tarfile := tar.NewReader(file)

	for {
		header, err := tarfile.Next()
		if err == io.EOF {
			return nil, fmt.Errorf("file not found: %s", path)
		}
		if err != nil {
			return nil, fmt.Errorf("reading tar archive while looking for %s: %w", path, err)
		}

		if header.Name == path {
			return tarfile, nil
		}
	}
}

View File

@ -9,7 +9,7 @@
// Ropen opens a (possibly gzipped) file/process/http site for buffered reading.
// Wopen opens a (possibly gzipped) file for buffered writing.
// Both will use gzip when appropriate and will use buffered IO.
package obiformats
package obiutils
import (
"bufio"

View File

@ -1,4 +1,4 @@
package obiformats
package obiutils
import (
"bufio"