mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds the ability to read gzip-tar file for the taxonomy dump
This commit is contained in:
@ -2,6 +2,14 @@
|
|||||||
|
|
||||||
## Latest changes
|
## Latest changes
|
||||||
|
|
||||||
|
### Breaking changes
|
||||||
|
|
||||||
|
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
|
||||||
|
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
|
||||||
|
|
||||||
|
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
|
||||||
|
has been renamed to **--taxonomy**.
|
||||||
|
|
||||||
### Bug fixes
|
### Bug fixes
|
||||||
|
|
||||||
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
|
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
|
||||||
@ -13,6 +21,10 @@
|
|||||||
|
|
||||||
### New features
|
### New features
|
||||||
|
|
||||||
|
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
|
||||||
|
path of the tar and gzipped dump file can be directly specified using the
|
||||||
|
**--taxonomy** option.
|
||||||
|
|
||||||
- Most of the time obitools identify automatically sequence file format. But
|
- Most of the time obitools identify automatically sequence file format. But
|
||||||
it fails sometimes. Two new options **--fasta** and **--fastq** are added to
|
it fails sometimes. Two new options **--fasta** and **--fastq** are added to
|
||||||
allow the processing of the rare fasta and fastq files not recognized.
|
allow the processing of the rare fasta and fastq files not recognized.
|
||||||
|
@ -1,36 +1,12 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
"os"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
one := obifp.OneUint[obifp.Uint128]()
|
obitaxformat.DetectTaxonomyFormat(os.Args[1])
|
||||||
a, b := obifp.OneUint[obifp.Uint64]().LeftShift64(66, 0)
|
|
||||||
log.Infof("one: %v, %v", a, b)
|
|
||||||
shift := one.LeftShift(66)
|
|
||||||
log.Infof("one: %v", shift)
|
|
||||||
|
|
||||||
seq := obiseq.NewBioSequence("test", []byte("atcgggttccaacc"), "")
|
|
||||||
|
|
||||||
kmermap := obikmer.NewKmerMap[obifp.Uint128](
|
|
||||||
obiseq.BioSequenceSlice{
|
|
||||||
seq,
|
|
||||||
},
|
|
||||||
7,
|
|
||||||
true,
|
|
||||||
10,
|
|
||||||
)
|
|
||||||
|
|
||||||
kmers := kmermap.NormalizedKmerSlice(seq, nil)
|
|
||||||
|
|
||||||
for _, kmer := range kmers {
|
|
||||||
println(kmermap.KmerAsString(kmer))
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -157,9 +157,9 @@ func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, err
|
|||||||
func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
|
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
file, err := Ropen(filename)
|
file, err := obiutils.Ropen(filename)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("file %s is empty", filename)
|
log.Infof("file %s is empty", filename)
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
@ -173,9 +173,9 @@ func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequen
|
|||||||
|
|
||||||
func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
||||||
input, err := Buf(os.Stdin)
|
input, err := obiutils.Buf(os.Stdin)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("stdin is empty")
|
log.Infof("stdin is empty")
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
|
@ -227,9 +227,9 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
|
|||||||
|
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
reader, err = Ropen(filename)
|
reader, err = obiutils.Ropen(filename)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("file %s is empty", filename)
|
log.Infof("file %s is empty", filename)
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
|
@ -271,9 +271,9 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
file, err := Ropen(filename)
|
file, err := obiutils.Ropen(filename)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("file %s is empty", filename)
|
log.Infof("file %s is empty", filename)
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
@ -287,9 +287,9 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
|
|||||||
|
|
||||||
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
options = append(options, OptionsSource("stdin"))
|
options = append(options, OptionsSource("stdin"))
|
||||||
input, err := Buf(os.Stdin)
|
input, err := obiutils.Buf(os.Stdin)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("stdin is empty")
|
log.Infof("stdin is empty")
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
|
@ -370,9 +370,9 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
file, err := Ropen(filename)
|
file, err := obiutils.Ropen(filename)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("file %s is empty", filename)
|
log.Infof("file %s is empty", filename)
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
@ -386,9 +386,9 @@ func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequ
|
|||||||
|
|
||||||
func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin")))
|
||||||
input, err := Buf(os.Stdin)
|
input, err := obiutils.Buf(os.Stdin)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("stdin is empty")
|
log.Infof("stdin is empty")
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
|
@ -266,9 +266,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
|||||||
|
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
reader, err = Ropen(filename)
|
reader, err = obiutils.Ropen(filename)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("file %s is empty", filename)
|
log.Infof("file %s is empty", filename)
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
|
@ -172,15 +172,15 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
|||||||
// - error: An error if any occurred during the reading process.
|
// - error: An error if any occurred during the reading process.
|
||||||
func ReadSequencesFromFile(filename string,
|
func ReadSequencesFromFile(filename string,
|
||||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
var file *Reader
|
var file *obiutils.Reader
|
||||||
var reader io.Reader
|
var reader io.Reader
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
file, err = Ropen(filename)
|
file, err = obiutils.Ropen(filename)
|
||||||
|
|
||||||
if err == ErrNoContent {
|
if err == obiutils.ErrNoContent {
|
||||||
log.Infof("file %s is empty", filename)
|
log.Infof("file %s is empty", filename)
|
||||||
return ReadEmptyFile(options...)
|
return ReadEmptyFile(options...)
|
||||||
}
|
}
|
||||||
|
@ -6,8 +6,8 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat"
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"github.com/DavidGamba/go-getoptions"
|
"github.com/DavidGamba/go-getoptions"
|
||||||
@ -32,7 +32,7 @@ var _Quality_Shift_Input = byte(33)
|
|||||||
var _Quality_Shift_Output = byte(33)
|
var _Quality_Shift_Output = byte(33)
|
||||||
var _Read_Qualities = true
|
var _Read_Qualities = true
|
||||||
|
|
||||||
var __taxdump__ = ""
|
var __taxonomy__ = ""
|
||||||
var __alternative_name__ = false
|
var __alternative_name__ = false
|
||||||
|
|
||||||
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
||||||
@ -131,8 +131,8 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
|||||||
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
|
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
|
||||||
}
|
}
|
||||||
|
|
||||||
if options.Called("taxdump") {
|
if options.Called("taxonomy") {
|
||||||
taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(),
|
taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(),
|
||||||
!CLIAreAlternativeNamesSelected())
|
!CLIAreAlternativeNamesSelected())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Loading taxonomy error: %v", err)
|
log.Fatalf("Loading taxonomy error: %v", err)
|
||||||
@ -186,14 +186,14 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
|||||||
|
|
||||||
func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) {
|
func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) {
|
||||||
if required {
|
if required {
|
||||||
options.StringVar(&__taxdump__, "taxdump", "",
|
options.StringVar(&__taxonomy__, "taxonomy", "",
|
||||||
options.Alias("t"),
|
options.Alias("t"),
|
||||||
options.Required(),
|
options.Required(),
|
||||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
options.Description("Path to the taxonomy database."))
|
||||||
} else {
|
} else {
|
||||||
options.StringVar(&__taxdump__, "taxdump", "",
|
options.StringVar(&__taxonomy__, "taxonomy", "",
|
||||||
options.Alias("t"),
|
options.Alias("t"),
|
||||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
options.Description("Path to the taxonomy database."))
|
||||||
}
|
}
|
||||||
if alternatiive {
|
if alternatiive {
|
||||||
options.BoolVar(&__alternative_name__, "alternative-names", false,
|
options.BoolVar(&__alternative_name__, "alternative-names", false,
|
||||||
@ -462,12 +462,12 @@ func SetParallelFilesRead(n int) {
|
|||||||
_ParallelFilesRead = n
|
_ParallelFilesRead = n
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLISelectedNCBITaxDump() string {
|
func CLISelectedTaxonomy() string {
|
||||||
return __taxdump__
|
return __taxonomy__
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLIHasSelectedTaxonomy() bool {
|
func CLIHasSelectedTaxonomy() bool {
|
||||||
return __taxdump__ != ""
|
return __taxonomy__ != ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLIAreAlternativeNamesSelected() bool {
|
func CLIAreAlternativeNamesSelected() bool {
|
||||||
@ -479,8 +479,8 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
|
|||||||
return obitax.DefaultTaxonomy(), nil
|
return obitax.DefaultTaxonomy(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if CLISelectedNCBITaxDump() != "" {
|
if CLISelectedTaxonomy() != "" {
|
||||||
taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(),
|
taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(),
|
||||||
!CLIAreAlternativeNamesSelected())
|
!CLIAreAlternativeNamesSelected())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -489,5 +489,5 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
|
|||||||
return taxonomy, nil
|
return taxonomy, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, errors.New("no NCBI taxdump selected using option -t|--taxdump")
|
return nil, errors.New("no taxonomy selected using option -t|--taxonomy")
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// committed
|
// committed
|
||||||
|
|
||||||
var _Commit = "757448c"
|
var _Commit = "ffd6725"
|
||||||
var _Version = "Release 4.2.0"
|
var _Version = "Release 4.2.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
142
pkg/obitaxformat/ncbitaxdump/readtar.go
Normal file
142
pkg/obitaxformat/ncbitaxdump/readtar.go
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
package ncbitaxdump
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func IsNCBITarTaxDump(path string) bool {
|
||||||
|
|
||||||
|
file, err := obiutils.Ropen(path)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
citations := false
|
||||||
|
division := false
|
||||||
|
gencode := false
|
||||||
|
names := false
|
||||||
|
delnodes := false
|
||||||
|
gc := false
|
||||||
|
merged := false
|
||||||
|
nodes := false
|
||||||
|
|
||||||
|
tarfile := tar.NewReader(file)
|
||||||
|
|
||||||
|
header, err := tarfile.Next()
|
||||||
|
|
||||||
|
for err == nil {
|
||||||
|
name := header.Name
|
||||||
|
|
||||||
|
if header.Typeflag == tar.TypeReg {
|
||||||
|
switch name {
|
||||||
|
case "citations.dmp":
|
||||||
|
citations = true
|
||||||
|
case "division.dmp":
|
||||||
|
division = true
|
||||||
|
case "gencode.dmp":
|
||||||
|
gencode = true
|
||||||
|
case "names.dmp":
|
||||||
|
names = true
|
||||||
|
case "delnodes.dmp":
|
||||||
|
delnodes = true
|
||||||
|
case "gc.prt":
|
||||||
|
gc = true
|
||||||
|
case "merged.dmp":
|
||||||
|
merged = true
|
||||||
|
case "nodes.dmp":
|
||||||
|
nodes = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
header, err = tarfile.Next()
|
||||||
|
}
|
||||||
|
|
||||||
|
return citations && division && gencode && names && delnodes && gc && merged && nodes
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||||
|
|
||||||
|
taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)
|
||||||
|
|
||||||
|
//
|
||||||
|
// Load the Taxonomy nodes
|
||||||
|
//
|
||||||
|
|
||||||
|
log.Printf("Loading Taxonomy nodes\n")
|
||||||
|
|
||||||
|
file, err := obiutils.Ropen(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||||
|
path)
|
||||||
|
}
|
||||||
|
|
||||||
|
nodefile, err := obiutils.TarFileReader(file, "nodes.dmp")
|
||||||
|
if err != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, fmt.Errorf("cannot open nodes file from '%s'",
|
||||||
|
path)
|
||||||
|
}
|
||||||
|
|
||||||
|
buffered := bufio.NewReader(nodefile)
|
||||||
|
loadNodeTable(buffered, taxonomy)
|
||||||
|
log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
|
||||||
|
file.Close()
|
||||||
|
|
||||||
|
//
|
||||||
|
// Load the Taxonomy nodes
|
||||||
|
//
|
||||||
|
|
||||||
|
log.Printf("Loading Taxon names\n")
|
||||||
|
|
||||||
|
file, err = obiutils.Ropen(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||||
|
path)
|
||||||
|
}
|
||||||
|
|
||||||
|
namefile, nerr := obiutils.TarFileReader(file, "names.dmp")
|
||||||
|
if nerr != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, fmt.Errorf("cannot open names file from '%s'",
|
||||||
|
path)
|
||||||
|
}
|
||||||
|
n := loadNameTable(namefile, taxonomy, onlysn)
|
||||||
|
log.Printf("%d taxon names read\n", n)
|
||||||
|
file.Close()
|
||||||
|
|
||||||
|
//
|
||||||
|
// Load the merged taxa
|
||||||
|
//
|
||||||
|
|
||||||
|
log.Printf("Loading Merged taxa\n")
|
||||||
|
file, err = obiutils.Ropen(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||||
|
path)
|
||||||
|
}
|
||||||
|
|
||||||
|
aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp")
|
||||||
|
if aerr != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, fmt.Errorf("cannot open merged file from '%s'",
|
||||||
|
path)
|
||||||
|
}
|
||||||
|
|
||||||
|
buffered = bufio.NewReader(aliasfile)
|
||||||
|
n = loadMergedTable(buffered, taxonomy)
|
||||||
|
log.Printf("%d merged taxa read\n", n)
|
||||||
|
|
||||||
|
root := taxonomy.Taxon("1")
|
||||||
|
taxonomy.SetRoot(root)
|
||||||
|
|
||||||
|
return taxonomy, nil
|
||||||
|
}
|
91
pkg/obitaxformat/taxonomy_read.go
Normal file
91
pkg/obitaxformat/taxonomy_read.go
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
package obitaxformat
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat/ncbitaxdump"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
"github.com/gabriel-vasile/mimetype"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TaxonomyLoader is the signature shared by the functions able to load a
// taxonomy from the location given by path.
// NOTE(review): onlysn presumably restricts name loading to scientific
// names — confirm against the loaders.
type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error)
|
||||||
|
|
||||||
|
func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case ncbitaxdump.IsNCBITarTaxDump(path):
|
||||||
|
log.Infof("NCBI Taxdump Tar Archive detected: %s", path)
|
||||||
|
return ncbitaxdump.LoadNCBITarTaxDump, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, fmt.Errorf("unknown taxonomy format: %s", path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {
|
||||||
|
|
||||||
|
file, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
fileInfo, err := file.Stat()
|
||||||
|
if err != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
file.Close()
|
||||||
|
|
||||||
|
if fileInfo.IsDir() {
|
||||||
|
// For the moment, we only support NCBI Taxdump directory format
|
||||||
|
log.Infof("NCBI Taxdump detected: %s", path)
|
||||||
|
return ncbitaxdump.LoadNCBITaxDump, nil
|
||||||
|
} else {
|
||||||
|
file, err := obiutils.Ropen(path)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
mimetype, err := mimetype.DetectReader(file)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
file.Close()
|
||||||
|
|
||||||
|
switch mimetype.String() {
|
||||||
|
case "text/csv":
|
||||||
|
return LoadCSVTaxonomy, nil
|
||||||
|
case "application/x-tar":
|
||||||
|
return DetectTaxonomyTarFormat(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Fatalf("Detected file format: %s", mimetype.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||||
|
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||||
|
loader, err := DetectTaxonomyFormat(path)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
taxonomy, err := loader(path, onlysn)
|
||||||
|
|
||||||
|
return taxonomy, err
|
||||||
|
}
|
@ -53,7 +53,7 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obicsv.ICSVRecord {
|
|||||||
OptionsWithScientificName(CLIWithScientificName()),
|
OptionsWithScientificName(CLIWithScientificName()),
|
||||||
OptionsWithPath(CLIWithPath()),
|
OptionsWithPath(CLIWithPath()),
|
||||||
OptionsRawTaxid(CLIRawTaxid()),
|
OptionsRawTaxid(CLIRawTaxid()),
|
||||||
OptionsSource(obioptions.CLISelectedNCBITaxDump()),
|
OptionsSource(obioptions.CLISelectedTaxonomy()),
|
||||||
)
|
)
|
||||||
|
|
||||||
return NewCSVTaxaIterator(iterator, options...)
|
return NewCSVTaxaIterator(iterator, options...)
|
||||||
|
20
pkg/obiutils/tar.go
Normal file
20
pkg/obiutils/tar.go
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
package obiutils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TarFileReader(file *Reader, path string) (*tar.Reader, error) {
|
||||||
|
tarfile := tar.NewReader(file)
|
||||||
|
header, err := tarfile.Next()
|
||||||
|
|
||||||
|
for err == nil {
|
||||||
|
if header.Name == path {
|
||||||
|
return tarfile, nil
|
||||||
|
}
|
||||||
|
header, err = tarfile.Next()
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, fmt.Errorf("file not found: %s", path)
|
||||||
|
}
|
@ -9,7 +9,7 @@
|
|||||||
// Ropen opens a (possibly gzipped) file/process/http site for buffered reading.
|
// Ropen opens a (possibly gzipped) file/process/http site for buffered reading.
|
||||||
// Wopen opens a (possibly gzipped) file for buffered writing.
|
// Wopen opens a (possibly gzipped) file for buffered writing.
|
||||||
// Both will use gzip when appropriate and will use buffered IO.
|
// Both will use gzip when appropriate and will use buffered IO.
|
||||||
package obiformats
|
package obiutils
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
@ -1,4 +1,4 @@
|
|||||||
package obiformats
|
package obiutils
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
Reference in New Issue
Block a user