diff --git a/Release-notes.md b/Release-notes.md index be97334..8b9ed6a 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -2,6 +2,14 @@ ## Latest changes +### Breaking changes + +- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list + of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`. + +- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy + has been renamed to **--taxonomy**. + ### Bug fixes - In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when @@ -13,6 +21,10 @@ ### New features +- The NCBI Taxonomy dump no longer needs to be uncompressed and unarchived. The + path of the tar and gzipped dump file can be specified directly using the + **--taxonomy** option. + - Most of the time obitools identify automatically sequence file format. But it fails sometimes. Two new option **--fasta** and **--fastq** are added to allow the processing of the rare fasta and fastq files not recognized. 
diff --git a/cmd/test/main.go b/cmd/test/main.go index 71eb38b..84ab35d 100644 --- a/cmd/test/main.go +++ b/cmd/test/main.go @@ -1,36 +1,12 @@ package main import ( - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "os" - log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat" ) func main() { - one := obifp.OneUint[obifp.Uint128]() - a, b := obifp.OneUint[obifp.Uint64]().LeftShift64(66, 0) - log.Infof("one: %v, %v", a, b) - shift := one.LeftShift(66) - log.Infof("one: %v", shift) - - seq := obiseq.NewBioSequence("test", []byte("atcgggttccaacc"), "") - - kmermap := obikmer.NewKmerMap[obifp.Uint128]( - obiseq.BioSequenceSlice{ - seq, - }, - 7, - true, - 10, - ) - - kmers := kmermap.NormalizedKmerSlice(seq, nil) - - for _, kmer := range kmers { - println(kmermap.KmerAsString(kmer)) - } - + obitaxformat.DetectTaxonomyFormat(os.Args[1]) } diff --git a/pkg/obiformats/csv_read.go b/pkg/obiformats/csv_read.go index 8ec842d..239989c 100644 --- a/pkg/obiformats/csv_read.go +++ b/pkg/obiformats/csv_read.go @@ -157,9 +157,9 @@ func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, err func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - file, err := Ropen(filename) + file, err := obiutils.Ropen(filename) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("file %s is empty", filename) return ReadEmptyFile(options...) 
} @@ -173,9 +173,9 @@ func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequen func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) - input, err := Buf(os.Stdin) + input, err := obiutils.Buf(os.Stdin) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("stdin is empty") return ReadEmptyFile(options...) } diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index 992660c..4a5ca60 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -227,9 +227,9 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - reader, err = Ropen(filename) + reader, err = obiutils.Ropen(filename) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("file %s is empty", filename) return ReadEmptyFile(options...) } diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 86b5de7..a3ab5c8 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -271,9 +271,9 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - file, err := Ropen(filename) + file, err := obiutils.Ropen(filename) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("file %s is empty", filename) return ReadEmptyFile(options...) 
} @@ -287,9 +287,9 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { options = append(options, OptionsSource("stdin")) - input, err := Buf(os.Stdin) + input, err := obiutils.Buf(os.Stdin) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("stdin is empty") return ReadEmptyFile(options...) } diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index f350f5a..55ba783 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -370,9 +370,9 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - file, err := Ropen(filename) + file, err := obiutils.Ropen(filename) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("file %s is empty", filename) return ReadEmptyFile(options...) } @@ -386,9 +386,9 @@ func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequ func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) - input, err := Buf(os.Stdin) + input, err := obiutils.Buf(os.Stdin) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("stdin is empty") return ReadEmptyFile(options...) 
} diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index b0a0dfc..34fed24 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -266,9 +266,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - reader, err = Ropen(filename) + reader, err = obiutils.Ropen(filename) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("file %s is empty", filename) return ReadEmptyFile(options...) } diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index 6ee0750..5e7d6da 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -172,15 +172,15 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { // - error: An error if any occurred during the reading process. func ReadSequencesFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { - var file *Reader + var file *obiutils.Reader var reader io.Reader var err error options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - file, err = Ropen(filename) + file, err = obiutils.Ropen(filename) - if err == ErrNoContent { + if err == obiutils.ErrNoContent { log.Infof("file %s is empty", filename) return ReadEmptyFile(options...) 
} diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index ab72159..4951afe 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -6,8 +6,8 @@ import ( "os" "runtime" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat" log "github.com/sirupsen/logrus" "github.com/DavidGamba/go-getoptions" @@ -32,7 +32,7 @@ var _Quality_Shift_Input = byte(33) var _Quality_Shift_Output = byte(33) var _Read_Qualities = true -var __taxdump__ = "" +var __taxonomy__ = "" var __alternative_name__ = false type ArgumentParser func([]string) (*getoptions.GetOpt, []string) @@ -131,8 +131,8 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'") } - if options.Called("taxdump") { - taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), + if options.Called("taxonomy") { + taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(), !CLIAreAlternativeNamesSelected()) if err != nil { log.Fatalf("Loading taxonomy error: %v", err) @@ -186,14 +186,14 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) { if required { - options.StringVar(&__taxdump__, "taxdump", "", + options.StringVar(&__taxonomy__, "taxonomy", "", options.Alias("t"), options.Required(), - options.Description("Points to the directory containing the NCBI Taxonomy database dump.")) + options.Description("Path to the taxonomy database.")) } else { - options.StringVar(&__taxdump__, "taxdump", "", + options.StringVar(&__taxonomy__, "taxonomy", "", options.Alias("t"), - options.Description("Points to the directory containing the NCBI Taxonomy database dump.")) + options.Description("Path to 
the taxonomy database.")) } if alternatiive { options.BoolVar(&__alternative_name__, "alternative-names", false, @@ -462,12 +462,12 @@ func SetParallelFilesRead(n int) { _ParallelFilesRead = n } -func CLISelectedNCBITaxDump() string { - return __taxdump__ +func CLISelectedTaxonomy() string { + return __taxonomy__ } func CLIHasSelectedTaxonomy() bool { - return __taxdump__ != "" + return __taxonomy__ != "" } func CLIAreAlternativeNamesSelected() bool { @@ -479,8 +479,8 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) { return obitax.DefaultTaxonomy(), nil } - if CLISelectedNCBITaxDump() != "" { - taxonomy, err := ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), + if CLISelectedTaxonomy() != "" { + taxonomy, err := obitaxformat.LoadTaxonomy(CLISelectedTaxonomy(), !CLIAreAlternativeNamesSelected()) if err != nil { return nil, err @@ -489,5 +489,5 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) { return taxonomy, nil } - return nil, errors.New("no NCBI taxdump selected using option -t|--taxdump") + return nil, errors.New("no taxonomy selected using option -t|--taxonomy") } diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 45175d7..72251f0 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "757448c" +var _Commit = "ffd6725" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. 
diff --git a/pkg/obiformats/ncbitaxdump/read.go b/pkg/obitaxformat/ncbitaxdump/read.go similarity index 100% rename from pkg/obiformats/ncbitaxdump/read.go rename to pkg/obitaxformat/ncbitaxdump/read.go diff --git a/pkg/obitaxformat/ncbitaxdump/readtar.go b/pkg/obitaxformat/ncbitaxdump/readtar.go new file mode 100644 index 0000000..92d112b --- /dev/null +++ b/pkg/obitaxformat/ncbitaxdump/readtar.go @@ -0,0 +1,142 @@ +package ncbitaxdump + +import ( + "archive/tar" + "bufio" + "fmt" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + + log "github.com/sirupsen/logrus" +) + +func IsNCBITarTaxDump(path string) bool { + + file, err := obiutils.Ropen(path) + + if err != nil { + return false + } + + defer file.Close() + + citations := false + division := false + gencode := false + names := false + delnodes := false + gc := false + merged := false + nodes := false + + tarfile := tar.NewReader(file) + + header, err := tarfile.Next() + + for err == nil { + name := header.Name + + if header.Typeflag == tar.TypeReg { + switch name { + case "citations.dmp": + citations = true + case "division.dmp": + division = true + case "gencode.dmp": + gencode = true + case "names.dmp": + names = true + case "delnodes.dmp": + delnodes = true + case "gc.prt": + gc = true + case "merged.dmp": + merged = true + case "nodes.dmp": + nodes = true + } + } + header, err = tarfile.Next() + } + + return citations && division && gencode && names && delnodes && gc && merged && nodes +} + +func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) { + + taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet) + + // + // Load the Taxonomy nodes + // + + log.Printf("Loading Taxonomy nodes\n") + + file, err := obiutils.Ropen(path) + if err != nil { + return nil, fmt.Errorf("cannot open taxonomy file from '%s'", + path) + } + + nodefile, err := obiutils.TarFileReader(file, 
"nodes.dmp") + if err != nil { + file.Close() + return nil, fmt.Errorf("cannot open nodes file from '%s'", + path) + } + + buffered := bufio.NewReader(nodefile) + loadNodeTable(buffered, taxonomy) + log.Printf("%d Taxonomy nodes read\n", taxonomy.Len()) + file.Close() + + // + // Load the Taxonomy nodes + // + + log.Printf("Loading Taxon names\n") + + file, err = obiutils.Ropen(path) + if err != nil { + return nil, fmt.Errorf("cannot open taxonomy file from '%s'", + path) + } + + namefile, nerr := obiutils.TarFileReader(file, "names.dmp") + if nerr != nil { + file.Close() + return nil, fmt.Errorf("cannot open names file from '%s'", + path) + } + n := loadNameTable(namefile, taxonomy, onlysn) + log.Printf("%d taxon names read\n", n) + file.Close() + + // + // Load the merged taxa + // + + log.Printf("Loading Merged taxa\n") + file, err = obiutils.Ropen(path) + if err != nil { + return nil, fmt.Errorf("cannot open taxonomy file from '%s'", + path) + } + + aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp") + if aerr != nil { + file.Close() + return nil, fmt.Errorf("cannot open merged file from '%s'", + path) + } + + buffered = bufio.NewReader(aliasfile) + n = loadMergedTable(buffered, taxonomy) + log.Printf("%d merged taxa read\n", n) + + root := taxonomy.Taxon("1") + taxonomy.SetRoot(root) + + return taxonomy, nil +} diff --git a/pkg/obitaxformat/taxonomy_read.go b/pkg/obitaxformat/taxonomy_read.go new file mode 100644 index 0000000..58c26ef --- /dev/null +++ b/pkg/obitaxformat/taxonomy_read.go @@ -0,0 +1,91 @@ +package obitaxformat + +import ( + "fmt" + "os" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitaxformat/ncbitaxdump" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/gabriel-vasile/mimetype" + + log "github.com/sirupsen/logrus" +) + +type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error) + +func 
DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) { + + switch { + case ncbitaxdump.IsNCBITarTaxDump(path): + log.Infof("NCBI Taxdump Tar Archive detected: %s", path) + return ncbitaxdump.LoadNCBITarTaxDump, nil + } + + return nil, fmt.Errorf("unknown taxonomy format: %s", path) +} + +func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) { + + file, err := os.Open(path) + if err != nil { + return nil, err + } + + fileInfo, err := file.Stat() + if err != nil { + file.Close() + return nil, err + } + + file.Close() + + if fileInfo.IsDir() { + // For the moment, we only support NCBI Taxdump directory format + log.Infof("NCBI Taxdump detected: %s", path) + return ncbitaxdump.LoadNCBITaxDump, nil + } else { + file, err := obiutils.Ropen(path) + + if err != nil { + return nil, err + } + + mimetype, err := mimetype.DetectReader(file) + + if err != nil { + file.Close() + return nil, err + } + + file.Close() + + switch mimetype.String() { + case "text/csv": + return LoadCSVTaxonomy, nil + case "application/x-tar": + return DetectTaxonomyTarFormat(path) + } + + log.Fatalf("Detected file format: %s", mimetype.String()) + } + + return nil, nil +} + +func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) { + + return nil, nil +} + +func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) { + loader, err := DetectTaxonomyFormat(path) + + if err != nil { + return nil, err + } + + taxonomy, err := loader(path, onlysn) + + return taxonomy, err +} diff --git a/pkg/obitools/obifind/obifind.go b/pkg/obitools/obifind/obifind.go index 6534a0c..f1df393 100644 --- a/pkg/obitools/obifind/obifind.go +++ b/pkg/obitools/obifind/obifind.go @@ -53,7 +53,7 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obicsv.ICSVRecord { OptionsWithScientificName(CLIWithScientificName()), OptionsWithPath(CLIWithPath()), OptionsRawTaxid(CLIRawTaxid()), - OptionsSource(obioptions.CLISelectedNCBITaxDump()), + OptionsSource(obioptions.CLISelectedTaxonomy()), ) 
return NewCSVTaxaIterator(iterator, options...) diff --git a/pkg/obiutils/tar.go b/pkg/obiutils/tar.go new file mode 100644 index 0000000..3731439 --- /dev/null +++ b/pkg/obiutils/tar.go @@ -0,0 +1,20 @@ +package obiutils + +import ( + "archive/tar" + "fmt" +) + +func TarFileReader(file *Reader, path string) (*tar.Reader, error) { + tarfile := tar.NewReader(file) + header, err := tarfile.Next() + + for err == nil { + if header.Name == path { + return tarfile, nil + } + header, err = tarfile.Next() + } + + return nil, fmt.Errorf("file not found: %s", path) +} diff --git a/pkg/obiformats/xopen.go b/pkg/obiutils/xopen.go similarity index 99% rename from pkg/obiformats/xopen.go rename to pkg/obiutils/xopen.go index 8a0c093..7231b2c 100644 --- a/pkg/obiformats/xopen.go +++ b/pkg/obiutils/xopen.go @@ -9,7 +9,7 @@ // Ropen opens a (possibly gzipped) file/process/http site for buffered reading. // Wopen opens a (possibly gzipped) file for buffered writing. // Both will use gzip when appropriate and will user buffered IO. -package obiformats +package obiutils import ( "bufio" diff --git a/pkg/obiformats/xopen_test.go b/pkg/obiutils/xopen_test.go similarity index 99% rename from pkg/obiformats/xopen_test.go rename to pkg/obiutils/xopen_test.go index 45b2b92..b99cae7 100644 --- a/pkg/obiformats/xopen_test.go +++ b/pkg/obiutils/xopen_test.go @@ -1,4 +1,4 @@ -package obiformats +package obiutils import ( "bufio"