mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add support for .gbff and .gbff.gz file extensions in sequence reader. Update the logic to return an error instead of using NilIBioSequence when no sequence files are found, improving the error handling and user feedback.
233 lines
5.8 KiB
Go
233 lines
5.8 KiB
Go
package obiconvert
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/goombaio/orderedset"
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
)
|
|
|
|
func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
|
var err error
|
|
list_of_files := orderedset.NewOrderedSet()
|
|
for _, fn := range filenames {
|
|
|
|
if strings.HasPrefix(fn, "http://") ||
|
|
strings.HasPrefix(fn, "https://") ||
|
|
strings.HasPrefix(fn, "ftp://") {
|
|
list_of_files.Add(fn)
|
|
continue
|
|
}
|
|
|
|
err = filepath.Walk(fn,
|
|
func(path string, info os.FileInfo, err error) error {
|
|
var e error
|
|
if info == nil {
|
|
return fmt.Errorf("cannot open path")
|
|
}
|
|
for info.Mode()&os.ModeSymlink == os.ModeSymlink {
|
|
path, e = filepath.EvalSymlinks(path)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
|
|
info, e = os.Stat(path)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
}
|
|
|
|
if info.IsDir() {
|
|
if path != fn {
|
|
subdir, e := ExpandListOfFiles(true, path)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
for _, f := range subdir {
|
|
list_of_files.Add(f)
|
|
}
|
|
} else {
|
|
check_ext = true
|
|
}
|
|
} else {
|
|
if !check_ext ||
|
|
strings.HasSuffix(path, "fasta") ||
|
|
strings.HasSuffix(path, "fasta.gz") ||
|
|
strings.HasSuffix(path, "fastq") ||
|
|
strings.HasSuffix(path, "fastq.gz") ||
|
|
strings.HasSuffix(path, "fq") ||
|
|
strings.HasSuffix(path, "fq.gz") ||
|
|
strings.HasSuffix(path, "seq") ||
|
|
strings.HasSuffix(path, "seq.gz") ||
|
|
strings.HasSuffix(path, "gb") ||
|
|
strings.HasSuffix(path, "gb.gz") ||
|
|
strings.HasSuffix(path, "gbff") ||
|
|
strings.HasSuffix(path, "gbff.gz") ||
|
|
strings.HasSuffix(path, "dat") ||
|
|
strings.HasSuffix(path, "dat.gz") ||
|
|
strings.HasSuffix(path, "ecopcr") ||
|
|
strings.HasSuffix(path, "ecopcr.gz") {
|
|
log.Debugf("Appending %s file\n", path)
|
|
list_of_files.Add(path)
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
res := make([]string, 0, list_of_files.Size())
|
|
for _, v := range list_of_files.Values() {
|
|
res = append(res, v.(string))
|
|
}
|
|
|
|
log.Infof("Found %d files to process", len(res))
|
|
return res, nil
|
|
}
|
|
|
|
func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|
var iterator obiiter.IBioSequence
|
|
var reader func(string, ...obiformats.WithOption) (obiiter.IBioSequence, error)
|
|
|
|
opts := make([]obiformats.WithOption, 0, 10)
|
|
|
|
switch CLIInputFastHeaderFormat() {
|
|
case "json":
|
|
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseFastSeqJsonHeader))
|
|
case "obi":
|
|
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseFastSeqOBIHeader))
|
|
default:
|
|
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
|
|
}
|
|
|
|
opts = append(opts, obiformats.OptionsReadQualities(obidefault.ReadQualities()))
|
|
|
|
nworkers := obidefault.ReadParallelWorkers()
|
|
if nworkers < 2 {
|
|
nworkers = 2
|
|
}
|
|
|
|
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
|
opts = append(opts, obiformats.OptionsBatchSize(obidefault.BatchSize()))
|
|
|
|
opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch()))
|
|
opts = append(opts, obiformats.OptionsUtoT(CLIUtoT()))
|
|
|
|
if len(filenames) == 0 {
|
|
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
|
|
opts = append(opts, obiformats.OptionsSource("stdin"))
|
|
|
|
var err error
|
|
|
|
switch CLIInputFormat() {
|
|
case "ecopcr":
|
|
iterator, err = obiformats.ReadEcoPCR(os.Stdin, opts...)
|
|
case "embl":
|
|
iterator, err = obiformats.ReadEMBL(os.Stdin, opts...)
|
|
case "genbank":
|
|
iterator, err = obiformats.ReadGenbank(os.Stdin, opts...)
|
|
case "fasta":
|
|
iterator, err = obiformats.ReadFasta(os.Stdin, opts...)
|
|
case "fastq":
|
|
iterator, err = obiformats.ReadFastq(os.Stdin, opts...)
|
|
case "csv":
|
|
iterator, err = obiformats.ReadCSV(os.Stdin, opts...)
|
|
default:
|
|
iterator, err = obiformats.ReadSequencesFromStdin(opts...)
|
|
}
|
|
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
} else {
|
|
|
|
list_of_files, err := ExpandListOfFiles(false, filenames...)
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
switch CLIInputFormat() {
|
|
case "fastq", "fq":
|
|
reader = obiformats.ReadFastqFromFile
|
|
case "fasta":
|
|
reader = obiformats.ReadFastaFromFile
|
|
case "csv":
|
|
reader = obiformats.ReadCSVFromFile
|
|
case "ecopcr":
|
|
reader = obiformats.ReadEcoPCRFromFile
|
|
case "embl":
|
|
reader = obiformats.ReadEMBLFromFile
|
|
case "genbank":
|
|
reader = obiformats.ReadGenbankFromFile
|
|
default:
|
|
reader = obiformats.ReadSequencesFromFile
|
|
}
|
|
|
|
if len(list_of_files) > 1 {
|
|
nreader := 1
|
|
|
|
if CLINoInputOrder() {
|
|
nreader = obidefault.ParallelFilesRead()
|
|
}
|
|
|
|
iterator = obiformats.ReadSequencesBatchFromFiles(
|
|
list_of_files,
|
|
reader,
|
|
nreader,
|
|
opts...,
|
|
)
|
|
|
|
} else {
|
|
if len(list_of_files) > 0 {
|
|
|
|
iterator, err = reader(list_of_files[0], opts...)
|
|
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
if CLIPairedFileName() != "" {
|
|
ip, err := reader(CLIPairedFileName(), opts...)
|
|
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
iterator = iterator.PairTo(ip)
|
|
}
|
|
} else {
|
|
return obiiter.NilIBioSequence, fmt.Errorf("no sequence files found in the provided paths")
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
iterator = iterator.Speed("Reading sequences")
|
|
|
|
return iterator, nil
|
|
}
|
|
|
|
func OpenSequenceDataErrorMessage(args []string, err error) {
|
|
if err != nil {
|
|
switch len(args) {
|
|
case 0:
|
|
log.Errorf("Cannot open stdin (%v)", err)
|
|
case 1:
|
|
log.Errorf("Cannot open file %s: %v", args[0], err)
|
|
default:
|
|
log.Errorf("Cannot open one of the data files: %v", err)
|
|
}
|
|
os.Exit(1)
|
|
}
|
|
}
|