// Mirror of https://github.com/metabarcoding/obitools4.git
// (synced 2025-06-29 16:20:46 +00:00; 224 lines, 5.4 KiB, Go).
package obiconvert
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/goombaio/orderedset"
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
)
|
|
|
|
func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
|
var err error
|
|
list_of_files := orderedset.NewOrderedSet()
|
|
for _, fn := range filenames {
|
|
|
|
err = filepath.Walk(fn,
|
|
func(path string, info os.FileInfo, err error) error {
|
|
var e error
|
|
if info == nil {
|
|
return fmt.Errorf("cannot open path")
|
|
}
|
|
for info.Mode()&os.ModeSymlink == os.ModeSymlink {
|
|
path, e = filepath.EvalSymlinks(path)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
|
|
info, e = os.Stat(path)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
}
|
|
|
|
if info.IsDir() {
|
|
if path != fn {
|
|
subdir, e := ExpandListOfFiles(true, path)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
for _, f := range subdir {
|
|
list_of_files.Add(f)
|
|
}
|
|
} else {
|
|
check_ext = true
|
|
}
|
|
} else {
|
|
if !check_ext ||
|
|
strings.HasSuffix(path, "fasta") ||
|
|
strings.HasSuffix(path, "fasta.gz") ||
|
|
strings.HasSuffix(path, "fastq") ||
|
|
strings.HasSuffix(path, "fastq.gz") ||
|
|
strings.HasSuffix(path, "fq") ||
|
|
strings.HasSuffix(path, "fq.gz") ||
|
|
strings.HasSuffix(path, "seq") ||
|
|
strings.HasSuffix(path, "seq.gz") ||
|
|
strings.HasSuffix(path, "gb") ||
|
|
strings.HasSuffix(path, "gb.gz") ||
|
|
strings.HasSuffix(path, "dat") ||
|
|
strings.HasSuffix(path, "dat.gz") ||
|
|
strings.HasSuffix(path, "ecopcr") ||
|
|
strings.HasSuffix(path, "ecopcr.gz") {
|
|
log.Debugf("Appending %s file\n", path)
|
|
list_of_files.Add(path)
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
res := make([]string, 0, list_of_files.Size())
|
|
for _, v := range list_of_files.Values() {
|
|
res = append(res, v.(string))
|
|
}
|
|
|
|
log.Infof("Found %d files to process", len(res))
|
|
return res, nil
|
|
}
|
|
|
|
func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|
var iterator obiiter.IBioSequence
|
|
var reader func(string, ...obiformats.WithOption) (obiiter.IBioSequence, error)
|
|
|
|
opts := make([]obiformats.WithOption, 0, 10)
|
|
|
|
switch CLIInputFastHeaderFormat() {
|
|
case "json":
|
|
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseFastSeqJsonHeader))
|
|
case "obi":
|
|
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseFastSeqOBIHeader))
|
|
default:
|
|
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
|
|
}
|
|
|
|
opts = append(opts, obiformats.OptionsReadQualities(obidefault.ReadQualities()))
|
|
|
|
nworkers := obidefault.ReadParallelWorkers()
|
|
if nworkers < 2 {
|
|
nworkers = 2
|
|
}
|
|
|
|
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
|
opts = append(opts, obiformats.OptionsBatchSize(obidefault.BatchSize()))
|
|
|
|
opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch()))
|
|
|
|
if len(filenames) == 0 {
|
|
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
|
|
opts = append(opts, obiformats.OptionsSource("stdin"))
|
|
|
|
var err error
|
|
|
|
switch CLIInputFormat() {
|
|
case "ecopcr":
|
|
iterator, err = obiformats.ReadEcoPCR(os.Stdin, opts...)
|
|
case "embl":
|
|
iterator, err = obiformats.ReadEMBL(os.Stdin, opts...)
|
|
case "genbank":
|
|
iterator, err = obiformats.ReadGenbank(os.Stdin, opts...)
|
|
case "fasta":
|
|
iterator, err = obiformats.ReadFasta(os.Stdin, opts...)
|
|
case "fastq":
|
|
iterator, err = obiformats.ReadFastq(os.Stdin, opts...)
|
|
case "csv":
|
|
iterator, err = obiformats.ReadCSV(os.Stdin, opts...)
|
|
default:
|
|
iterator = obiformats.ReadFastSeqFromStdin(opts...)
|
|
}
|
|
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
} else {
|
|
|
|
list_of_files, err := ExpandListOfFiles(false, filenames...)
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
switch CLIInputFormat() {
|
|
case "fastq", "fq":
|
|
reader = obiformats.ReadFastqFromFile
|
|
case "fasta":
|
|
reader = obiformats.ReadFastaFromFile
|
|
case "csv":
|
|
reader = obiformats.ReadCSVFromFile
|
|
case "ecopcr":
|
|
reader = obiformats.ReadEcoPCRFromFile
|
|
case "embl":
|
|
reader = obiformats.ReadEMBLFromFile
|
|
case "genbank":
|
|
reader = obiformats.ReadGenbankFromFile
|
|
default:
|
|
reader = obiformats.ReadSequencesFromFile
|
|
}
|
|
|
|
if len(list_of_files) > 1 {
|
|
nreader := 1
|
|
|
|
if CLINoInputOrder() {
|
|
nreader = obidefault.ParallelFilesRead()
|
|
}
|
|
|
|
iterator = obiformats.ReadSequencesBatchFromFiles(
|
|
list_of_files,
|
|
reader,
|
|
nreader,
|
|
opts...,
|
|
)
|
|
} else {
|
|
if len(list_of_files) > 0 {
|
|
iterator, err = reader(list_of_files[0], opts...)
|
|
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
if CLIPairedFileName() != "" {
|
|
ip, err := reader(CLIPairedFileName(), opts...)
|
|
|
|
if err != nil {
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
iterator = iterator.PairTo(ip)
|
|
}
|
|
} else {
|
|
iterator = obiiter.NilIBioSequence
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if CLIProgressBar() {
|
|
iterator = iterator.Speed("Reading sequences")
|
|
}
|
|
|
|
return iterator, nil
|
|
}
|
|
|
|
func OpenSequenceDataErrorMessage(args []string, err error) {
|
|
if err != nil {
|
|
switch len(args) {
|
|
case 0:
|
|
log.Errorf("Cannot open stdin (%v)", err)
|
|
case 1:
|
|
log.Errorf("Cannot open file %s: %v", args[0], err)
|
|
default:
|
|
log.Errorf("Cannot open one of the data files: %v", err)
|
|
}
|
|
os.Exit(1)
|
|
}
|
|
}
|