Adds CSV as an input format

Former-commit-id: a365bb6947064adc2709d66df05fa54c6fe47fad
This commit is contained in:
Eric Coissac
2024-07-03 21:04:27 +02:00
parent 5d9ac261ff
commit bd855c4965
6 changed files with 249 additions and 56 deletions

180
pkg/obiformats/csv_read.go Normal file
View File

@ -0,0 +1,180 @@
package obiformats
import (
"encoding/csv"
"io"
"os"
"path"
"unsafe"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/goccy/go-json"
log "github.com/sirupsen/logrus"
)
// _ParseCsvFile reads CSV records from reader and pushes them on out as
// batches of sequences.
//
// The first record is taken as the header. Columns named "sequence", "id" and
// "qualities" fill the corresponding parts of each sequence; every other
// column is stored as an attribute named after its header. Attribute values
// are decoded as JSON when that succeeds (float64 values accepted by
// obiutils.IsIntegral are converted to int) and kept as the raw field string
// otherwise.
//
// Because FieldsPerRecord is -1, records may be shorter or longer than the
// header. Columns missing from a short record are left unset and fields
// beyond the last header column are ignored; a single warning is logged the
// first time such a record is met.
//
// Parameters:
//   - source: name of the input, used only in the ragged-record warning.
//   - reader: the CSV stream.
//   - out: iterator receiving the batches; out.Done() is called once the
//     input is exhausted.
//   - shift: value subtracted from every byte of the "qualities" column.
//   - batchSize: a batch is pushed as soon as it holds at least this many
//     sequences.
//
// Any read error other than io.EOF is reported through log.Fatal.
func _ParseCsvFile(source string,
	reader io.Reader,
	out obiiter.IBioSequence,
	shift byte,
	batchSize int) {

	file := csv.NewReader(reader)

	file.Comma = ','
	file.ReuseRecord = false // the header record is kept for the whole parse
	file.LazyQuotes = true
	file.Comment = '#'
	file.FieldsPerRecord = -1 // do not enforce a fixed number of fields
	file.TrimLeadingSpace = true

	header, err := file.Read()
	if err != nil {
		if err == io.EOF {
			// Empty input: nothing to push.
			out.Done()
			return
		}
		log.Fatal(err)
	}

	// Locate the columns that map to dedicated sequence fields.
	sequenceColIndex := -1
	idColIndex := -1
	qualitiesColIndex := -1

	for i, colName := range header {
		switch colName {
		case "sequence":
			sequenceColIndex = i
		case "id":
			idColIndex = i
		case "qualities":
			qualitiesColIndex = i
		}
	}

	// From here on the record slice may be recycled between reads.
	file.ReuseRecord = true

	o := 0 // order number of the next batch
	raggedWarned := false
	slice := obiseq.MakeBioSequenceSlice()

	for {
		rec, err := file.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}

		if len(rec) != len(header) && !raggedWarned {
			log.Warnf("%s: CSV record with %d fields while the header has %d; missing columns are left unset and extra fields are ignored",
				source, len(rec), len(header))
			raggedWarned = true
		}

		sequence := obiseq.NewEmptyBioSequence(0)

		// Every access to rec is bounds-checked: a short record must not
		// panic the parsing goroutine.
		if sequenceColIndex >= 0 && sequenceColIndex < len(rec) {
			sequence.SetSequence([]byte(rec[sequenceColIndex]))
		}

		if idColIndex >= 0 && idColIndex < len(rec) {
			sequence.SetId(rec[idColIndex])
		}

		if qualitiesColIndex >= 0 && qualitiesColIndex < len(rec) {
			q := []byte(rec[qualitiesColIndex])
			for i := range q {
				q[i] -= shift
			}
			sequence.SetQualities(q)
		}

		for i, field := range rec {
			if i >= len(header) {
				// No column name is available for the remaining fields.
				break
			}

			if i == sequenceColIndex || i == idColIndex || i == qualitiesColIndex {
				continue
			}

			var val interface{}

			// The byte slice aliases the string's memory to avoid a copy.
			// NOTE(review): this relies on json.Unmarshal never writing into
			// its input — confirm for the JSON library in use.
			err := json.Unmarshal(unsafe.Slice(unsafe.StringData(field), len(field)), &val)

			if err != nil {
				// Not valid JSON: keep the field as a plain string.
				val = field
			} else if f, ok := val.(float64); ok && obiutils.IsIntegral(f) {
				val = int(f)
			}

			sequence.SetAttribute(header[i], val)
		}

		slice = append(slice, sequence)
		if len(slice) >= batchSize {
			out.Push(obiiter.MakeBioSequenceBatch(o, slice))
			o++
			slice = obiseq.MakeBioSequenceSlice()
		}
	}

	// Flush the last, possibly partial, batch.
	if len(slice) > 0 {
		out.Push(obiiter.MakeBioSequenceBatch(o, slice))
	}

	out.Done()
}
// ReadCSV launches an asynchronous CSV parse of reader and returns the
// iterator on which the parsed sequence batches are pushed.
//
// The source name and batch size come from options; the quality shift comes
// from obioptions.InputQualityShift(). Parsing runs in its own goroutine and
// a second goroutine calls WaitAndClose on the iterator. The returned error
// is always nil.
func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
	settings := MakeOptions(options)

	iterator := obiiter.MakeIBioSequence()
	iterator.Add(1) // one producer: the parsing goroutine below

	sourceName := settings.Source()
	qualityShift := byte(obioptions.InputQualityShift())
	batchSize := settings.BatchSize()

	go _ParseCsvFile(sourceName, reader, iterator, qualityShift, batchSize)
	go iterator.WaitAndClose()

	return iterator, nil
}
// ReadCSVFromFile opens filename with Ropen and parses its content as CSV
// through ReadCSV.
//
// An OptionsSource built from obiutils.RemoveAllExt applied to the base name
// of filename is appended to options. When Ropen reports ErrNoContent the
// result of ReadEmptyFile is returned; any other open error is returned
// together with obiiter.NilIBioSequence.
func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
	sourceName := obiutils.RemoveAllExt(path.Base(filename))
	options = append(options, OptionsSource(sourceName))

	input, err := Ropen(filename)

	switch {
	case err == ErrNoContent:
		log.Infof("file %s is empty", filename)
		return ReadEmptyFile(options...)
	case err != nil:
		return obiiter.NilIBioSequence, err
	}

	return ReadCSV(input, options...)
}
// ReadCSVFromStdin parses the standard input as CSV through ReadCSV.
//
// An OptionsSource built from obiutils.RemoveAllExt("stdin") is appended to
// options. When Buf reports ErrNoContent the result of ReadEmptyFile is
// returned; any other error terminates the program through log.Fatalf.
//
// NOTE(review): the reader parameter is never used; the data always comes
// from os.Stdin.
func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
	sourceName := obiutils.RemoveAllExt("stdin")
	options = append(options, OptionsSource(sourceName))

	stdin, err := Buf(os.Stdin)

	if err != nil {
		if err == ErrNoContent {
			log.Infof("stdin is empty")
			return ReadEmptyFile(options...)
		}

		log.Fatalf("open file error: %v", err)
		// Only needed to satisfy the compiler: log.Fatalf exits the process.
		return obiiter.NilIBioSequence, err
	}

	return ReadCSV(stdin, options...)
}

View File

@ -169,9 +169,6 @@ func ReadSequencesFromFile(filename string,
switch mime.String() { switch mime.String() {
case "text/fastq": case "text/fastq":
return ReadFastq(reader, options...) return ReadFastq(reader, options...)
// file.Close()
// is, err := ReadFastSeqFromFile(filename, options...)
// return is, err
case "text/fasta": case "text/fasta":
return ReadFasta(reader, options...) return ReadFasta(reader, options...)
case "text/ecopcr2": case "text/ecopcr2":
@ -180,6 +177,8 @@ func ReadSequencesFromFile(filename string,
return ReadEMBL(reader, options...), nil return ReadEMBL(reader, options...), nil
case "text/genbank": case "text/genbank":
return ReadGenbank(reader, options...), nil return ReadGenbank(reader, options...), nil
case "text/csv":
return ReadCSV(reader, options...)
default: default:
log.Fatalf("File %s has guessed format %s which is not yet implemented", log.Fatalf("File %s has guessed format %s which is not yet implemented",
filename, mime.String()) filename, mime.String())

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This induces that the version // TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// committed // committed
var _Commit = "e60b61d" var _Commit = "2bc540d"
var _Version = "Release 4.2.0" var _Version = "Release 4.2.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@ -137,6 +137,11 @@ func (s *BioSequence) SetAttribute(key string, value interface{}) {
return return
} }
if key == "qualities" {
s.SetQualities(value.([]byte))
return
}
annot := s.Annotations() annot := s.Annotations()
defer s.AnnotationsUnlock() defer s.AnnotationsUnlock()

View File

@ -35,6 +35,8 @@ func diagCoord(x, y, n int) int {
} }
func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) { func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
n := len(sequences) n := len(sequences)
if n > 1 {
score := make([]float64, n*(n-1)/2) score := make([]float64, n*(n-1)/2)
matrix := make([]uint64, sequences[0].Len()*sequences[0].Len()) matrix := make([]uint64, sequences[0].Len()*sequences[0].Len())
@ -63,7 +65,7 @@ func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSl
} }
scoremed := obistats.Median(score) scoremed := obistats.Median(score)
scorethr := 1 - 2*(1-scoremed) scorethr := 1 - 3*(1-scoremed)
mednorm := (scoremed - scorethr) / 2.0 mednorm := (scoremed - scorethr) / 2.0
for i, s := range score { for i, s := range score {
@ -80,14 +82,14 @@ func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSl
// Tylos // Tylos
for i, sa := range sequences { for i, sa := range sequences {
ngroup := float64(sa.Count()) ngroup := float64(sa.Count())
ss := make([]float64, 0, n-1) ss := make(map[string]float64, n-1)
sc := sa.Count() sc := sa.Count()
for j, sb := range sequences { for j, sb := range sequences {
if i == j { if i == j {
continue continue
} }
ss = append(ss, score[diagCoord(i, j, n)]) ss[sb.Id()] = score[diagCoord(i, j, n)]
sc += sb.Count() sc += sb.Count()
nt, _ := sb.GetFloatAttribute("obicleandb_trusted_on") nt, _ := sb.GetFloatAttribute("obicleandb_trusted_on")
ngroup += score[diagCoord(i, j, n)] * nt ngroup += score[diagCoord(i, j, n)] * nt
@ -96,7 +98,10 @@ func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSl
sa.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(ngroup+1)) sa.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(ngroup+1))
sa.SetAttribute("obicleandb_trusted_on", ngroup) sa.SetAttribute("obicleandb_trusted_on", ngroup)
sa.SetAttribute("obicleandb_median", scoremed) sa.SetAttribute("obicleandb_median", scoremed)
sa.SetAttribute("obicleandb_ss", ss) sa.SetAttribute("obicleandb_scores", ss)
}
} else {
sequences[0].SetAttribute("obicleandb_median", 1.0)
} }
return sequences, nil return sequences, nil

View File

@ -418,6 +418,10 @@ func IsASlice(value interface{}) bool {
return reflect.TypeOf(value).Kind() == reflect.Slice return reflect.TypeOf(value).Kind() == reflect.Slice
} }
// IsIntegral reports whether val holds a whole number that can be converted
// to int without loss.
//
// NaN, infinities and values outside the int range yield false. The range is
// checked explicitly because converting an out-of-range float64 to int gives
// an implementation-dependent result in Go, which made the plain round-trip
// comparison platform-specific for very large values.
func IsIntegral(val float64) bool {
	const maxInt = int(^uint(0) >> 1)

	// The smallest int is a power of two in magnitude, hence exactly
	// representable as a float64; its negation is the first value too large
	// for int.
	lowest := float64(-maxInt - 1)

	// Written as a negated conjunction so that NaN, for which every
	// comparison is false, is rejected here.
	if !(val >= lowest && val < -lowest) {
		return false
	}

	return val == float64(int(val))
}
// HasLength checks if the given value has a length. // HasLength checks if the given value has a length.
// //
// value: The value to be checked. // value: The value to be checked.