diff --git a/pkg/obiformats/csv_read.go b/pkg/obiformats/csv_read.go new file mode 100644 index 0000000..4a7604c --- /dev/null +++ b/pkg/obiformats/csv_read.go @@ -0,0 +1,180 @@ +package obiformats + +import ( + "encoding/csv" + "io" + "os" + "path" + "unsafe" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/goccy/go-json" + log "github.com/sirupsen/logrus" +) + +func _ParseCsvFile(source string, + reader io.Reader, + out obiiter.IBioSequence, + shift byte, + batchSize int) { + + file := csv.NewReader(reader) + + file.Comma = ',' + file.ReuseRecord = false + file.LazyQuotes = true + file.Comment = '#' + file.FieldsPerRecord = -1 + file.TrimLeadingSpace = true + + header, err := file.Read() + + if err != nil { + if err == io.EOF { + out.Done() + return + } + log.Fatal(err) + } + + sequenceColIndex := -1 + idColIndex := -1 + qualitiesColIndex := -1 + o := 0 + + for i, colName := range header { + switch colName { + case "sequence": + sequenceColIndex = i + case "id": + idColIndex = i + case "qualities": + qualitiesColIndex = i + } + } + + file.ReuseRecord = true + slice := obiseq.MakeBioSequenceSlice() + + for { + rec, err := file.Read() + if err == io.EOF { + break + } + if err != nil { + log.Fatal(err) + } + + sequence := obiseq.NewEmptyBioSequence(0) + + if sequenceColIndex >= 0 { + sequence.SetSequence([]byte(rec[sequenceColIndex])) + } + + if idColIndex >= 0 { + sequence.SetId(rec[idColIndex]) + } + + if qualitiesColIndex >= 0 { + q := []byte(rec[qualitiesColIndex]) + + for i := 0; i < len(q); i++ { + q[i] -= shift + } + sequence.SetQualities(q) + } + + for i, field := range rec { + var val interface{} + + if i == sequenceColIndex || i == idColIndex || i == qualitiesColIndex { + continue + } + + err := json.Unmarshal(unsafe.Slice(unsafe.StringData(field), len(field)), &val) + + if err != nil { + val = field + } else { + if _, ok := val.(float64); ok { + if obiutils.IsIntegral(val.(float64)) { + val = int(val.(float64)) + } + } + } + + sequence.SetAttribute(header[i], val) + } + + slice = append(slice, sequence) + if len(slice) >= batchSize { + out.Push(obiiter.MakeBioSequenceBatch(o, slice)) + o++ + slice = obiseq.MakeBioSequenceSlice() + } + } + + if len(slice) > 0 { + out.Push(obiiter.MakeBioSequenceBatch(o, slice)) + } + + out.Done() + +} + +func ReadCSV(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { + + opt := MakeOptions(options) + out := obiiter.MakeIBioSequence() + + out.Add(1) + go _ParseCsvFile(opt.Source(), + reader, + out, + byte(obioptions.InputQualityShift()), + opt.BatchSize()) + + go func() { + out.WaitAndClose() + }() + + return out, nil + +} + +func ReadCSVFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { + + options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) + file, err := Ropen(filename) + + if err == ErrNoContent { + log.Infof("file %s is empty", filename) + return ReadEmptyFile(options...) + } + + if err != nil { + return obiiter.NilIBioSequence, err + } + + return ReadCSV(file, options...) +} + +func ReadCSVFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { + options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) + input, err := Buf(os.Stdin) + + if err == ErrNoContent { + log.Infof("stdin is empty") + return ReadEmptyFile(options...) + } + + if err != nil { + log.Fatalf("open file error: %v", err) + return obiiter.NilIBioSequence, err + } + + return ReadCSV(input, options...) +} diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index 7dc8140..97d820d 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -169,9 +169,6 @@ func ReadSequencesFromFile(filename string, switch mime.String() { case "text/fastq": return ReadFastq(reader, options...) - // file.Close() - // is, err := ReadFastSeqFromFile(filename, options...) - // return is, err case "text/fasta": return ReadFasta(reader, options...) case "text/ecopcr2": @@ -180,6 +177,8 @@ func ReadSequencesFromFile(filename string, return ReadEMBL(reader, options...), nil case "text/genbank": return ReadGenbank(reader, options...), nil + case "text/csv": + return ReadCSV(reader, options...) default: log.Fatalf("File %s has guessed format %s which is not yet implemented", filename, mime.String()) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index f333547..61e544b 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "e60b61d" +var _Commit = "2bc540d" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index ba36256..afdc142 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -137,6 +137,11 @@ func (s *BioSequence) SetAttribute(key string, value interface{}) { return } + if key == "qualities" { + s.SetQualities(value.([]byte)) + return + } + annot := s.Annotations() defer s.AnnotationsUnlock() diff --git a/pkg/obitools/obicleandb/obicleandb.go b/pkg/obitools/obicleandb/obicleandb.go index 7eafae7..b3e6956 100644 --- a/pkg/obitools/obicleandb/obicleandb.go +++ b/pkg/obitools/obicleandb/obicleandb.go @@ -35,68 +35,73 @@ func diagCoord(x, y, n int) int { } func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) { n := len(sequences) - score := make([]float64, n*(n-1)/2) - matrix := make([]uint64, sequences[0].Len()*sequences[0].Len()) - for i, sa := range sequences { - for j, sb := range sequences[i+1:] { - lca, lali := obialign.FastLCSScore(sa, sb, -1, &matrix) - score[diagCoord(i, i+1+j, n)] = float64(lca) / float64(lali) - } - } + if n > 1 { + score := make([]float64, n*(n-1)/2) + matrix := make([]uint64, sequences[0].Len()*sequences[0].Len()) - for i, sa := range sequences { - ss := make([]float64, 0, n-1) - for j, _ := range sequences { - if i == j { - continue + for i, sa := range sequences { + for j, sb := range sequences[i+1:] { + lca, lali := obialign.FastLCSScore(sa, sb, -1, &matrix) + score[diagCoord(i, i+1+j, n)] = float64(lca) / float64(lali) } + } + + for i, sa := range sequences { + ss := make([]float64, 0, n-1) + for j, _ := range sequences { + if i == j { + continue + } + + s := score[diagCoord(i, j, n)] + if s == 0.0 { + log.Panicf("score[%d, %d] == 0.0", i, j) + } + ss = append(ss, score[diagCoord(i, j, n)]) - s := score[diagCoord(i, j, n)] - if s == 0.0 { - log.Panicf("score[%d, %d] == 0.0", i, j) } - ss = append(ss, score[diagCoord(i, j, n)]) - + sa.SetAttribute("obicleandb_dist", ss) } - sa.SetAttribute("obicleandb_dist", ss) - } - scoremed := obistats.Median(score) - scorethr := 1 - 2*(1-scoremed) - mednorm := (scoremed - scorethr) / 2.0 + scoremed := obistats.Median(score) + scorethr := 1 - 3*(1-scoremed) + mednorm := (scoremed - scorethr) / 2.0 - for i, s := range score { - switch { - case s < scorethr: - score[i] = -1.0 - case s < scoremed: - score[i] = (s-scorethr)/mednorm - 1.0 - default: - score[i] = 1.0 - } - } - - // Tylos - for i, sa := range sequences { - ngroup := float64(sa.Count()) - ss := make([]float64, 0, n-1) - sc := sa.Count() - for j, sb := range sequences { - if i == j { - continue + for i, s := range score { + switch { + case s < scorethr: + score[i] = -1.0 + case s < scoremed: + score[i] = (s-scorethr)/mednorm - 1.0 + default: + score[i] = 1.0 } - - ss = append(ss, score[diagCoord(i, j, n)]) - sc += sb.Count() - nt, _ := sb.GetFloatAttribute("obicleandb_trusted_on") - ngroup += score[diagCoord(i, j, n)] * nt } - ngroup = max(0, ngroup) - sa.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(ngroup+1)) - sa.SetAttribute("obicleandb_trusted_on", ngroup) - sa.SetAttribute("obicleandb_median", scoremed) - sa.SetAttribute("obicleandb_ss", ss) + + // Tylos + for i, sa := range sequences { + ngroup := float64(sa.Count()) + ss := make(map[string]float64, n-1) + sc := sa.Count() + for j, sb := range sequences { + if i == j { + continue + } + + ss[sb.Id()] = score[diagCoord(i, j, n)] + sc += sb.Count() + nt, _ := sb.GetFloatAttribute("obicleandb_trusted_on") + ngroup += score[diagCoord(i, j, n)] * nt + } + ngroup = max(0, ngroup) + sa.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(ngroup+1)) + sa.SetAttribute("obicleandb_trusted_on", ngroup) + sa.SetAttribute("obicleandb_median", scoremed) + sa.SetAttribute("obicleandb_scores", ss) + } + } else { + sequences[0].SetAttribute("obicleandb_median", 1.0) } return sequences, nil diff --git a/pkg/obiutils/goutils.go b/pkg/obiutils/goutils.go index 0dbc8fd..1bc97b9 100644 --- a/pkg/obiutils/goutils.go +++ b/pkg/obiutils/goutils.go @@ -418,6 +418,10 @@ func IsASlice(value interface{}) bool { return reflect.TypeOf(value).Kind() == reflect.Slice } +func IsIntegral(val float64) bool { + return val == float64(int(val)) +} + // HasLength checks if the given value has a length. // // value: The value to be checked.