mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Force sequence reading to produce lowercase sequences.
Add two columns (Father_id and Father_status) to the obiclean ratio CSV file.
This commit is contained in:
@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"runtime/pprof"
|
||||
@ -37,4 +38,6 @@ func main() {
|
||||
identified := obitag.AssignTaxonomy(fs)
|
||||
|
||||
obiconvert.WriteBioSequences(identified, true)
|
||||
|
||||
fmt.Println("")
|
||||
}
|
||||
|
@ -66,7 +66,56 @@ func InterfaceToInt(i interface{}) (val int, err error) {
|
||||
case uint64:
|
||||
val = int(t) // standardizes across systems
|
||||
default:
|
||||
err = &NotABoolean{"value attribute cannot be casted to an integer"}
|
||||
err = &NotAnInteger{"value attribute cannot be casted to an integer"}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// NotAnFloat64 is the error type returned when a value cannot be
// converted to a float64.
type NotAnFloat64 struct {
	message string
}

// Error returns the message carried by the NotAnFloat64 error.
// Having this method makes *NotAnFloat64 satisfy the error interface.
func (m *NotAnFloat64) Error() string {
	return m.message
}

// InterfaceToFloat64 converts an interface{} to a float64 value if possible.
//
// Every built-in signed and unsigned integer type, as well as float32 and
// float64, is accepted. For any other dynamic type a *NotAnFloat64 error is
// returned via err and val is set to 0.
func InterfaceToFloat64(i interface{}) (val float64, err error) {
	switch t := i.(type) {
	case int:
		val = float64(t)
	case int8:
		val = float64(t)
	case int16:
		val = float64(t)
	case int32:
		val = float64(t)
	case int64:
		val = float64(t)
	case float32:
		val = float64(t)
	case float64:
		val = t
	case uint:
		// Plain uint is accepted like the fixed-width unsigned types below.
		val = float64(t)
	case uint8:
		val = float64(t)
	case uint16:
		val = float64(t)
	case uint32:
		val = float64(t)
	case uint64:
		val = float64(t)
	default:
		err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
	}

	return
}
|
||||
@ -109,6 +158,45 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
// NotABoolean defines a new type of Error : "NotAMapInt"
|
||||
type NotAMapFloat64 struct {
|
||||
message string
|
||||
}
|
||||
|
||||
// Error() retreives the error message associated to the "NotAnInteger"
|
||||
// error. Tha addition of that Error message make the "NotAnInteger"
|
||||
// complying with the error interface
|
||||
func (m *NotAMapFloat64) Error() string {
|
||||
return m.message
|
||||
}
|
||||
|
||||
func InterfaceToFloat64Map(i interface{}) (val map[string]float64, err error) {
|
||||
err = nil
|
||||
|
||||
switch i := i.(type) {
|
||||
case map[string]float64:
|
||||
val = i
|
||||
case map[string]interface{}:
|
||||
val = make(map[string]float64, len(i))
|
||||
for k, v := range i {
|
||||
val[k], err = InterfaceToFloat64(v)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
case map[string]int:
|
||||
val = make(map[string]float64, len(i))
|
||||
for k, v := range i {
|
||||
val[k] = float64(v)
|
||||
}
|
||||
default:
|
||||
err = &NotAMapFloat64{"value attribute cannot be casted to a map[string]float64"}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
// NotABoolean defines a new type of Error : "NotABoolean"
|
||||
type NotABoolean struct {
|
||||
message string
|
||||
|
@ -199,7 +199,6 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64
|
||||
Sleft = _notavail
|
||||
default:
|
||||
Sdiag = previous[x]
|
||||
|
||||
if bA[j-1] == bB[i-1] {
|
||||
Sdiag = _incscore(Sdiag)
|
||||
}
|
||||
|
@ -5,6 +5,131 @@ import (
|
||||
"github.com/PaesslerAG/gval"
|
||||
)
|
||||
|
||||
// maxIntVector returns the largest element of values as a float64.
// An empty (or nil) slice yields 0 instead of panicking.
func maxIntVector(values []int) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v > m {
			m = v
		}
	}

	return float64(m)
}
|
||||
|
||||
// maxIntMap returns the largest value stored in values as a float64.
// Keys are ignored. An empty (or nil) map yields 0.
func maxIntMap(values map[string]int) float64 {
	var m int
	first := true

	for _, v := range values {
		// The first value seen seeds the maximum, whatever its sign.
		if first || v > m {
			m = v
			first = false
		}
	}

	return float64(m)
}
|
||||
|
||||
// minIntVector returns the smallest element of values as a float64.
// An empty (or nil) slice yields 0 instead of panicking.
func minIntVector(values []int) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v < m {
			m = v
		}
	}

	return float64(m)
}
|
||||
|
||||
// minIntMap returns the smallest value stored in values as a float64.
// Keys are ignored. An empty (or nil) map yields 0.
func minIntMap(values map[string]int) float64 {
	var m int
	first := true

	for _, v := range values {
		// The first value seen seeds the minimum, whatever its sign.
		if first || v < m {
			m = v
			first = false
		}
	}

	return float64(m)
}
|
||||
|
||||
|
||||
// maxFloatVector returns the largest element of values.
// An empty (or nil) slice yields 0 instead of panicking.
func maxFloatVector(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v > m {
			m = v
		}
	}

	return m
}
|
||||
|
||||
// maxFloatMap returns the largest value stored in values.
// Keys are ignored. An empty (or nil) map yields 0.
func maxFloatMap(values map[string]float64) float64 {
	var m float64
	first := true

	for _, v := range values {
		// The first value seen seeds the maximum, whatever its sign.
		if first || v > m {
			m = v
			first = false
		}
	}

	return m
}
|
||||
|
||||
// minFloatVector returns the smallest element of values.
// An empty (or nil) slice yields 0 instead of panicking.
func minFloatVector(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v < m {
			m = v
		}
	}

	return m
}
|
||||
|
||||
// minFloatMap returns the smallest value stored in values.
// Keys are ignored. An empty (or nil) map yields 0.
func minFloatMap(values map[string]float64) float64 {
	var m float64
	first := true

	for _, v := range values {
		// The first value seen seeds the minimum, whatever its sign.
		if first || v < m {
			m = v
			first = false
		}
	}

	return m
}
|
||||
|
||||
// func maxNumeric(args ...interface{}) (interface{}, error) {
|
||||
// var m float64
|
||||
// first := true
|
||||
|
||||
// for _, v := range args {
|
||||
// switch {
|
||||
// case
|
||||
// }
|
||||
// }
|
||||
|
||||
// }
|
||||
|
||||
var OBILang = gval.NewLanguage(
|
||||
gval.Full(),
|
||||
gval.Function("len", func(args ...interface{}) (interface{}, error) {
|
||||
|
@ -1,6 +1,7 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
@ -67,7 +68,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error)
|
||||
comment = strings.TrimSpace(record[19])
|
||||
}
|
||||
|
||||
bseq := obiseq.NewBioSequence(name, sequence, comment)
|
||||
bseq := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
|
||||
annotation := bseq.Annotations()
|
||||
|
||||
annotation["ac"] = name
|
||||
|
@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
|
||||
}
|
||||
case line == "//":
|
||||
sequence := obiseq.NewBioSequence(id,
|
||||
seqBytes.Bytes(),
|
||||
bytes.ToLower(seqBytes.Bytes()),
|
||||
defBytes.String())
|
||||
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
|
@ -7,6 +7,7 @@ package obiformats
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"unsafe"
|
||||
@ -38,7 +39,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
|
||||
comment = ""
|
||||
}
|
||||
|
||||
rep := obiseq.NewBioSequence(name, sequence, comment)
|
||||
rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
|
||||
|
||||
if s.qual.l > C.ulong(0) {
|
||||
cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))
|
||||
|
@ -84,7 +84,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
|
||||
|
||||
case line == "//":
|
||||
sequence := obiseq.NewBioSequence(id,
|
||||
seqBytes.Bytes(),
|
||||
bytes.ToLower(seqBytes.Bytes()),
|
||||
defBytes.String())
|
||||
state = inHeader
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
"strconv"
|
||||
@ -370,7 +371,7 @@ func (s *BioSequence) SetSequence(sequence []byte) {
|
||||
if s.sequence != nil {
|
||||
RecycleSlice(&s.sequence)
|
||||
}
|
||||
s.sequence = sequence
|
||||
s.sequence = bytes.ToLower(sequence)
|
||||
}
|
||||
|
||||
// Setting the qualities of the BioSequence.
|
||||
|
@ -209,9 +209,7 @@ func ExpressionPredicat(expression string) SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
value, err := exp.EvalBool(context.Background(),
|
||||
map[string]interface{}{
|
||||
"annot": sequence.Annotations(),
|
||||
"count": sequence.Count(),
|
||||
"seqlength": sequence.Len(),
|
||||
"annotations": sequence.Annotations(),
|
||||
"sequence": sequence,
|
||||
},
|
||||
)
|
||||
|
@ -18,6 +18,8 @@ import (
|
||||
|
||||
type Ratio struct {
|
||||
Sample string
|
||||
SeqID string
|
||||
status string
|
||||
From int
|
||||
To int
|
||||
CFrom int
|
||||
@ -97,12 +99,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
|
||||
|
||||
bar := progressbar.NewOptions(len(data), pbopt...)
|
||||
|
||||
fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
|
||||
fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
|
||||
for code, dist := range data {
|
||||
a1, a2 := intToNucPair(code)
|
||||
for _, ratio := range dist {
|
||||
fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
|
||||
fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
|
||||
ratio.Sample,
|
||||
ratio.SeqID,
|
||||
ratio.status,
|
||||
a1, a2,
|
||||
ratio.From,
|
||||
ratio.To,
|
||||
@ -463,7 +467,13 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
|
||||
for _, edge := range seq.Edges {
|
||||
father := (*seqs)[edge.Father]
|
||||
if father.Weight >= minStatRatio && edge.Dist == 1 {
|
||||
ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Len()})
|
||||
ratio[edge.NucPair] = append(ratio[edge.NucPair],
|
||||
Ratio{name,
|
||||
father.Sequence.Id(), Status(father.Sequence)[name],
|
||||
father.Weight, seq.Weight,
|
||||
father.Count, seq.Count,
|
||||
edge.Pos,
|
||||
father.Sequence.Len()})
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,7 @@ type seqPCR struct {
|
||||
SonCount int
|
||||
AddedSons int
|
||||
Edges []Edge
|
||||
Cluster map[int]bool // used as the set of head sequences associated to that sequence
|
||||
}
|
||||
|
||||
// buildSamples sorts the sequences by samples
|
||||
@ -183,13 +184,53 @@ func GetMutation(sequence *obiseq.BioSequence) map[string]string {
|
||||
return mutation
|
||||
}
|
||||
|
||||
func GetCluster(sequence *obiseq.BioSequence) map[string]string {
|
||||
annotation := sequence.Annotations()
|
||||
icluster, ok := annotation["obiclean_cluster"]
|
||||
var cluster map[string]string
|
||||
|
||||
if ok {
|
||||
switch icluster := icluster.(type) {
|
||||
case map[string]string:
|
||||
cluster = icluster
|
||||
case map[string]interface{}:
|
||||
cluster = make(map[string]string)
|
||||
for k, v := range icluster {
|
||||
cluster[k] = fmt.Sprint(v)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cluster = make(map[string]string)
|
||||
annotation["obiclean_cluster"] = cluster
|
||||
}
|
||||
|
||||
return cluster
|
||||
}
|
||||
|
||||
|
||||
// func Cluster(sample map[string]*([]*seqPCR)) {
|
||||
// for _, graph := range sample {
|
||||
// for _, s := range *graph {
|
||||
// cluster := GetCluster(s.Sequence)
|
||||
// if len(s.Edges) > 0 {
|
||||
// for _, f := range s.Edges {
|
||||
|
||||
// }
|
||||
// } else {
|
||||
// cluster
|
||||
// }
|
||||
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
func Mutation(sample map[string]*([]*seqPCR)) {
|
||||
for _, graph := range sample {
|
||||
for _, s := range *graph {
|
||||
for _, f := range s.Edges {
|
||||
id := (*graph)[f.Father].Sequence.Id()
|
||||
GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d",
|
||||
f.From, f.To, f.Pos + 1)
|
||||
f.From, f.To, f.Pos+1)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -277,14 +318,6 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||
}
|
||||
}
|
||||
|
||||
if IsSaveRatioTable() {
|
||||
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
|
||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
||||
}
|
||||
|
||||
if SaveGraphToFiles() {
|
||||
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||
}
|
||||
|
||||
Mutation(samples)
|
||||
|
||||
@ -310,6 +343,16 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||
bar.Add(1)
|
||||
}
|
||||
|
||||
if SaveGraphToFiles() {
|
||||
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||
}
|
||||
|
||||
if IsSaveRatioTable() {
|
||||
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
|
||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
||||
}
|
||||
|
||||
|
||||
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
|
||||
|
||||
if OnlyHead() {
|
||||
|
@ -56,5 +56,5 @@ func IExtractBarcode(iterator obiiter.IBioSequenceBatch) (obiiter.IBioSequenceBa
|
||||
}
|
||||
log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers())
|
||||
|
||||
return newIter, nil
|
||||
return newIter.Speed("Demultiplexing"), nil
|
||||
}
|
||||
|
@ -66,7 +66,9 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
|
||||
// log.Println(sequence.Id(),cw[j], maxe)
|
||||
if runExact || (atMost <= (maxe + 1)) {
|
||||
// if true {
|
||||
lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
|
||||
// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
|
||||
// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
|
||||
n++
|
||||
if lcs == -1 {
|
||||
|
Reference in New Issue
Block a user