Adds a reader for NGS filter files and changes some of the API of the apat library

This commit is contained in:
2022-01-18 13:09:32 +01:00
parent 6571296bb2
commit 4551df08b1
13 changed files with 301 additions and 96 deletions

View File

@@ -184,8 +184,8 @@ func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceB
newIter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
// newIter.Add(opt.ParallelWorkers())
newIter.Add(2)
nworkers := opt.ParallelWorkers()
newIter.Add(nworkers)
go func() {
newIter.Wait()
@@ -196,7 +196,7 @@ func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceB
}()
// for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < 2; j++ {
for j := 0; j < nworkers; j++ {
go _ParseEmblFile(entry_channel, newIter)
}

View File

@@ -173,16 +173,11 @@ func __is_false__(text []byte) bool {
bytes.Equal(text, __FALSE__)
}
func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) {
definition := []byte(sequence.Definition())
annotations := sequence.Annotations()
// all_matches := __obi_header_pattern__.FindAllSubmatchIndex(definition, -1)
func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
definition := []byte(text)
d := definition
//for m := __obi_header_key_pattern__.FindIndex(definition); len(m) > 0; {
//fmt.Println(string(definition[0:20]), __match__key__(definition))
for m := __match__key__(definition); len(m) > 0; {
var bvalue []byte
var value interface{}
@@ -263,7 +258,16 @@ func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) {
m = __match__key__(d)
}
sequence.SetDefinition(string(bytes.TrimSpace(d)))
return string(bytes.TrimSpace(d))
}
// ParseFastSeqOBIHeader hands the definition of sequence, together with the
// sequence's annotations, to ParseOBIFeatures and stores the string it
// returns back as the definition of the sequence.
func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) {
	annots := sequence.Annotations()
	sequence.SetDefinition(ParseOBIFeatures(sequence.Definition(), annots))
}
func FormatFastSeqOBIHeader(sequence obiseq.BioSequence) string {

View File

@@ -120,10 +120,10 @@ func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
}
log.Println("Start of the fasta file writing")
go ff(iterator)
for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split())
}
go ff(iterator)
next_to_send := 0
received := make(map[int]FileChunck, 100)

View File

@@ -122,10 +122,10 @@ func WriteFastqBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
}
log.Println("Start of the fastq file writing")
go ff(iterator)
for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split())
}
go ff(iterator)
next_to_send := 0
received := make(map[int]FileChunck, 100)

View File

@@ -0,0 +1,133 @@
package obiformats
import (
"bufio"
"fmt"
"io"
"strings"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
// PrimerPair groups a forward and a reverse primer. Both fields are plain
// strings, so the type is comparable and is used as the key of NGSFilter.
type PrimerPair struct {
	Forward, Reverse string
}
// TagPair groups a forward and a reverse tag. Both fields are plain strings,
// so the type is comparable and is used as the key of PCRs.
type TagPair struct {
	Forward, Reverse string
}
// PCR is the record stored by ReadNGSFilter for one line of an NGS filter
// file.
type PCR struct {
// Experiment is the first whitespace-separated column of the line.
Experiment string
// Sample is the second whitespace-separated column of the line.
Sample string
// Partial is true when the sixth column of the line is "T" or "t".
Partial bool
// Annotations is nil unless the line carries non-empty text after its
// first "@"; in that case it holds what ParseOBIFeatures extracted
// from that text.
Annotations obiseq.Annotation
}
// PCRs maps a pair of tags to the PCR registered for it.
type PCRs map[TagPair]PCR
// NGSFilter maps a pair of primers to the PCRs registered for it, themselves
// indexed by pair of tags.
type NGSFilter map[PrimerPair]PCRs
// _readLines reads reader until bufio.Reader.ReadLine reports an error
// (normally io.EOF) and returns every non-blank line, stripped of its
// leading and trailing whitespace. The read error itself is not reported.
//
// A line longer than the bufio buffer is delivered by ReadLine in several
// chunks (isPrefix set); the chunks are accumulated until the line is
// complete. The accumulator is reset after every complete line, blank or
// not, so that a whitespace-only line can neither leak into the following
// line nor be returned as a trailing whitespace-only entry.
func _readLines(reader io.Reader) []string {
	r := bufio.NewReader(reader)
	var pending []byte
	lines := []string{}

	for {
		chunk, isPrefix, err := r.ReadLine()
		if err != nil {
			break
		}

		// chunk is only valid until the next read: copy it.
		pending = append(pending, chunk...)
		if isPrefix {
			continue
		}

		if str := strings.TrimSpace(string(pending)); len(str) > 0 {
			lines = append(lines, str)
		}
		pending = pending[:0]
	}

	// Only reached with data when reading failed in the middle of a long
	// line: keep what was read, with the same trimming as any other line.
	if str := strings.TrimSpace(string(pending)); len(str) > 0 {
		lines = append(lines, str)
	}

	return lines
}
// _parseMainNGSFilterTags decodes the tag column of an NGS filter line.
//
// The text is split on ":". When it contains no ":", the whole text is used
// unchanged as both the forward and the reverse tag. Otherwise the first two
// parts are the forward and the reverse tag, further parts are ignored, and
// a part equal to "-" is replaced by the empty string.
func _parseMainNGSFilterTags(text string) TagPair {
	parts := strings.Split(text, ":")
	if len(parts) < 2 {
		return TagPair{Forward: parts[0], Reverse: parts[0]}
	}

	dashToEmpty := func(tag string) string {
		if tag == "-" {
			return ""
		}
		return tag
	}

	return TagPair{
		Forward: dashToEmpty(parts[0]),
		Reverse: dashToEmpty(parts[1]),
	}
}
// _parseMainNGSFilter decodes the main part of an NGS filter line.
//
// text is split on whitespace into columns, read as follows:
//
//	0: experiment   1: sample   2: tags   3: forward primer
//	4: reverse primer   5: partial flag ("T" or "t" means true)
//
// It returns, in this order, the primer pair, the tag pair decoded by
// _parseMainNGSFilterTags, the experiment, the sample and the partial flag.
// The columns are indexed without any length check: text must hold at least
// six columns.
func _parseMainNGSFilter(text string) (PrimerPair, TagPair, string, string, bool) {
	columns := strings.Fields(text)
	tagPair := _parseMainNGSFilterTags(columns[2])

	var partial bool
	switch columns[5] {
	case "T", "t":
		partial = true
	}

	primers := PrimerPair{Forward: columns[3], Reverse: columns[4]}
	experiment, sample := columns[0], columns[1]

	return primers, tagPair, experiment, sample, partial
}
// ReadNGSFilter reads an NGS filter description from reader.
//
// Blank lines and lines starting with "#" are ignored. Every other line is
// cut at its first "@": the left part is decoded by _parseMainNGSFilter and
// must therefore hold at least six whitespace-separated columns; the right
// part, when present and non-empty, is parsed by ParseOBIFeatures into a
// fresh annotation obtained from obiseq.GetAnnotation.
//
// The result indexes each PCR first by primer pair, then by tag pair.
//
// An error, with a nil filter, is returned when a line has fewer than six
// columns or when the same tag pair occurs twice for the same primer pair.
func ReadNGSFilter(reader io.Reader) (NGSFilter, error) {
	ngsfilter := make(NGSFilter, 10)

	for _, line := range _readLines(reader) {
		line = strings.TrimSpace(line)
		if len(line) == 0 || strings.HasPrefix(line, "#") {
			continue
		}

		split := strings.SplitN(line, "@", 2)

		// _parseMainNGSFilter indexes columns 0 to 5 unconditionally:
		// reject short lines here rather than letting it panic.
		if n := len(strings.Fields(split[0])); n < 6 {
			return nil, fmt.Errorf("ngsfilter line %q has %d columns, at least 6 are expected",
				line, n)
		}

		primers, tags, experiment, sample, partial := _parseMainNGSFilter(split[0])

		newPCR := PCR{
			Experiment:  experiment,
			Sample:      sample,
			Partial:     partial,
			Annotations: nil,
		}

		if len(split) > 1 && len(split[1]) > 0 {
			newPCR.Annotations = obiseq.GetAnnotation()
			// Only the annotations filled in are of interest; the
			// returned remainder of the text is dropped.
			ParseOBIFeatures(split[1], newPCR.Annotations)
		}

		samples, ok := ngsfilter[primers]
		if !ok {
			samples = make(PCRs, 1000)
			ngsfilter[primers] = samples
		}

		if pcr, ok := samples[tags]; ok {
			return nil, fmt.Errorf("pair of tags %v used for samples %s in %s and %s in %s",
				tags, sample, experiment, pcr.Sample, pcr.Experiment)
		}

		samples[tags] = newPCR
	}

	return ngsfilter, nil
}

View File

@@ -56,16 +56,23 @@ func WriteSequenceBatch(iterator obiseq.IBioSequenceBatch,
file io.Writer,
options ...WithOption) (obiseq.IBioSequenceBatch, error) {
var newIter obiseq.IBioSequenceBatch
var err error
iterator = iterator.Rebatch(1000)
ok := iterator.Next()
if ok {
iterator.PushBack()
batch := iterator.Get()
if batch.Slice()[0].HasQualities() {
newIter, err = WriteFastqBatch(iterator, file, options...)
iterator.PushBack()
var newIter obiseq.IBioSequenceBatch
var err error
if len(batch.Slice()) > 0 {
if batch.Slice()[0].HasQualities() {
newIter, err = WriteFastqBatch(iterator, file, options...)
} else {
newIter, err = WriteFastaBatch(iterator, file, options...)
}
} else {
newIter, err = WriteFastaBatch(iterator, file, options...)
}