mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-07-18 17:40:45 +00:00
Add a reading option on readers to convert U to T
This commit is contained in:
@@ -87,7 +87,7 @@ func EndOfLastFlatFileEntry(buff []byte) int {
|
|||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
func EmblChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
scanner := bufio.NewScanner(input)
|
scanner := bufio.NewScanner(input)
|
||||||
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
||||||
@@ -128,6 +128,9 @@ func EmblChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioS
|
|||||||
parts := strings.SplitN(line[5:], " ", 7)
|
parts := strings.SplitN(line[5:], " ", 7)
|
||||||
np := len(parts) - 1
|
np := len(parts) - 1
|
||||||
for i := 0; i < np; i++ {
|
for i := 0; i < np; i++ {
|
||||||
|
if UtoT {
|
||||||
|
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
||||||
|
}
|
||||||
seqBytes.WriteString(parts[i])
|
seqBytes.WriteString(parts[i])
|
||||||
}
|
}
|
||||||
case line == "//":
|
case line == "//":
|
||||||
@@ -161,10 +164,10 @@ func EmblChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioS
|
|||||||
func _ParseEmblFile(
|
func _ParseEmblFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
withFeatureTable bool,
|
withFeatureTable, UtoT bool,
|
||||||
) {
|
) {
|
||||||
|
|
||||||
parser := EmblChunkParser(withFeatureTable)
|
parser := EmblChunkParser(withFeatureTable, UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
order := chunks.Order
|
order := chunks.Order
|
||||||
@@ -206,6 +209,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
|||||||
entry_channel,
|
entry_channel,
|
||||||
newIter,
|
newIter,
|
||||||
opt.WithFeatureTable(),
|
opt.WithFeatureTable(),
|
||||||
|
opt.UtoT(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -39,7 +39,7 @@ func EndOfLastFastaEntry(buffer []byte) int {
|
|||||||
return last
|
return last
|
||||||
}
|
}
|
||||||
|
|
||||||
func FastaChunkParser() func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
|
|
||||||
parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
var identifier string
|
var identifier string
|
||||||
@@ -131,7 +131,9 @@ func FastaChunkParser() func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
|||||||
if C >= 'A' && C <= 'Z' {
|
if C >= 'A' && C <= 'Z' {
|
||||||
C = C + 'a' - 'A'
|
C = C + 'a' - 'A'
|
||||||
}
|
}
|
||||||
|
if UtoT && C == 'u' {
|
||||||
|
C = 't'
|
||||||
|
}
|
||||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||||
seqBytes.WriteByte(C)
|
seqBytes.WriteByte(C)
|
||||||
} else {
|
} else {
|
||||||
@@ -170,6 +172,9 @@ func FastaChunkParser() func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
|||||||
if C >= 'A' && C <= 'Z' {
|
if C >= 'A' && C <= 'Z' {
|
||||||
C = C + 'a' - 'A'
|
C = C + 'a' - 'A'
|
||||||
}
|
}
|
||||||
|
if UtoT && C == 'u' {
|
||||||
|
C = 't'
|
||||||
|
}
|
||||||
// Removing white space from the sequence
|
// Removing white space from the sequence
|
||||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||||
seqBytes.WriteByte(C)
|
seqBytes.WriteByte(C)
|
||||||
@@ -207,9 +212,10 @@ func FastaChunkParser() func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
|||||||
func _ParseFastaFile(
|
func _ParseFastaFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
|
UtoT bool,
|
||||||
) {
|
) {
|
||||||
|
|
||||||
parser := FastaChunkParser()
|
parser := FastaChunkParser(UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||||
@@ -243,7 +249,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
|
|
||||||
for i := 0; i < nworker; i++ {
|
for i := 0; i < nworker; i++ {
|
||||||
out.Add(1)
|
out.Add(1)
|
||||||
go _ParseFastaFile(chkchan, out)
|
go _ParseFastaFile(chkchan, out, opt.UtoT())
|
||||||
}
|
}
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
|
@@ -131,7 +131,7 @@ func _storeSequenceQuality(bytes *bytes.Buffer, out *obiseq.BioSequence, quality
|
|||||||
out.SetQualities(q)
|
out.SetQualities(q)
|
||||||
}
|
}
|
||||||
|
|
||||||
func FastqChunkParser(quality_shift byte, with_quality bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
|
|
||||||
var identifier string
|
var identifier string
|
||||||
@@ -209,6 +209,9 @@ func FastqChunkParser(quality_shift byte, with_quality bool) func(string, io.Rea
|
|||||||
if C >= 'A' && C <= 'Z' {
|
if C >= 'A' && C <= 'Z' {
|
||||||
C = C + 'a' - 'A'
|
C = C + 'a' - 'A'
|
||||||
}
|
}
|
||||||
|
if C == 'u' && UtoT {
|
||||||
|
C = 't'
|
||||||
|
}
|
||||||
seqBytes.Reset()
|
seqBytes.Reset()
|
||||||
seqBytes.WriteByte(C)
|
seqBytes.WriteByte(C)
|
||||||
state = 6
|
state = 6
|
||||||
@@ -228,6 +231,9 @@ func FastqChunkParser(quality_shift byte, with_quality bool) func(string, io.Rea
|
|||||||
if C >= 'A' && C <= 'Z' {
|
if C >= 'A' && C <= 'Z' {
|
||||||
C = C + 'a' - 'A'
|
C = C + 'a' - 'A'
|
||||||
}
|
}
|
||||||
|
if C == 'u' && UtoT {
|
||||||
|
C = 't'
|
||||||
|
}
|
||||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||||
seqBytes.WriteByte(C)
|
seqBytes.WriteByte(C)
|
||||||
} else {
|
} else {
|
||||||
@@ -301,10 +307,10 @@ func _ParseFastqFile(
|
|||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
quality_shift byte,
|
quality_shift byte,
|
||||||
with_quality bool,
|
with_quality, UtoT bool,
|
||||||
) {
|
) {
|
||||||
|
|
||||||
parser := FastqChunkParser(quality_shift, with_quality)
|
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||||
@@ -342,6 +348,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
out,
|
out,
|
||||||
obidefault.ReadQualitiesShift(),
|
obidefault.ReadQualitiesShift(),
|
||||||
opt.ReadQualities(),
|
opt.ReadQualities(),
|
||||||
|
opt.UtoT(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -29,7 +29,7 @@ const (
|
|||||||
|
|
||||||
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||||
|
|
||||||
func GenbankChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
state := inHeader
|
state := inHeader
|
||||||
scanner := bufio.NewReader(input)
|
scanner := bufio.NewReader(input)
|
||||||
@@ -165,6 +165,9 @@ func GenbankChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.B
|
|||||||
parts := strings.SplitN(line[10:], " ", 6)
|
parts := strings.SplitN(line[10:], " ", 6)
|
||||||
lparts := len(parts)
|
lparts := len(parts)
|
||||||
for i := 0; i < lparts; i++ {
|
for i := 0; i < lparts; i++ {
|
||||||
|
if UtoT {
|
||||||
|
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
||||||
|
}
|
||||||
seqBytes.WriteString(parts[i])
|
seqBytes.WriteString(parts[i])
|
||||||
}
|
}
|
||||||
processed = true
|
processed = true
|
||||||
@@ -200,9 +203,9 @@ func GenbankChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.B
|
|||||||
|
|
||||||
func _ParseGenbankFile(input ChannelFileChunk,
|
func _ParseGenbankFile(input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
withFeatureTable bool) {
|
withFeatureTable, UtoT bool) {
|
||||||
|
|
||||||
parser := GenbankChunkParser(withFeatureTable)
|
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||||
@@ -242,6 +245,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
|||||||
entry_channel,
|
entry_channel,
|
||||||
newIter,
|
newIter,
|
||||||
opt.WithFeatureTable(),
|
opt.WithFeatureTable(),
|
||||||
|
opt.UtoT(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user