First commit

This commit is contained in:
2022-01-13 23:27:39 +01:00
parent dab6549cad
commit f53bf1b804
93 changed files with 11042 additions and 0 deletions

View File

@ -0,0 +1,18 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
)
// main builds an option parser from obiconvert.OptionSet, parses os.Args,
// reads the sequences designated by the remaining arguments and passes them
// to obiconvert.WriteBioSequences.
//
// NOTE(review): the discarded return values are presumably errors — confirm
// and handle them.
func main() {
	parseOptions := obioptions.GenerateOptionParser(obiconvert.OptionSet)
	_, files, _ := parseOptions(os.Args)

	sequences, _ := obiconvert.ReadBioSequences(files...)
	obiconvert.WriteBioSequences(sequences)
}

View File

@ -0,0 +1,65 @@
package main
import (
"fmt"
"log"
"os"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obicount"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
)
// main counts the sequences read from the files given on the command line and
// prints, on a single line, the counters selected through the obicount
// options: number of variants (records), number of reads (sum of Count()) and
// number of symbols (sum of Length()).
//
// An execution trace is always written to "cpu.trace" in the working
// directory (inspect it with `go tool trace cpu.trace`).
func main() {

	// CPU profiling alternative, kept disabled:
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	ftrace, err := os.Create("cpu.trace")
	if err != nil {
		log.Fatal(err)
	}
	// Deferred calls run in LIFO order: the trace is stopped (and flushed)
	// before the file is closed.
	defer ftrace.Close()

	// trace.Start fails when tracing is already enabled; do not ignore it.
	if err := trace.Start(ftrace); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	option_parser := obioptions.GenerateOptionParser(
		obiconvert.InputOptionSet,
		obicount.OptionSet,
	)

	_, args, _ := option_parser(os.Args)

	fs, _ := obiconvert.ReadBioSequences(args...)

	nread := 0
	nvariant := 0
	nsymbol := 0
	for fs.Next() {
		s := fs.Get()
		if s.IsNil() {
			log.Panicln("Read sequence is nil")
		}
		nread += s.Count()
		nvariant++
		nsymbol += s.Length()
	}

	if obicount.IsPrintingVariantCount() {
		fmt.Printf(" %d", nvariant)
	}
	if obicount.IsPrintingReadCount() {
		fmt.Printf(" %d", nread)
	}
	if obicount.IsPrintingSymbolCount() {
		fmt.Printf(" %d", nsymbol)
	}
	fmt.Printf("\n")
}

View File

@ -0,0 +1,68 @@
package main
import (
"fmt"
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obifind"
)
// main queries the selected taxonomy in one of three modes:
//   - when obifind.RequestsPathForTaxid() is >= 0, it writes the path of that
//     taxon;
//   - when no positional argument is given, it writes every taxon of the
//     taxonomy that passes the restrictions;
//   - otherwise each positional argument is used as a name pattern and the
//     matching, restricted taxa are written.
//
// Any error aborts the program: continuing after a failed step would only
// dereference values that were never initialised.
func main() {
	option_parser := obioptions.GenerateOptionParser(obifind.OptionSet)

	_, args, _ := option_parser(os.Args)

	//prof, _ := os.Create("obifind.prof")
	//pprof.StartCPUProfile(prof)

	// fail reports err on standard error and terminates with a non-zero
	// status.
	fail := func(err interface{}) {
		fmt.Fprintf(os.Stderr, "%+v\n", err)
		os.Exit(1)
	}

	restrictions, err := obifind.ITaxonRestrictions()
	if err != nil {
		fail(err)
	}

	switch {
	case obifind.RequestsPathForTaxid() >= 0:
		taxonomy, err := obifind.LoadSelectedTaxonomy()
		if err != nil {
			fail(err)
		}

		taxon, err := taxonomy.Taxon(obifind.RequestsPathForTaxid())
		if err != nil {
			fail(err)
		}

		s, err := taxon.Path()
		if err != nil {
			fail(err)
		}

		obifind.TaxonWriter(s.Iterator(),
			fmt.Sprintf("path:%d", taxon.Taxid()))

	case len(args) == 0:
		taxonomy, err := obifind.LoadSelectedTaxonomy()
		if err != nil {
			fail(err)
		}

		obifind.TaxonWriter(restrictions(taxonomy.Iterator()), "")

	default:
		matcher, err := obifind.ITaxonNameMatcher()
		if err != nil {
			fail(err)
		}

		for _, pattern := range args {
			s := restrictions(matcher(pattern))
			obifind.TaxonWriter(s, pattern)
		}
	}

	//pprof.StopCPUProfile()
}

View File

@ -0,0 +1,38 @@
package main
import (
"log"
"os"
"runtime/pprof"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obipairing"
)
// main assembles paired-end reads: it parses the obipairing options, obtains
// the batches of read pairs, assembles them with
// obipairing.IAssemblePESequencesBatch and writes the result as FASTQ on
// standard output.
//
// A CPU profile is always written to "cpu.pprof" in the working directory:
//
//	go tool pprof -http=":8000" ./obipairing ./cpu.pprof
func main() {

	f, err := os.Create("cpu.pprof")
	if err != nil {
		log.Fatal(err)
	}
	// Deferred calls run in LIFO order: profiling is stopped (and flushed)
	// before the file is closed.
	defer f.Close()

	// StartCPUProfile fails when profiling is already enabled; do not ignore it.
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()

	// Execution trace alternative, kept disabled:
	// go tool trace cpu.trace
	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	option_parser := obioptions.GenerateOptionParser(obipairing.OptionSet)

	option_parser(os.Args)

	pairs, _ := obipairing.IBatchPairedSequence()
	// NOTE(review): the meaning of the literal arguments (2, 50, 20, true) is
	// defined by IAssemblePESequencesBatch, which is not visible here.
	paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true)
	written, _ := obiformats.WriteFastqBatchToStdout(paired)
	written.Destroy()
}

View File

@ -0,0 +1,34 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obipcr"
)
// main parses the obipcr options, reads the sequences designated by the
// remaining arguments in batches, runs obipcr.PCR on them and writes the
// resulting amplicons with obiconvert.WriteBioSequences.
//
// NOTE(review): the discarded return values are presumably errors — confirm
// and handle them.
func main() {

	// Profiling hooks, kept disabled:
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	parseOptions := obioptions.GenerateOptionParser(obipcr.OptionSet)
	_, files, _ := parseOptions(os.Args)

	templates, _ := obiconvert.ReadBioSequencesBatch(files...)
	products, _ := obipcr.PCR(templates)
	obiconvert.WriteBioSequences(products)
}

62
cmd/test/main.go Normal file
View File

@ -0,0 +1,62 @@
package main
import (
"fmt"
"log"
"os"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// main is a scratch program exercising the paired-end alignment code on two
// hard-coded sequences: it prints both sequences and their qualities, the
// score and path returned by PELeftAlign and PERightAlign, then sequence A
// before and after ReverseComplement(true), and finally its identifier.
//
// An execution trace is always written to "cpu.trace" in the working
// directory.
func main() {

	ftrace, err := os.Create("cpu.trace")
	if err != nil {
		log.Fatal(err)
	}
	// Deferred calls run in LIFO order: the trace is stopped (and flushed)
	// before the file is closed.
	defer ftrace.Close()

	// trace.Start fails when tracing is already enabled; do not ignore it.
	if err := trace.Start(ftrace); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	// Previous experiment on k-mer based shift detection, kept disabled:
	// option_parser := obioptions.GenerateOptionParser(
	// 	obiconvert.InputOptionSet,
	// )

	//_, args, _ := option_parser(os.Args)

	// fs, _ := obiconvert.ReadBioSequences(args...)
	// buffer := make([]byte, 0)
	// fs.Next()
	// s := fs.Get()
	// index := obikmer.Index4mer(s, nil, nil)
	// for fs.Next() {
	// 	s := fs.Get()
	// 	if s.IsNil() {
	// 		log.Panicln("Read sequence is nil")
	// 	}
	// 	maxshift, maxcount := obikmer.FastShiftFourMer(index, s, buffer)

	// 	fmt.Printf("Shift : %d Score : %d\n", maxshift, maxcount)
	// }

	A := []byte("ccgcctccttagaacaggctcctctagaaaaccatagtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg")
	B := []byte("cgccaccaccgagatctacactctttccctacacgacgctcttccgatctccgcctccttagaacaggctcctctagaaaagcatagtggggtatctaaaggaggcgg")
	sA := obiseq.MakeBioSequence("A", A, "")
	sB := obiseq.MakeBioSequence("B", B, "")

	fmt.Println(string(sA.Sequence()))
	fmt.Println(sA.Qualities())
	fmt.Println(string(sB.Sequence()))
	fmt.Println(sB.Qualities())

	score, path := obialign.PELeftAlign(sA, sB, 2, obialign.NilPEAlignArena)
	fmt.Printf("Score : %d Path : %v\n", score, path)
	score, path = obialign.PERightAlign(sA, sB, 2, obialign.NilPEAlignArena)
	fmt.Printf("Score : %d Path : %v\n", score, path)

	fmt.Println(string(sA.Sequence()))
	sA.ReverseComplement(true)
	fmt.Println(string(sA.Sequence()))
	fmt.Println(string(sA.Id()))
}

58
cmd/test/test_test.go Normal file
View File

@ -0,0 +1,58 @@
package main_test
import (
"fmt"
"testing"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiannot"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// TestParseOBIFasta opens a sequence file, iterates over the batches produced
// by obiannot.ExtractHeaderChannel and prints the number of sequences (nseq)
// and the sum of their Count() values (nread). It makes no assertion.
//
// NOTE(review): the input is an absolute path on a developer machine, so the
// test cannot run elsewhere.
// NOTE(review): `fastseq` is not among the imports visible in this file —
// confirm which package provides ParseOBIHeader.
func TestParseOBIFasta(t *testing.T) {
f := "/Users/coissac/travail/Adeline/Soumission_data/Zonation/euka03/euka03.ecotag.fasta.gz"
// The explicit zero assignments below repeat the zero values given by `var`.
var nseq, nread int
nseq = 0
nread = 0
fs := obiformats.ReaderFromIlluminaFile(f)
fmt.Println(f)
// Each value received from the range is itself ranged over, one sequence at a time.
for i := range obiannot.ExtractHeaderChannel(fs, fastseq.ParseOBIHeader) {
for _, s := range i {
nseq++
nread += s.Count()
}
}
fmt.Println(nseq, nread)
}
// ExtractHeaderChannel is an unimplemented placeholder: calling it always
// panics with "unimplemented".
//
// NOTE(review): the tests in this file call obiannot.ExtractHeaderChannel, not
// this function; it looks like a leftover editor-generated stub — confirm it
// can be removed.
func ExtractHeaderChannel(fs fastseq.IFastSeq, sequence func(sequence obiseq.Sequence)) {
panic("unimplemented")
}
// BenchmarkParseOBIFasta reads a sequence file once, iterates over the batches
// produced by obiannot.ExtractHeaderChannel and prints the number of sequences
// (nseq) and the sum of their Count() values (nread).
//
// NOTE(review): the body never loops on the benchmark iteration count, so the
// timing reported by `go test -bench` covers a single pass only.
// NOTE(review): the input is an absolute path on a developer machine, and
// `fastseq` is not among the imports visible in this file.
func BenchmarkParseOBIFasta(t *testing.B) {
f := "/Users/coissac/travail/Adeline/Soumission_data/Zonation/euka03/euka03.ecotag.fasta.gz"
// The explicit zero assignments below repeat the zero values given by `var`.
var nseq, nread int
nseq = 0
nread = 0
fs := fastseq.ReaderFromIlluminaFile(f)
fmt.Println(f)
// Each value received from the range is itself ranged over, one sequence at a time.
for i := range obiannot.ExtractHeaderChannel(fs, fastseq.ParseOBIHeader) {
for _, s := range i {
nseq++
nread += s.Count()
}
}
fmt.Println(nseq, nread)
}

1226
go.sum Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,18 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
)
// main builds an option parser from obiconvert.OptionSet, parses os.Args,
// reads the sequences designated by the remaining arguments and passes them
// to obiconvert.WriteBioSequences.
//
// NOTE(review): the discarded return values are presumably errors — confirm
// and handle them.
func main() {
	parseOptions := obioptions.GenerateOptionParser(obiconvert.OptionSet)
	_, files, _ := parseOptions(os.Args)

	sequences, _ := obiconvert.ReadBioSequences(files...)
	obiconvert.WriteBioSequences(sequences)
}

65
obitools/obicount/main.go Normal file
View File

@ -0,0 +1,65 @@
package main
import (
"fmt"
"log"
"os"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obicount"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
)
// main counts the sequences read from the files given on the command line and
// prints, on a single line, the counters selected through the obicount
// options: number of variants (records), number of reads (sum of Count()) and
// number of symbols (sum of Length()).
//
// An execution trace is always written to "cpu.trace" in the working
// directory (inspect it with `go tool trace cpu.trace`).
func main() {

	// CPU profiling alternative, kept disabled:
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	ftrace, err := os.Create("cpu.trace")
	if err != nil {
		log.Fatal(err)
	}
	// Deferred calls run in LIFO order: the trace is stopped (and flushed)
	// before the file is closed.
	defer ftrace.Close()

	// trace.Start fails when tracing is already enabled; do not ignore it.
	if err := trace.Start(ftrace); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	option_parser := obioptions.GenerateOptionParser(
		obiconvert.InputOptionSet,
		obicount.OptionSet,
	)

	_, args, _ := option_parser(os.Args)

	fs, _ := obiconvert.ReadBioSequences(args...)

	nread := 0
	nvariant := 0
	nsymbol := 0
	for fs.Next() {
		s := fs.Get()
		if s.IsNil() {
			log.Panicln("Read sequence is nil")
		}
		nread += s.Count()
		nvariant++
		nsymbol += s.Length()
	}

	if obicount.IsPrintingVariantCount() {
		fmt.Printf(" %d", nvariant)
	}
	if obicount.IsPrintingReadCount() {
		fmt.Printf(" %d", nread)
	}
	if obicount.IsPrintingSymbolCount() {
		fmt.Printf(" %d", nsymbol)
	}
	fmt.Printf("\n")
}

68
obitools/obifind/main.go Normal file
View File

@ -0,0 +1,68 @@
package main
import (
"fmt"
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obifind"
)
// main queries the selected taxonomy in one of three modes:
//   - when obifind.RequestsPathForTaxid() is >= 0, it writes the path of that
//     taxon;
//   - when no positional argument is given, it writes every taxon of the
//     taxonomy that passes the restrictions;
//   - otherwise each positional argument is used as a name pattern and the
//     matching, restricted taxa are written.
//
// Any error aborts the program: continuing after a failed step would only
// dereference values that were never initialised.
func main() {
	option_parser := obioptions.GenerateOptionParser(obifind.OptionSet)

	_, args, _ := option_parser(os.Args)

	//prof, _ := os.Create("obifind.prof")
	//pprof.StartCPUProfile(prof)

	// fail reports err on standard error and terminates with a non-zero
	// status.
	fail := func(err interface{}) {
		fmt.Fprintf(os.Stderr, "%+v\n", err)
		os.Exit(1)
	}

	restrictions, err := obifind.ITaxonRestrictions()
	if err != nil {
		fail(err)
	}

	switch {
	case obifind.RequestsPathForTaxid() >= 0:
		taxonomy, err := obifind.LoadSelectedTaxonomy()
		if err != nil {
			fail(err)
		}

		taxon, err := taxonomy.Taxon(obifind.RequestsPathForTaxid())
		if err != nil {
			fail(err)
		}

		s, err := taxon.Path()
		if err != nil {
			fail(err)
		}

		obifind.TaxonWriter(s.Iterator(),
			fmt.Sprintf("path:%d", taxon.Taxid()))

	case len(args) == 0:
		taxonomy, err := obifind.LoadSelectedTaxonomy()
		if err != nil {
			fail(err)
		}

		obifind.TaxonWriter(restrictions(taxonomy.Iterator()), "")

	default:
		matcher, err := obifind.ITaxonNameMatcher()
		if err != nil {
			fail(err)
		}

		for _, pattern := range args {
			s := restrictions(matcher(pattern))
			obifind.TaxonWriter(s, pattern)
		}
	}

	//pprof.StopCPUProfile()
}

View File

@ -0,0 +1,38 @@
package main
import (
"log"
"os"
"runtime/pprof"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obipairing"
)
// main assembles paired-end reads: it parses the obipairing options, obtains
// the batches of read pairs, assembles them with
// obipairing.IAssemblePESequencesBatch and writes the result as FASTQ on
// standard output.
//
// A CPU profile is always written to "cpu.pprof" in the working directory:
//
//	go tool pprof -http=":8000" ./obipairing ./cpu.pprof
func main() {

	f, err := os.Create("cpu.pprof")
	if err != nil {
		log.Fatal(err)
	}
	// Deferred calls run in LIFO order: profiling is stopped (and flushed)
	// before the file is closed.
	defer f.Close()

	// StartCPUProfile fails when profiling is already enabled; do not ignore it.
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()

	// Execution trace alternative, kept disabled:
	// go tool trace cpu.trace
	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	option_parser := obioptions.GenerateOptionParser(obipairing.OptionSet)

	option_parser(os.Args)

	pairs, _ := obipairing.IBatchPairedSequence()
	// NOTE(review): the meaning of the literal arguments (2, 50, 20, true) is
	// defined by IAssemblePESequencesBatch, which is not visible here.
	paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true)
	written, _ := obiformats.WriteFastqBatchToStdout(paired)
	written.Destroy()
}

34
obitools/obipcr/main.go Normal file
View File

@ -0,0 +1,34 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obipcr"
)
// main parses the obipcr options, reads the sequences designated by the
// remaining arguments in batches, runs obipcr.PCR on them and writes the
// resulting amplicons with obiconvert.WriteBioSequences.
//
// NOTE(review): the discarded return values are presumably errors — confirm
// and handle them.
func main() {

	// Profiling hooks, kept disabled:
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	parseOptions := obioptions.GenerateOptionParser(obipcr.OptionSet)
	_, files, _ := parseOptions(os.Args)

	templates, _ := obiconvert.ReadBioSequencesBatch(files...)
	products, _ := obipcr.PCR(templates)
	obiconvert.WriteBioSequences(products)
}

19
pkg/cutils/byteslice.go Normal file
View File

@ -0,0 +1,19 @@
package cutils
import "C"
import (
"reflect"
"unsafe"
)
// ByteSlice returns a []byte of length and capacity size whose backing storage
// is the memory starting at pointer. No data is copied: writes through the
// slice modify the pointed-to memory, and the slice must not outlive it.
//
// A nil pointer or a non-positive size yields a nil slice instead of a slice
// header that would fault (nil data with a positive length) or be corrupt
// (negative length).
func ByteSlice(pointer unsafe.Pointer, size int) []byte {
	if pointer == nil || size <= 0 {
		return nil
	}

	var s []byte
	// Fill in the header of an actual slice variable rather than building a
	// SliceHeader from scratch.
	h := (*reflect.SliceHeader)(unsafe.Pointer(&s))
	h.Data = uintptr(pointer)
	h.Cap = size
	h.Len = size

	return s
}

54
pkg/goutils/goutils.go Normal file
View File

@ -0,0 +1,54 @@
package goutils
import (
"bytes"
"encoding/gob"
)
// NotAnInteger is the error returned by InterfaceToInt when the dynamic type
// of its argument is not one of the supported numeric types.
type NotAnInteger struct {
	message string
}

// Error returns the message carried by the error.
func (m *NotAnInteger) Error() string {
	return m.message
}

// InterfaceToInt converts the numeric value stored in i to an int.
//
// Every signed and unsigned integer width (including uint) and both
// floating-point types are accepted. The conversion is a plain Go conversion:
// floats are truncated toward zero and values that do not fit in an int are
// not detected. For any other dynamic type val is 0 and err is a
// *NotAnInteger.
func InterfaceToInt(i interface{}) (val int, err error) {
	switch t := i.(type) {
	case int:
		val = t
	case int8:
		val = int(t)
	case int16:
		val = int(t)
	case int32:
		val = int(t)
	case int64:
		val = int(t)
	case uint:
		val = int(t)
	case uint8:
		val = int(t)
	case uint16:
		val = int(t)
	case uint32:
		val = int(t)
	case uint64:
		val = int(t)
	case float32:
		val = int(t)
	case float64:
		val = int(t)
	default:
		err = &NotAnInteger{"count attribute is not an integer"}
	}
	return val, err
}
// CopyMap deep-copies the entries of src into dest by encoding src with
// encoding/gob and decoding the result into dest.
//
// dest must be a non-nil map: it is passed by value, so a map allocated by the
// decoder for a nil dest would not be visible to the caller. Values stored in
// src must be of types known to gob (basic types are registered by default,
// other concrete types need gob.Register).
//
// The encoding or decoding error, if any, is returned instead of being
// silently dropped; on an encoding error dest is left untouched.
func CopyMap(dest, src map[string]interface{}) error {
	buf := new(bytes.Buffer)
	if err := gob.NewEncoder(buf).Encode(src); err != nil {
		return err
	}
	return gob.NewDecoder(buf).Decode(&dest)
}

167
pkg/obialign/alignment.go Normal file
View File

@ -0,0 +1,167 @@
package obialign
import (
"math"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __build_align_arena__ holds the two byte buffers reused from one alignment
// reconstruction to the next.
type __build_align_arena__ struct {
	bufferA []byte
	bufferB []byte
}

// BuildAlignArena is a handle on a set of reusable buffers. Copies of a
// handle share the same buffers.
type BuildAlignArena struct {
	pointer *__build_align_arena__
}

// NilBuildAlignArena is the handle holding no buffers; functions receiving it
// allocate their own.
var NilBuildAlignArena = BuildAlignArena{pointer: nil}

// MakeBuildAlignArena allocates an arena whose two buffers each have a length
// of lseqA+lseqB bytes.
func MakeBuildAlignArena(lseqA, lseqB int) BuildAlignArena {
	size := lseqA + lseqB
	return BuildAlignArena{
		pointer: &__build_align_arena__{
			bufferA: make([]byte, size),
			bufferB: make([]byte, size),
		},
	}
}
// __build_alignment__ expands an alignment path into two gapped byte strings
// of equal length.
//
// path is read as consecutive pairs (indel, run):
//   - indel < 0 copies -indel bytes of seqA and inserts as many gap bytes in
//     the second string;
//   - indel > 0 copies indel bytes of seqB and inserts as many gap bytes in
//     the first string;
//   - run > 0 then copies run bytes from both sequences.
//
// A path of odd length causes an out-of-range panic.
//
// The results are built in *bufferA and *bufferB, which are truncated first
// and updated in place; a nil pointer makes the function allocate a buffer of
// capacity len(seqA)+len(seqB). The returned slices alias those buffers.
func __build_alignment__(seqA, seqB []byte, path []int, gap byte,
	bufferA, bufferB *[]byte) ([]byte, []byte) {

	if bufferA == nil {
		scratch := make([]byte, 0, len(seqA)+len(seqB))
		bufferA = &scratch
	}
	if bufferB == nil {
		scratch := make([]byte, 0, len(seqA)+len(seqB))
		bufferB = &scratch
	}

	*bufferA = (*bufferA)[:0]
	*bufferB = (*bufferB)[:0]

	curA, curB := 0, 0

	for k := 0; k < len(path); k += 2 {
		// First element of the pair: an insertion or deletion.
		switch indel := path[k]; {
		case indel < 0:
			n := -indel
			*bufferA = append(*bufferA, seqA[curA:curA+n]...)
			for r := n; r > 0; r-- {
				*bufferB = append(*bufferB, gap)
			}
			curA += n
		case indel > 0:
			*bufferB = append(*bufferB, seqB[curB:curB+indel]...)
			for r := indel; r > 0; r-- {
				*bufferA = append(*bufferA, gap)
			}
			curB += indel
		}

		// Second element of the pair: a run of aligned positions.
		if run := path[k+1]; run > 0 {
			*bufferA = append(*bufferA, seqA[curA:curA+run]...)
			*bufferB = append(*bufferB, seqB[curB:curB+run]...)
			curA += run
			curB += run
		}
	}

	return *bufferA, *bufferB
}
// BuildAlignment returns the gapped versions of seqA and seqB described by
// path, using gap as the gap symbol. Each returned sequence keeps the
// identifier and the definition of the sequence it derives from.
//
// When arena is NilBuildAlignArena a temporary arena is allocated. The gapped
// byte strings handed to obiseq.MakeBioSequence are the arena buffers.
// NOTE(review): whether MakeBioSequence copies its input is not visible here;
// if it does not, the results are only valid until the arena is reused.
func BuildAlignment(seqA, seqB obiseq.BioSequence,
	path []int, gap byte, arena BuildAlignArena) (obiseq.BioSequence, obiseq.BioSequence) {

	if arena.pointer == nil {
		arena = MakeBuildAlignArena(seqA.Length(), seqB.Length())
	}
	buffers := arena.pointer

	alignedA, alignedB := __build_alignment__(
		seqA.Sequence(), seqB.Sequence(),
		path, gap,
		&buffers.bufferA, &buffers.bufferB)

	return obiseq.MakeBioSequence(seqA.Id(), alignedA, seqA.Definition()),
		obiseq.MakeBioSequence(seqB.Id(), alignedB, seqB.Definition())
}
// BuildQualityConsensus merges seqA and seqB, aligned according to path, into
// a single sequence carrying qualities, and returns it together with the
// number of aligned positions where both sequences have a non-zero quality and
// the same symbol.
//
// For every alignment column:
//   - the symbol with the higher quality is kept; on equal qualities the
//     symbol is the IUPAC code combining both symbols;
//   - the quality is the sum of both qualities, except when both are non-zero
//     and the symbols differ, where it is derived from the higher (qM) and
//     lower (qm) quality;
//   - the quality is capped at 90.
//
// Gap columns carry the symbol ' ' and a quality of 0. arena1 is used for the
// symbols and arena2 for the qualities; a nil arena is replaced by a temporary
// one. The consensus takes the identifier and definition of seqA.
func BuildQualityConsensus(seqA, seqB obiseq.BioSequence, path []int,
	arena1, arena2 BuildAlignArena) (obiseq.BioSequence, int) {

	if arena1.pointer == nil {
		arena1 = MakeBuildAlignArena(seqA.Length(), seqB.Length())
	}
	if arena2.pointer == nil {
		arena2 = MakeBuildAlignArena(seqA.Length(), seqB.Length())
	}

	sA, sB := __build_alignment__(seqA.Sequence(), seqB.Sequence(), path, ' ',
		&arena1.pointer.bufferA,
		&arena1.pointer.bufferB)

	qsA, qsB := __build_alignment__(seqA.Qualities(), seqB.Qualities(), path, byte(0),
		&arena2.pointer.bufferA,
		&arena2.pointer.bufferB)

	consensus := make([]byte, 0, len(sA))
	qualities := make([]byte, 0, len(sA))

	var qA, qB byte
	var qM, qm byte
	var i int

	match := 0

	for i, qA = range qsA {
		qB = qsB[i]

		switch {
		case qA > qB:
			consensus = append(consensus, sA[i])
			qM = qA
			qm = qB
		case qB > qA:
			consensus = append(consensus, sB[i])
			qM = qB
			qm = qA
		default:
			nuc := __four_bits_base_code__[sA[i]&31] | __four_bits_base_code__[sB[i]&31]
			consensus = append(consensus, __four_bits_base_decode__[nuc])
			// Both qualities are equal: use them for this column instead of
			// whatever an earlier column left in qM and qm.
			qM = qA
			qm = qB
		}

		q := qA + qB

		if qA > 0 && qB > 0 {
			if sA[i] != sB[i] {
				// NOTE(review): the logarithm is negative, so the value
				// converted to byte can be out of range; the Go spec leaves
				// the result of such a conversion implementation-dependent.
				// Confirm the intended formula.
				q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5)
			}
			if sA[i] == sB[i] {
				match++
			}
		}

		if q > 90 {
			q = 90
		}
		qualities = append(qualities, q)
	}

	seq := obiseq.MakeBioSequence(seqA.Id(), consensus, seqA.Definition())
	seq.SetQualities(qualities)

	return seq, match
}

View File

@ -0,0 +1,95 @@
package obialign
// __backtracking__ walks path_matrix from the cell (lseqA-1, lseqB-1) back to
// the origin and returns the alignment path as consecutive pairs
// (indel, run): indel is negative for moves along seqA, positive for moves
// along seqB and 0 when the pair only records a run of diagonal moves; run is
// the number of diagonal moves following the indel.
//
// The path is written from the end of *path toward its beginning. *path is
// reallocated when its capacity is below 2*(lseqA+lseqB), and is finally
// re-sliced to the written part, which is also the returned value. Because
// that re-slicing starts past the beginning of the buffer, the capacity kept
// in *path is only that of the returned slice.
func __backtracking__(path_matrix []int, lseqA, lseqB int, path *[]int) []int {

	if required := (lseqA + lseqB) * 2; required > cap(*path) {
		*path = make([]int, 0, required)
	}

	*path = (*path)[:cap(*path)]
	cursor := cap(*path)

	// emit prepends one (gap, diag) pair to the part of the path already
	// written.
	emit := func(diag, gap int) {
		cursor--
		(*path)[cursor] = diag
		cursor--
		(*path)[cursor] = gap
	}

	row := lseqA - 1
	col := lseqB - 1

	diagRun, upRun, leftRun := 0, 0, 0

	for row > -1 || col > -1 {
		step := __get_matrix__(&path_matrix, lseqA, row, col)
		// log.Printf("I: %d J:%d -> %d\n", row, col, step)

		switch {
		case step == 0:
			// A diagonal move closes any pending indel.
			if leftRun != 0 {
				emit(diagRun, leftRun)
				leftRun, diagRun = 0, 0
			}
			if upRun != 0 {
				emit(diagRun, upRun)
				upRun, diagRun = 0, 0
			}
			diagRun++
			row--
			col--

		case step > 0:
			if upRun != 0 {
				emit(diagRun, upRun)
				upRun, diagRun = 0, 0
			}
			leftRun += step
			col -= step

		case step < 0:
			if leftRun != 0 {
				emit(diagRun, leftRun)
				leftRun, diagRun = 0, 0
			}
			upRun += step
			row += step
		}
	}

	// Flush what is still pending once the origin is reached.
	if leftRun != 0 {
		emit(diagRun, leftRun)
		diagRun = 0
	}
	if upRun != 0 {
		emit(diagRun, upRun)
		diagRun = 0
	}
	if diagRun != 0 {
		emit(diagRun, 0)
	}

	*path = (*path)[cursor:cap(*path)]

	return *path
}

100
pkg/obialign/dnamatrix.go Normal file
View File

@ -0,0 +1,100 @@
package obialign
import (
"math"
)
// __four_bits_count__ gives, for every 4-bit value used as index, the number
// of bits set in that value (shown in binary next to each entry). The counts
// are stored as float64 because __match_ratio__ divides them.
var __four_bits_count__ = []float64{
0, // 0000
1, // 0001
1, // 0010
2, // 0011
1, // 0100
2, // 0101
2, // 0110
3, // 0111
1, // 1000
2, // 1001
2, // 1010
3, // 1011
2, // 1100
3, // 1101
3, // 1110
4, // 1111
}
// __initialized_dna_score__ is set to true by InitDNAScoreMatrix once the
// three tables below have been filled.
var __initialized_dna_score__ = false
// __nuc_part_match__[x][y] is the value of __match_ratio__ for the 4-bit
// codes found at indices x and y of __four_bits_base_code__.
var __nuc_part_match__ [32][32]float64
// __nuc_score_part_match_match__[qa][qb] and
// __nuc_score_part_match_mismatch__[qa][qb] hold the two integer scores
// derived from __match_score_ratio__ for the values qa and qb (0 to 99).
var __nuc_score_part_match_match__ [100][100]int
var __nuc_score_part_match_mismatch__ [100][100]int
// __match_ratio__ compares two 4-bit codes (only the four low bits of a and b
// are used): it returns the number of bits set in both codes divided by the
// number of bits set in a and by the number of bits set in b, or 0 when any of
// these three counts is zero.
func __match_ratio__(a, b byte) float64 {
	shared := __four_bits_count__[a&b&15]
	inA := __four_bits_count__[a&15]
	inB := __four_bits_count__[b&15]

	if shared != 0 && inA != 0 && inB != 0 {
		return shared / inA / inB
	}

	return 0
}
// __logaddexp__ returns log(exp(a) + exp(b)). The larger argument is factored
// out so that the exponential is only taken of a non-positive number.
func __logaddexp__(a, b float64) float64 {
	small, large := a, b
	if a > b {
		small, large = b, a
	}

	return large + math.Log1p(math.Exp(small-large))
}
// __match_score_ratio__ computes two natural-log values from a and b.
//
// With E(q) = 10^(-q/10) / 4 and O(q) = 1 - 3*E(q), and writing E1, O1 for a
// and E2, O2 for b, it returns
//
//	MM = ln(4 * (O1*O2 + 3*E1*E2))
//	Mm = ln(4 * (O1*E2 + O2*E1 + 2*E1*E2))
//
// All intermediate values are kept in log space.
// NOTE(review): a and b are presumably Phred quality scores, and MM / Mm the
// log-ratios used to score a match / a mismatch — confirm.
func __match_score_ratio__(a, b byte) (float64, float64) {
l2 := math.Log(2)
l3 := math.Log(3)
l4 := math.Log(4)
l10 := math.Log(10)
// lE = ln(E(q))
lE1 := -float64(a)/10*l10 - l4
lE2 := -float64(b)/10*l10 - l4
// lO = ln(1 - 3*E(q))
lO1 := math.Log1p(-math.Exp(lE1 + l3))
lO2 := math.Log1p(-math.Exp(lE2 + l3))
// Logs of the pairwise products.
lO1O2 := lO1 + lO2
lE1E2 := lE1 + lE2
lO1E2 := lO1 + lE2
lO2E1 := lO2 + lE1
// The sums are taken in log space; adding l4 multiplies them by 4.
MM := __logaddexp__(lO1O2, lE1E2+l3) + l4
Mm := __logaddexp__(__logaddexp__(lO1E2, lO2E1), lE1E2+l2) + l4
return MM, Mm
}
// __init_nuc_part_match__ fills __nuc_part_match__ with the value of
// __match_ratio__ for every ordered pair of entries of
// __four_bits_base_code__; rows and columns use the indices of that table.
func __init_nuc_part_match__() {
	for row := range __four_bits_base_code__ {
		codeA := __four_bits_base_code__[row]
		for col := range __four_bits_base_code__ {
			codeB := __four_bits_base_code__[col]
			__nuc_part_match__[row][col] = __match_ratio__(codeA, codeB)
		}
	}
}
func __init_nuc_score_part_match__() {
for i := 0; i < 100; i++ {
for j := 0; j < 100; j++ {
MM, Mm := __match_score_ratio__(byte(i), byte(j))
__nuc_score_part_match_match__[i][j] = int(MM*10 + 0.5)
__nuc_score_part_match_mismatch__[i][j] = int(Mm*10 + 0.5)
}
}
}
// InitDNAScoreMatrix fills the nucleotide matching and scoring tables the
// first time it is called; later calls return immediately. The guard is a
// plain boolean with no synchronization.
func InitDNAScoreMatrix() {
	if __initialized_dna_score__ {
		return
	}

	__init_nuc_part_match__()
	__init_nuc_score_part_match__()
	__initialized_dna_score__ = true
}

View File

@ -0,0 +1,74 @@
package obialign
import (
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __four_bits_base_code__ maps a nucleotide symbol to a 4-bit set of bases
// (A = 0001, C = 0010, G = 0100, T/U = 1000); ambiguity codes are the union of
// the bases they stand for. The table has 32 entries and is indexed by the
// five low bits of the symbol (`symbol & 31`), so index 1 is 'A'/'a' and index
// 26 is 'Z'/'z'. Index 0, indices 27 to 31 and the letters that are not
// nucleotide codes map to 0.
var __four_bits_base_code__ = []byte{0b0000,
// IUPAC nucleotide code Base
0b0001, // A Adenine
0b1110, // B C or G or T
0b0010, // C Cytosine
0b1101, // D A or G or T
0b0000, // E not a nucleotide
0b0000, // F not a nucleotide
0b0100, // G Guanine
0b1011, // H A or C or T
0b0000, // I not a nucleotide
0b0000, // J not a nucleotide
0b1100, // K G or T
0b0000, // L not a nucleotide
0b0011, // M A or C
0b1111, // N any base
0b0000, // O not a nucleotide
0b0000, // P not a nucleotide
0b0000, // Q not a nucleotide
0b0101, // R A or G
0b0110, // S G or C
0b1000, // T Thymine
0b1000, // U Uracil
0b0111, // V A or C or G
0b1001, // W A or T
0b0000, // X not a nucleotide
0b1010, // Y C or T
0b0000, // Z not a nucleotide
0b0000,
0b0000,
0b0000,
0b0000,
0b0000}
// __four_bits_base_decode__ is the reverse of __four_bits_base_code__: it maps
// each of the 16 possible 4-bit sets of bases to the lower-case IUPAC symbol
// standing for that set. The empty set (0) maps to '.'.
var __four_bits_base_decode__ = []byte{
// 0b0000 0b0001 0b0010 0b0011
'.', 'a', 'c', 'm',
// 0b0100 0b0101 0b0110 0b0111
'g', 'r', 's', 'v',
// 0b1000 0b1001 0b1010 0b1011
't', 'w', 'y', 'h',
// 0b1100 0b1101 0b1110 0b1111
'k', 'd', 'b', 'n',
}
// Encode4bits converts every symbol of seq into its 4-bit base code and
// returns the codes, one byte per symbol.
//
// The symbols '.' and '-' are encoded as 0; any other symbol is looked up in
// __four_bits_base_code__ through its five low bits. A non-nil buffer is
// truncated and reused as storage for the result (it grows when too small); a
// nil buffer makes the function allocate one with the capacity of the sequence
// length.
func Encode4bits(seq obiseq.BioSequence, buffer []byte) []byte {
	size := seq.Length()
	symbols := seq.Sequence()

	var encoded []byte
	if buffer != nil {
		encoded = buffer[:0]
	} else {
		encoded = make([]byte, 0, size)
	}

	for _, symbol := range symbols {
		switch symbol {
		case '.', '-':
			encoded = append(encoded, 0)
		default:
			encoded = append(encoded, __four_bits_base_code__[symbol&31])
		}
	}

	return encoded
}

View File

@ -0,0 +1,366 @@
package obialign
import (
"log"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obikmer"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __pe_align_arena__ groups the working memory reused between paired-end
// alignments: the score and path matrices, the resulting path and the buffers
// handed to the k-mer based shift detection.
type __pe_align_arena__ struct {
	score_matrix []int
	path_matrix  []int
	path         []int
	fast_index   [][]int
	fast_buffer  []byte
}

// PEAlignArena is a handle on the working memory of the paired-end aligner.
// Copies of a handle share the same memory.
type PEAlignArena struct {
	pointer *__pe_align_arena__
}

// NilPEAlignArena is the handle holding no working memory.
var NilPEAlignArena = PEAlignArena{pointer: nil}

// MakePEAlignArena allocates working memory sized for sequences of lengths
// lseqA and lseqB: empty matrices with a capacity of (lseqA+1)*(lseqB+1)
// cells, a path of 2*(lseqA+lseqB) elements, a 256-entry index and an empty
// byte buffer with a capacity of lseqA.
func MakePEAlignArena(lseqA, lseqB int) PEAlignArena {
	cells := (lseqA + 1) * (lseqB + 1)

	return PEAlignArena{
		pointer: &__pe_align_arena__{
			score_matrix: make([]int, 0, cells),
			path_matrix:  make([]int, 0, cells),
			path:         make([]int, 2*(lseqA+lseqB)),
			fast_index:   make([][]int, 256),
			fast_buffer:  make([]byte, 0, lseqA),
		},
	}
}
// __set_matrices__ stores valueA in matrixA and valueB in matrixB at the cell
// (a, b). Both matrices are flat slices with rows of lenA+1 cells, and the
// coordinates start at -1: the cell (-1, -1) is element 0.
func __set_matrices__(matrixA, matrixB *[]int, lenA, a, b, valueA, valueB int) {
	stride := lenA + 1
	cell := (b+1)*stride + (a + 1)

	(*matrixA)[cell] = valueA
	(*matrixB)[cell] = valueB
}
// __get_matrix__ returns the value stored at the cell (a, b) of a flat matrix
// with rows of lenA+1 cells. The coordinates start at -1: the cell (-1, -1)
// is element 0.
func __get_matrix__(matrix *[]int, lenA, a, b int) int {
	stride := lenA + 1
	cell := (b+1)*stride + (a + 1)

	return (*matrix)[cell]
}
// __get_matrix_from__ returns the three neighbours from which the cell (a, b)
// of a flat matrix (rows of lenA+1 cells, coordinates starting at -1) can be
// reached, in this order: the cell (a, b-1), the cell (a-1, b-1) and the cell
// (a-1, b). Callers name them left, diag and top.
func __get_matrix_from__(matrix *[]int, lenA, a, b int) (int, int, int) {
	cells := *matrix

	sameRow := (b+1)*(lenA+1) + a // cell (a-1, b)
	prevRow := sameRow - lenA     // cell (a, b-1)

	return cells[prevRow], cells[prevRow-1], cells[sameRow]
}
// __pairing_score_pe_align__ scores the pairing of two symbols given their
// qualities.
//
// The partial-match ratio of the two symbols is read from __nuc_part_match__:
// a ratio of 1 returns the match score of (qualA, qualB), a ratio of 0 the
// mismatch score, and any other ratio the mix of both scores weighted by the
// ratio, with 0.5 added before conversion to int. Both qualities must be
// below 100, the size of the score tables.
func __pairing_score_pe_align__(baseA, qualA, baseB, qualB byte) int {
	identity := __nuc_part_match__[baseA&31][baseB&31]
	// log.Printf("id : %f A : %s %d B : %s %d\n", identity, string(baseA), qualA, string(baseB), qualB)

	if identity == 1 {
		// log.Printf("match\n")
		return __nuc_score_part_match_match__[qualA][qualB]
	}

	if identity == 0 {
		return __nuc_score_part_match_mismatch__[qualA][qualB]
	}

	matchPart := identity * float64(__nuc_score_part_match_match__[qualA][qualB])
	mismatchPart := (1 - identity) * float64(__nuc_score_part_match_mismatch__[qualA][qualB])

	return int(matchPart + mismatchPart + 0.5)
}
// __fill_matrix_pe_left_align__ fills the score and path matrices for the
// alignment of seqA with seqB and returns the score stored in the last cell
// (len(seqA)-1, len(seqB)-1).
//
// The matrices are flat slices of (len(seqA)+1)*(len(seqB)+1) cells; they are
// reallocated when their capacity is too small. Each cell of the path matrix
// receives 0 for a diagonal move, +1 for a move along seqB ("left") and -1 for
// a move along seqA ("top"). A move is selected only when its score is
// strictly greater than the two others; otherwise the "top" move is recorded.
//
// Two kinds of gaps are free: those skipping the beginning of seqA (border
// cells (i, -1) have a score of 0) and "left" moves taken at the last position
// of seqA.
//
// seqA must not be empty: seqA[la1] is read with la1 = len(seqA)-1 for every
// position of seqB.
func __fill_matrix_pe_left_align__(seqA, qualA, seqB, qualB []byte, gap int,
score_matrix, path_matrix *[]int) int {
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gap = gap * __nuc_score_part_match_mismatch__[40][40]
needed := (la + 1) * (lb + 1)
if needed > cap(*score_matrix) {
*score_matrix = make([]int, needed)
}
if needed > cap(*path_matrix) {
*path_matrix = make([]int, needed)
}
*score_matrix = (*score_matrix)[:needed]
*path_matrix = (*path_matrix)[:needed]
// Origin cell.
__set_matrices__(score_matrix, path_matrix, la, -1, -1, 0, 0)
// Fills the border cells (i, -1) with score 0: skipping the beginning of
// seqA is free
for i := 0; i < la; i++ {
__set_matrices__(score_matrix, path_matrix, la, i, -1, 0, -1)
}
la1 := la - 1
for j := 0; j < lb; j++ {
// Border cell (-1, j): skipping the beginning of seqB costs one gap per position.
__set_matrices__(score_matrix, path_matrix, la, -1, j, (j+1)*gap, 1)
for i := 0; i < la1; i++ {
left, diag, top := __get_matrix_from__(score_matrix, la, i, j)
diag += __pairing_score_pe_align__(seqA[i], qualA[i], seqB[j], qualB[j])
left += gap
top += gap
switch {
case diag > left && diag > top:
__set_matrices__(score_matrix, path_matrix, la, i, j, diag, 0)
case left > diag && left > top:
__set_matrices__(score_matrix, path_matrix, la, i, j, left, +1)
default:
__set_matrices__(score_matrix, path_matrix, la, i, j, top, -1)
}
}
// Special case for the last position of seqA: "left" gaps are free
// (no gap score is added to left)
left, diag, top := __get_matrix_from__(score_matrix, la, la1, j)
diag += __pairing_score_pe_align__(seqA[la1], qualA[la1], seqB[j], qualB[j])
top += gap
switch {
case diag > left && diag > top:
__set_matrices__(score_matrix, path_matrix, la, la1, j, diag, 0)
case left > diag && left > top:
__set_matrices__(score_matrix, path_matrix, la, la1, j, left, +1)
default:
__set_matrices__(score_matrix, path_matrix, la, la1, j, top, -1)
}
}
return __get_matrix__(score_matrix, la, la1, lb-1)
}
// __fill_matrix_pe_right_align__ fills the score and path matrices for the
// alignment of seqA with seqB and returns the score stored in the last cell
// (len(seqA)-1, len(seqB)-1).
//
// The matrices are flat slices of (len(seqA)+1)*(len(seqB)+1) cells; they are
// reallocated when their capacity is too small. Each cell of the path matrix
// receives 0 for a diagonal move, +1 for a move along seqB ("left") and -1 for
// a move along seqA ("top"). A move is selected only when its score is
// strictly greater than the two others; otherwise the "top" move is recorded.
//
// Two kinds of gaps are free: those skipping the beginning of seqB (border
// cells (-1, j) have a score of 0) and "top" moves taken at the last position
// of seqB.
//
// seqB must not be empty when seqA is not: seqB[lb1] is read with
// lb1 = len(seqB)-1 for every position of seqA.
func __fill_matrix_pe_right_align__(seqA, qualA, seqB, qualB []byte, gap int,
	score_matrix, path_matrix *[]int) int {

	la := len(seqA)
	lb := len(seqB)

	// The actual gap score is the gap score times the mismatch between
	// two bases with a score of 40
	gap = gap * __nuc_score_part_match_mismatch__[40][40]

	needed := (la + 1) * (lb + 1)

	if needed > cap(*score_matrix) {
		*score_matrix = make([]int, needed)
	}
	if needed > cap(*path_matrix) {
		*path_matrix = make([]int, needed)
	}

	*score_matrix = (*score_matrix)[:needed]
	*path_matrix = (*path_matrix)[:needed]

	// Origin cell.
	__set_matrices__(score_matrix, path_matrix, la, -1, -1, 0, 0)

	// Border cells (i, -1): skipping the beginning of seqA costs one gap per
	// position.
	for i := 0; i < la; i++ {
		__set_matrices__(score_matrix, path_matrix, la, i, -1, (i+1)*gap, -1)
	}

	lb1 := lb - 1

	for j := 0; j < lb1; j++ {
		// Border cell (-1, j): skipping the beginning of seqB is free.
		__set_matrices__(score_matrix, path_matrix, la, -1, j, 0, 1)

		for i := 0; i < la; i++ {
			left, diag, top := __get_matrix_from__(score_matrix, la, i, j)

			diag += __pairing_score_pe_align__(seqA[i], qualA[i], seqB[j], qualB[j])
			left += gap
			top += gap

			switch {
			// The diagonal must beat both alternatives, as in every other
			// selection of this file (the test used to read `left > top`).
			case diag > left && diag > top:
				__set_matrices__(score_matrix, path_matrix, la, i, j, diag, 0)
			case left > diag && left > top:
				__set_matrices__(score_matrix, path_matrix, la, i, j, left, +1)
			default:
				__set_matrices__(score_matrix, path_matrix, la, i, j, top, -1)
			}
		}
	}

	// Special case for the last position of seqB: "top" gaps are free (no gap
	// score is added to top).
	__set_matrices__(score_matrix, path_matrix, la, -1, lb1, 0, 1)

	for i := 0; i < la; i++ {
		left, diag, top := __get_matrix_from__(score_matrix, la, i, lb1)

		diag += __pairing_score_pe_align__(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
		left += gap

		switch {
		case diag > left && diag > top:
			__set_matrices__(score_matrix, path_matrix, la, i, lb1, diag, 0)
		case left > diag && left > top:
			__set_matrices__(score_matrix, path_matrix, la, i, lb1, left, +1)
		default:
			__set_matrices__(score_matrix, path_matrix, la, i, lb1, top, -1)
		}
	}

	return __get_matrix__(score_matrix, la, la-1, lb1)
}
// PELeftAlign aligns seqA and seqB over their full lengths with
// __fill_matrix_pe_left_align__ and returns the alignment score together with
// the path produced by __backtracking__.
//
// The scoring tables are initialised on first use. When arena is
// NilPEAlignArena a temporary arena is allocated; otherwise the matrices and
// the path of the arena are reused, and the returned path aliases the path of
// the arena.
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {

	if !__initialized_dna_score__ {
		log.Println("Initializing the DNA Scoring matrix")
		InitDNAScoreMatrix()
	}

	if arena.pointer == nil {
		arena = MakePEAlignArena(seqA.Length(), seqB.Length())
	}
	work := arena.pointer

	score := __fill_matrix_pe_left_align__(
		seqA.Sequence(), seqA.Qualities(),
		seqB.Sequence(), seqB.Qualities(),
		gap,
		&work.score_matrix, &work.path_matrix)

	work.path = __backtracking__(work.path_matrix,
		seqA.Length(), seqB.Length(),
		&work.path)

	return score, work.path
}
// PERightAlign aligns seqA and seqB over their full lengths with
// __fill_matrix_pe_right_align__ and returns the alignment score together with
// the path produced by __backtracking__.
//
// The scoring tables are initialised on first use. When arena is
// NilPEAlignArena a temporary arena is allocated; otherwise the matrices and
// the path of the arena are reused, and the returned path aliases the path of
// the arena.
func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {

	if !__initialized_dna_score__ {
		log.Println("Initializing the DNA Scoring matrix")
		InitDNAScoreMatrix()
	}

	if arena.pointer == nil {
		arena = MakePEAlignArena(seqA.Length(), seqB.Length())
	}
	work := arena.pointer

	score := __fill_matrix_pe_right_align__(
		seqA.Sequence(), seqA.Qualities(),
		seqB.Sequence(), seqB.Qualities(),
		gap,
		&work.score_matrix, &work.path_matrix)

	work.path = __backtracking__(work.path_matrix,
		seqA.Length(), seqB.Length(),
		&work.path)

	return score, work.path
}
// PEAlign aligns seqA and seqB and returns the alignment score and path.
//
// The relative shift of the two sequences is first estimated with
// obikmer.FastShiftFourMer on a 4-mer index of seqA, and the length `over` of
// the overlap is deduced from it. Then:
//   - when fast_score+3 < over, the overlapping parts, extended by delta
//     positions, are aligned by dynamic programming (left alignment for a
//     positive shift, right alignment otherwise);
//   - otherwise the overlap is taken as an ungapped run of part_len positions
//     and the score is the sum of the match scores of the paired qualities.
//
// The path is finally completed with the unaligned parts: extra5 is added to
// its first element and extra3 is added to, or appended after, its last pair.
//
// The scoring tables are initialised on first use. When arena is
// NilPEAlignArena a temporary arena is allocated, as PELeftAlign and
// PERightAlign do; otherwise the returned path aliases the path of the arena.
//
// NOTE(review): the sub-slices taken with part_len assume that the other
// sequence is at least part_len long, and path[0] assumes a non-empty path —
// confirm these hold for every shift returned by FastShiftFourMer.
func PEAlign(seqA, seqB obiseq.BioSequence,
	gap, delta int,
	arena PEAlignArena) (int, []int) {
	var score, shift int
	var startA, startB int
	var part_len, over int
	var raw_seqA, qual_seqA []byte
	var raw_seqB, qual_seqB []byte
	var extra5, extra3 int

	if !__initialized_dna_score__ {
		log.Println("Initializing the DNA Scoring matrix")
		InitDNAScoreMatrix()
	}

	// Without this guard the nil arena is dereferenced just below.
	if arena.pointer == nil {
		arena = MakePEAlignArena(seqA.Length(), seqB.Length())
	}

	index := obikmer.Index4mer(seqA,
		&arena.pointer.fast_index,
		&arena.pointer.fast_buffer)

	shift, fast_score := obikmer.FastShiftFourMer(index, seqB, nil)

	if shift > 0 {
		over = seqA.Length() - shift
	} else {
		over = seqB.Length() + shift
	}

	if fast_score+3 < over {
		// Dynamic programming on the overlapping parts.
		if shift > 0 {
			startA = shift - delta
			if startA < 0 {
				startA = 0
			}
			extra5 = -startA
			startB = 0
			raw_seqA = seqA.Sequence()[startA:]
			qual_seqA = seqA.Qualities()[startA:]
			part_len = len(raw_seqA)
			raw_seqB = seqB.Sequence()[0:part_len]
			qual_seqB = seqB.Qualities()[0:part_len]
			extra3 = seqB.Length() - part_len
			score = __fill_matrix_pe_left_align__(
				raw_seqA, qual_seqA, raw_seqB, qual_seqB, gap,
				&arena.pointer.score_matrix,
				&arena.pointer.path_matrix)
		} else {
			startA = 0
			startB = -shift - delta
			if startB < 0 {
				startB = 0
			}
			extra5 = startB
			raw_seqB = seqB.Sequence()[startB:]
			qual_seqB = seqB.Qualities()[startB:]
			part_len = len(raw_seqB)
			raw_seqA = seqA.Sequence()[:part_len]
			qual_seqA = seqA.Qualities()[:part_len]
			extra3 = part_len - seqA.Length()

			score = __fill_matrix_pe_right_align__(
				raw_seqA, qual_seqA, raw_seqB, qual_seqB, gap,
				&arena.pointer.score_matrix,
				&arena.pointer.path_matrix)
		}

		arena.pointer.path = __backtracking__(arena.pointer.path_matrix,
			len(raw_seqA), len(raw_seqB),
			&arena.pointer.path)

	} else {
		// The overlap is taken as an ungapped run of part_len positions.
		if shift > 0 {
			startA = shift
			startB = 0
			extra5 = -startA
			qual_seqA = seqA.Qualities()[startA:]
			part_len = len(qual_seqA)
			qual_seqB = seqB.Qualities()[0:part_len]
			extra3 = seqB.Length() - part_len
		} else {
			startA = 0
			startB = -shift
			extra5 = startB
			qual_seqB = seqB.Qualities()[startB:]
			part_len = len(qual_seqB)
			extra3 = part_len - seqA.Length()
			qual_seqA = seqA.Qualities()[:part_len]
		}

		score = 0
		for i, qualA := range qual_seqA {
			qualB := qual_seqB[i]
			score += __nuc_score_part_match_match__[qualA][qualB]
		}
		arena.pointer.path = arena.pointer.path[:0]
		arena.pointer.path = append(arena.pointer.path, 0, part_len)
	}

	// Add the unaligned 5' and 3' parts to the path.
	arena.pointer.path[0] += extra5
	if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
		arena.pointer.path[len(arena.pointer.path)-2] += extra3
	} else {
		arena.pointer.path = append(arena.pointer.path, extra3, 0)
	}

	return score, arena.pointer.path
}

View File

@ -0,0 +1,14 @@
/* ----------------------------------------------- */
/* dft_pat_seq_code.h */
/* default alphabet encoding for alpha */
/* ----------------------------------------------- */
0x00000001 /* A */, 0x00000002 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000200 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00004000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00100000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x00800000 /* X */,
0x01000000 /* Y */, 0x02000000 /* Z */

View File

@ -0,0 +1,71 @@
/* ----------------------------------------------- */
/* dna_code.h */
/* alphabet encoding for dna/rna */
/* ----------------------------------------- */
/* IUPAC encoding */
/* ----------------------------------------- */
/* G/A/T/C */
/* U=T */
/* R=AG */
/* Y=CT */
/* M=AC */
/* K=GT */
/* S=CG */
/* W=AT */
/* H=ACT */
/* B=CGT */
/* V=ACG */
/* D=AGT */
/* N=ACGT */
/* X=ACGT */
/* EFIJLOPQZ not recognized */
/* ----------------------------------------- */
/* dual encoding */
/* ----------------------------------------- */
/* A=ADHMNRVW */
/* B=BCDGHKMNRSTUVWY */
/* C=BCHMNSVY */
/* D=ABDGHKMNRSTUVWY */
/* G=BDGKNRSV */
/* H=ABCDHKMNRSTUVWY */
/* K=BDGHKNRSTUVWY */
/* M=ABCDHMNRSVWY */
/* N=ABCDGHKMNRSTUVWY */
/* R=ABDGHKMNRSVW */
/* S=BCDGHKMNRSVY */
/* T=BDHKNTUWY */
/* U=BDHKNTUWY */
/* V=ABCDGHKMNRSVWY */
/* W=ABDHKMNRTUVWY */
/* X=ABCDGHKMNRSTUVWY */
/* Y=BCDHKMNSTUVWY */
/* EFIJLOPQZ not recognized */
/* ----------------------------------------------- */
#ifndef USE_DUAL
/* IUPAC */
0x00000001 /* A */, 0x00080044 /* B */, 0x00000004 /* C */,
0x00080041 /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x00000040 /* G */, 0x00080005 /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x00080040 /* K */, 0x00000000 /* L */,
0x00000005 /* M */, 0x00080045 /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x00000041 /* R */,
0x00000044 /* S */, 0x00080000 /* T */, 0x00080000 /* U */,
0x00000045 /* V */, 0x00080001 /* W */, 0x00080045 /* X */,
0x00080004 /* Y */, 0x00000000 /* Z */
#else
/* DUAL */
0x00623089 /* A */, 0x017e34ce /* B */, 0x01243086 /* C */,
0x017e34cb /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x0026244a /* G */, 0x017e348f /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x017e24ca /* K */, 0x00000000 /* L */,
0x0166308f /* M */, 0x017e34cf /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x006634cb /* R */,
0x012634ce /* S */, 0x0158248a /* T */, 0x0158248a /* U */,
0x016634cf /* V */, 0x017a348b /* W */, 0x017e34cf /* X */,
0x017c348e /* Y */, 0x00000000 /* Z */
#endif

View File

@ -0,0 +1,51 @@
/* ----------------------------------------------- */
/* prot_code.h */
/* alphabet encoding for proteins */
/* ----------------------------------------- */
/* IUPAC encoding */
/* ----------------------------------------- */
/* B=DN */
/* Z=EQ */
/* X=any - {X} */
/* JOU not recognized */
/* ----------------------------------------- */
/* dual encoding */
/* ----------------------------------------- */
/* B=BDN */
/* D=BD */
/* E=EZ */
/* N=BN */
/* Q=QZ */
/* X=any - {X} */
/* Z=EQZ */
/* JOU not recognized */
/* ----------------------------------------------- */
#ifndef USE_DUAL
/* IUPAC */
0x00000001 /* A */, 0x00002008 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00000000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */,
0x01000000 /* Y */, 0x00010010 /* Z */
#else
/* DUAL */
0x00000001 /* A */, 0x0000200a /* B */, 0x00000004 /* C */,
0x0000000a /* D */, 0x02000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002002 /* N */, 0x00000000 /* O */,
0x00008000 /* P */, 0x02010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */,
0x01000000 /* Y */, 0x02010010 /* Z */
#endif

View File

@ -0,0 +1,24 @@
# Builds the static library libapat.a from the pattern-matching C sources.
SOURCES = apat_parse.c \
          apat_search.c \
          libstki.c

SRCS=$(SOURCES)

OBJECTS= $(patsubst %.c,%.o,$(SOURCES))

LIBFILE= libapat.a
RANLIB=ranlib

# Project-wide make settings (contents not visible from this file).
include ../global.mk

# all and clean never produce a file of the same name; without .PHONY a stray
# file called "all" or "clean" would silently disable the target.
.PHONY: all clean

all: $(LIBFILE)

clean:
	rm -rf $(OBJECTS) $(LIBFILE)

# $? only re-archives the objects that are newer than the library.
$(LIBFILE): $(OBJECTS)
	ar -cr $@ $?
	$(RANLIB) $@

165
pkg/obiapat/apat.h Normal file
View File

@ -0,0 +1,165 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat.h */
/* Purpose: pattern scan */
/* History: */
/* 28/12/94 : <Gloup> ascan first version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> last some cleaning for 2020 */
/* ==================================================== */
#ifndef H_apat
#define H_apat
#include <stdio.h>
#include "libstki.h"
/* ----------------------------------------------- */
/* constants                                       */
/* ----------------------------------------------- */
#ifndef BUFSIZ
#define BUFSIZ 1024 /* io buffer size (fallback if <stdio.h> did not define it) */
#endif
#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */
#define ALPHA_LEN 26 /* alphabet length */
/* *DO NOT* modify */
#define MAX_PATTERN 1 /* max # of patterns */
/* *DO NOT* modify */
#define MAX_PAT_LEN 64 /* max pattern length */
/* *DO NOT* modify */
#define MAX_PAT_ERR 64 /* max # of errors */
/* *DO NOT* modify */
#define PATMASK 0x3ffffff /* mask for 26 symbols (bits 0..25, one per letter) */
/* *DO NOT* modify */
#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. pos (first bit above PATMASK) */
/* *DO NOT* modify */
/* mask for position */
#define ONEMASK 0x8000000000000000 /* mask for highest position (top bit of a 64-bit word) */
/* masks for Levenshtein edit: the operation is held in the two top bits */
#define OPER_IDT 0x0000000000000000 /* identity */
#define OPER_INS 0x4000000000000000 /* insertion */
#define OPER_DEL 0x8000000000000000 /* deletion */
#define OPER_SUB 0xc000000000000000 /* substitution */
#define OPER_SHFT 30 /* <unused> shift */
/* Levenshtein Opcodes */
#define SOPER_IDT 0x0 /* identity */
#define SOPER_INS 0x1 /* insertion */
#define SOPER_DEL 0x2 /* deletion */
#define SOPER_SUB 0x3 /* substitution */
/* Levenshtein Opcodes masks */
#define OPERMASK 0xc000000000000000 /* mask for Opcodes /!\ */
#define NOPERMASK 0x3fffffffffffffff /* negate of previous /!\ */
/* special chars in pattern */
#define PATCHARS "[]!#"
/* 26 letter alphabet */
/* in alphabetical order */
#define ORD_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
/* protein alphabet */
#define PROT_ALPHA "ACDEFGHIKLMNPQRSTVWY"
/* dna/rna alphabet */
#define DNA_ALPHA "ABCDGHKMNRSTUVWXY"
/* ----------------------------------------------- */
/* data structures */
/* ----------------------------------------------- */
/* machine word holding one bit per pattern position (S matrix rows, masks) */
typedef uint64_t patword_t;
/* -------------------- */
typedef enum { /* data encoding */
/* -------------------- */
alpha = 0, /* [A-Z], one distinct bit per letter */
dna, /* IUPAC DNA */
protein /* IUPAC proteins */
} CodType;
/* -------------------- */
typedef struct { /* sequence */
/* -------------------- */
char *name; /* sequence name */
int32_t seqlen; /* sequence length */
int32_t seqsiz; /* sequence buffer size */
int32_t datsiz; /* data buffer size */
int32_t circular; /* added to seqlen to bound the scanned window in the Manber* searches */
uint8_t *data; /* data buffer; each byte is used as an index into Pattern.smat */
char *cseq; /* sequence buffer */
StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. (one stack per pattern) */
StackiPtr hiterr[MAX_PATTERN]; /* stack of errors, pushed in step with hitpos */
} Seq, *SeqPtr;
/* -------------------- */
typedef struct { /* pattern */
/* -------------------- */
int32_t patlen; /* pattern length (number of pattern positions, set by EncodePattern) */
int32_t maxerr; /* max # of errors */
char *cpat; /* pattern string */
uint32_t *patcode; /* encoded pattern; EncodePattern writes into it without allocating */
patword_t *smat; /* S matrix; CreateS fills it without allocating */
patword_t omask; /* oblig. bits mask (positions flagged with '#') */
bool hasIndel; /* are indels allowed */
bool ok; /* is pattern ok */
} Pattern, *PatternPtr;
/* ----------------------------------------------- */
/* prototypes */
/* NOTE(review): several definitions in */
/* apat_parse.c / apat_search.c are written with */
/* plain int where these prototypes say int32_t; */
/* the two agree only where int32_t is int. */
/* ----------------------------------------------- */
/* apat_seq.c */
SeqPtr FreeSequence (SeqPtr pseq);
SeqPtr NewSequence (void);
int32_t ReadNextSequence (SeqPtr pseq);
int32_t WriteSequence (FILE *filou , SeqPtr pseq);
/* apat_parse.c */
uint32_t *GetCode (CodType ctype);
int32_t CheckPattern (Pattern *ppat);
int32_t EncodePattern (Pattern *ppat, CodType ctype);
int32_t ReadPattern (Pattern *ppat);
void PrintDebugPattern (Pattern *ppat);
int lenPattern (const char *pat);
/* apat_search.c */
int32_t CreateS (Pattern *ppat, int32_t lalpha);
int32_t ManberNoErr (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberSub (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberIndel (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberAll (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr ,
int32_t *reslen , int32_t *reserr);
/* apat_sys.c */
float UserCpuTime (int32_t reset);
float SysCpuTime (int32_t reset);
char *StrCpuTime (int32_t reset);
void Erreur (char *msg , int32_t stat);
int32_t AccessFile (char *path, char *mode);
#endif /* H_apat */

15
pkg/obiapat/apat_mem.h Normal file
View File

@ -0,0 +1,15 @@
#ifndef __APAT_MEM_H__
#define __APAT_MEM_H__

/* The macros below expand to these names, so the header must bring them in
 * itself instead of relying on the including file. */
#include <stdint.h>   /* uint64_t */
#include <stdlib.h>   /* malloc, realloc, free */

/* ----------------------------------------------- */
/* macros                                          */
/*                                                 */
/* NEW(typ)              one uninitialised typ     */
/* NEWN(typ, dim)        array of dim typ          */
/* REALLOC(typ,ptr,dim)  resize ptr to dim typ     */
/* FREE(ptr)             release ptr               */
/*                                                 */
/* The allocating macros yield NULL on failure.    */
/* Expansions and arguments are fully              */
/* parenthesised so they compose safely inside     */
/* larger expressions.                             */
/* ----------------------------------------------- */
#define NEW(typ)                ((typ*)malloc(sizeof(typ)))
#define NEWN(typ, dim)          ((typ*)malloc((uint64_t)(dim) * sizeof(typ)))
#define REALLOC(typ, ptr, dim)  ((typ*)realloc((void *) (ptr), (uint64_t)(dim) * sizeof(typ)))
#define FREE(ptr)               free((void *) (ptr))

#endif /* __APAT_MEM_H__ */

393
pkg/obiapat/apat_parse.c Normal file
View File

@ -0,0 +1,393 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: apat_parse.c */
/* Purpose: Codage du pattern */
/* History: */
/* 00/07/94 : <Gloup> first version (stanford) */
/* 00/11/94 : <Gloup> revised for DNA/PROTEIN */
/* 30/12/94 : <Gloup> modified EncodePattern */
/* for manber search */
/* 14/05/99 : <Gloup> indels added */
/* 07/12/21 : <Zafacs> some cleaning for 2020 */
/* ==================================================== */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "apat.h"
/* -------------------- */
/* default char */
/* encodings */
/* -------------------- */
/* Indexed by (letter - 'A'): letter number k gets bit k alone, */
/* so every letter only matches itself. */
static uint32_t sDftCode[] = {
0x00000001 /* A */, 0x00000002 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000200 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00004000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00100000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x00800000 /* X */,
0x01000000 /* Y */, 0x02000000 /* Z */
};
/* -------------------- */
/* char encodings */
/* IUPAC */
/* -------------------- */
/* IUPAC Proteins */
/* Indexed by (letter - 'A'). Ambiguity codes are the OR of their members: */
/* B = D|N, Z = E|Q, X = every letter bit except X's own (0x00800000). */
/* J, O and U are encoded as 0, i.e. they match nothing. */
static uint32_t sProtCode[] = {
0x00000001 /* A */, 0x00002008 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00000000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */,
0x01000000 /* Y */, 0x00010010 /* Z */
};
/* IUPAC Dna/Rna */
/* Indexed by (letter - 'A'). Only the A (0x1), C (0x4), G (0x40) and */
/* T (0x80000) bits are used; U is encoded like T and each ambiguity code */
/* is the OR of the bases it stands for (e.g. R = A|G = 0x41). */
/* E, F, I, J, L, O, P, Q and Z are encoded as 0, i.e. they match nothing. */
static uint32_t sDnaCode[] = {
0x00000001 /* A */, 0x00080044 /* B */, 0x00000004 /* C */,
0x00080041 /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x00000040 /* G */, 0x00080005 /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x00080040 /* K */, 0x00000000 /* L */,
0x00000005 /* M */, 0x00080045 /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x00000041 /* R */,
0x00000044 /* S */, 0x00080000 /* T */, 0x00080000 /* U */,
0x00000045 /* V */, 0x00080001 /* W */, 0x00080045 /* X */,
0x00080004 /* Y */, 0x00000000 /* Z */
};
/* -------------------------------------------- */
/* internal replacement of gets                 */
/*                                              */
/* Reads one line from stdin into buffer (at    */
/* most size bytes, terminator included) and    */
/* strips any trailing '\n' / '\r'.             */
/* Returns buffer, or NULL when fgets reports   */
/* end of file or a read error.                 */
/* -------------------------------------------- */
static char *sGets(char *buffer, int size) {

    size_t len;

    /* fgets itself keeps one byte for the terminator */
    if (! fgets(buffer, size, stdin))
        return NULL;

    /* remove trailing line feed; walking by length never forms */
    /* a pointer that lies before the start of the buffer       */
    len = strlen(buffer);
    while ((len > 0)
           && ((buffer[len - 1] == '\n') || (buffer[len - 1] == '\r')))
        buffer[--len] = '\000';

    return buffer;
}
/* -------------------------------------------- */
/* Select the letter-encoding table for ctype.  */
/* dna -> sDnaCode, protein -> sProtCode, any   */
/* other value -> sDftCode. The returned table  */
/* is a static array indexed by (letter - 'A'). */
/* -------------------------------------------- */
uint32_t *GetCode(CodType ctype)
{
    if (ctype == dna)
        return sDnaCode;

    if (ctype == protein)
        return sProtCode;

    return sDftCode;
}
/* -------------------------------------------- */
/* Syntax check of the pattern string           */
/* ppat->cpat. Returns 1 when it is well        */
/* formed, 0 otherwise. Rules enforced:         */
/*  '['  not nested, not directly followed by   */
/*       ']'                                    */
/*  ']'  must close an open '['                 */
/*  '!'  not inside brackets, not last, not     */
/*       directly followed by ']'               */
/*  '#'  not first, not inside brackets, not    */
/*       directly after '['                     */
/*  anything else must be an upper case letter  */
/* -------------------------------------------- */
#define BAD_IF(tst) if (tst) return 0

int CheckPattern(Pattern *ppat)
{
    int  lev;       /* 1 while inside [...], else 0 */
    char *pat;

    /* nothing to check is treated as an invalid pattern */
    BAD_IF ((ppat == NULL) || (ppat->cpat == NULL));

    pat = ppat->cpat;

    BAD_IF (*pat == '#');

    for (lev = 0; *pat ; pat++)

        switch (*pat) {

            case '[' :
                BAD_IF (lev);
                BAD_IF (*(pat+1) == ']');
                lev++;
                break;

            case ']' :
                lev--;
                BAD_IF (lev);
                break;

            case '!' :
                BAD_IF (lev);
                BAD_IF (! *(pat+1));
                BAD_IF (*(pat+1) == ']');
                break;

            case '#' :
                BAD_IF (lev);
                /* pat-1 is valid: a leading '#' was rejected above */
                BAD_IF (*(pat-1) == '[');
                break;

            default :
                /* <ctype.h> functions need an unsigned char value; */
                /* a negative plain char is undefined behaviour     */
                if (! isupper((unsigned char) *pat))
                    return 0;
                break;
        }

    /* an unclosed '[' leaves lev != 0 */
    return (lev ? 0 : 1);
}

#undef BAD_IF
/* -------------------------------------------- */
/* If the character after *pat is the           */
/* obligatory marker '#', return a pointer to   */
/* that '#'; otherwise return pat unchanged.    */
/* -------------------------------------------- */
static const char *skipOblig(const char *pat)
{
    /* test *pat first: when pat already sits on the terminator, */
    /* pat[1] lies outside the string and must not be read       */
    return ((*pat && (*(pat+1) == '#')) ? pat+1 : pat);
}
/* -------------------------------------------- */
/* Locate the last character of the pattern     */
/* element starting at pat.                     */
/* Leading '!' markers are stepped over; a      */
/* bracketed set ends at its first ']'; any     */
/* other element is a single character. A       */
/* directly following '#' is included.          */
/* Returns NULL when a '[' has no matching ']'. */
/* -------------------------------------------- */
static const char *splitPattern(const char *pat)
{
    const char *closing;

    /* negation markers belong to the element that follows them */
    while (*pat == '!')
        pat++;

    if (*pat != '[')
        return skipOblig(pat);

    closing = strchr(pat, ']');

    return (closing ? skipOblig(closing) : NULL);
}
/* -------------------------------------------- */
/* Compute the symbol bit set of one pattern    */
/* element.                                     */
/*  pat  : element text ("A", "[ACG]", "!A",    */
/*         "![AC]" ...); reading stops at the   */
/*         first character that is not A-Z      */
/*  code : 26-entry table indexed by            */
/*         (letter - 'A')                       */
/* A leading '!' complements the set within     */
/* PATMASK.                                     */
/* -------------------------------------------- */
static uint32_t valPattern(char *pat, uint32_t *code)
{
    uint32_t val;

    switch (*pat) {

        case '[' :
            return valPattern(pat+1, code);

        case '!' :
            val = valPattern(pat+1, code);
            return (~val & PATMASK);

        default :
            val = 0x0;
            /* explicit A-Z test: isupper() on a plain char is undefined */
            /* for negative values and, depending on the locale, may     */
            /* accept bytes that would index outside the 26-entry table  */
            while ((*pat >= 'A') && (*pat <= 'Z')) {
                val |= code[*pat - 'A'];
                pat++;
            }
            return val;
    }
}
/* -------------------------------------------- */
/* Return OBLIBIT when the element text ends    */
/* with the obligatory marker '#', else 0.      */
/* -------------------------------------------- */
static uint32_t obliBitPattern(char *pat)
{
    size_t len = strlen(pat);

    /* an empty element has no last character: do not read pat[-1] */
    if (len == 0)
        return 0x0;

    return (pat[len - 1] == '#' ? OBLIBIT : 0x0);
}
/* -------------------------------------------- */
/* Count the elements (pattern positions) of    */
/* the pattern string pat.                      */
/* Returns 0 for an empty pattern or one that   */
/* cannot be split (unclosed '[', or a '!'      */
/* with nothing after it).                      */
/* -------------------------------------------- */
int lenPattern(const char *pat)
{
    int lpat;

    lpat = 0;

    while (*pat) {

        if (! (pat = splitPattern(pat)))
            return 0;

        /* a trailing '!' leaves pat on the terminator; stepping */
        /* over it would walk off the end of the string          */
        if (! *pat)
            return 0;

        pat++;

        lpat++;
    }

    return lpat;
}
/* -------------------------------------------- */
/* Interface                                    */
/* -------------------------------------------- */

/* -------------------------------------------- */
/* Encode a pattern.                            */
/* Splits ppat->cpat into elements and stores   */
/* one code per element in ppat->patcode (the   */
/* array is written, not allocated, here).      */
/* Sets ppat->patlen and ppat->ok.              */
/* Returns the number of elements, or 0 when    */
/* lenPattern rejects the pattern.              */
/* -------------------------------------------- */
int EncodePattern(Pattern *ppat, CodType ctype)
{
    uint32_t *table = GetCode(ctype);
    char     *elem, *stop, saved;
    int      slot, count;

    ppat->ok = false;

    count = lenPattern(ppat->cpat);
    ppat->patlen = count;

    if (count <= 0)
        return 0;

    slot = 0;

    for (elem = ppat->cpat ; *elem ; elem = stop) {

        /* stop = first character after the current element */
        stop = (char *) splitPattern(elem) + 1;

        /* cut the string there for the helpers, then put it back */
        saved = *stop;
        *stop = '\000';

        ppat->patcode[slot] = valPattern(elem, table) | obliBitPattern(elem);
        slot++;

        *stop = saved;
    }

    ppat->ok = true;

    return count;
}
/* -------------------------------------------- */
/* remove blanks                                */
/*                                              */
/* Deletes every whitespace character from s,   */
/* in place, and returns s.                     */
/* -------------------------------------------- */
static char *RemBlanks(char *s)
{
    char *sb, *sc;

    for (sb = sc = s ; *sb ; sb++)
        if (! isspace((unsigned char) *sb))   /* <ctype.h> needs an unsigned char value */
            *sc++ = *sb;

    /* close the compacted string, otherwise the tail of the */
    /* original text would still follow the kept characters  */
    *sc = '\000';

    return s;
}
/* -------------------------------------------- */
/* count non blanks                             */
/*                                              */
/* Returns the number of characters of s that   */
/* are not whitespace.                          */
/* -------------------------------------------- */
static uint32_t CountAlpha(char *s)
{
    uint32_t n;

    for (n = 0 ; *s ; s++)
        /* <ctype.h> functions are undefined for negative plain char */
        if (! isspace((unsigned char) *s))
            n++;

    return n;
}
/* -------------------------------------------- */
/* Read one pattern from stdin.                 */
/* Line format:  <pattern> #mis                 */
/* Lines starting with '/' are comments; they   */
/* and blank lines are skipped.                 */
/*                                              */
/* #mis is an integer: its absolute value goes  */
/* to ppat->maxerr, a negative value also sets  */
/* ppat->hasIndel. The pattern text is copied   */
/* into a newly allocated ppat->cpat.           */
/*                                              */
/* Returns 1 on success.                        */
/* Returns 0 with ppat->ok == true when no more */
/* line could be read, and 0 with               */
/* ppat->ok == false when the line is malformed */
/* or the allocation failed.                    */
/* -------------------------------------------- */
int ReadPattern(Pattern *ppat)
{
    int  val;
    char *spac;
    char buffer[BUFSIZ];

    /* skip comments and blank lines with a loop: the former     */
    /* recursion stacked one BUFSIZ buffer per skipped line and  */
    /* could exhaust the stack on a long run of such lines       */
    do {
        ppat->ok = true;

        if (! sGets(buffer, sizeof(buffer)))
            return 0;

    } while ((*buffer == '/') || (! CountAlpha(buffer)));

    /* the pattern ends at the first space or tab */
    for (spac = buffer ; *spac ; spac++)
        if ((*spac == ' ') || (*spac == '\t'))
            break;

    ppat->ok = false;

    if (! *spac)
        return 0;

    if (sscanf(spac, "%d", &val) != 1)
        return 0;

    ppat->hasIndel = (val < 0);

    ppat->maxerr = ((val >= 0) ? val : -val);

    *spac = '\000';

    (void) RemBlanks(buffer);

    if ((ppat->cpat = NEWN(char, strlen(buffer)+1)))
        strcpy(ppat->cpat, buffer);

    ppat->ok = (ppat->cpat != NULL);

    return (ppat->ok ? 1 : 0);
}
/* -------------------------------------------- */
/* Print a pattern - Debug -                    */
/* Writes the pattern text, its length and its  */
/* encoded positions (four per line, in hex)    */
/* to stdout.                                   */
/* -------------------------------------------- */
void PrintDebugPattern(Pattern *ppat)
{
    int k = 0;

    printf("Pattern  : %s (length : %d)\n", ppat->cpat, ppat->patlen);
    printf("Encoding : \n\t");

    while (k < ppat->patlen) {
        printf("0x%8.8x ", ppat->patcode[k]);
        k++;
        /* line break after every fourth code */
        if (k % 4 == 0)
            printf("\n\t");
    }

    printf("\n");
}

337
pkg/obiapat/apat_search.c Normal file
View File

@ -0,0 +1,337 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat_search.c */
/* Purpose: recherche du pattern */
/* algorithme de Baeza-Yates/Gonnet */
/* Manber (agrep) */
/* History: */
/* 07/12/94 : <MFS> first version */
/* 28/12/94 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> last some cleaning for 2020 */
/* ==================================================== */
#include <stdlib.h>
#include <string.h>
#include "libstki.h"
#include "apat.h"
/* short local aliases for the libstki stack functions */
#define POP PopiOut
#define PUSH PushiIn
#define TOPCURS CursiToTop
#define DOWNREAD ReadiDown
/* 0 when every bit of msk is also set in x, 1 otherwise */
/* NOTE(review): x is not parenthesised in the expansion; pass simple operands only */
#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
/* smaller of x and y; each argument may be evaluated twice */
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* -------------------------------------------- */
/* Build the S matrix.                          */
/* For each of the lalpha symbols, smat[symbol] */
/* receives one bit per pattern position that   */
/* accepts the symbol; the last position maps   */
/* to bit 0, the first to bit patlen-1.         */
/* ppat->omask collects the positions carrying  */
/* OBLIBIT. ppat->smat is filled in place (it   */
/* is not allocated here). Always returns 1.    */
/* -------------------------------------------- */
int CreateS(Pattern *ppat, int32_t lalpha)
{
    patword_t *table;
    patword_t posbit, oblig;
    int32_t   sym, k, symbits, symbit;

    ppat->ok = false;

    oblig = 0x0L;
    table = ppat->smat;

    for (sym = 0 ; sym < lalpha ; sym++)
        table[sym] = 0x0;

    posbit = 0x1L;

    for (k = ppat->patlen - 1 ; k >= 0 ; k--) {

        symbits = ppat->patcode[k];

        if (ppat->patcode[k] & OBLIBIT)
            oblig |= posbit;

        /* spread this position over every symbol it accepts */
        symbit = 0x1L;
        for (sym = 0 ; sym < lalpha ; sym++) {
            if (symbits & symbit)
                table[sym] |= posbit;
            symbit <<= 1;
        }

        posbit <<= 1;
    }

    ppat->smat  = table;
    ppat->omask = oblig;
    ppat->ok    = true;

    return 1;
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* NoError                                      */
/*                                              */
/* Scans pseq->data from begin over length      */
/* symbols (clipped to seqlen + circular) for   */
/* exact occurrences of ppat.                   */
/* The hit stacks of pattern patnum are emptied */
/* first; each hit pushes its start position on */
/* hitpos and 0 on hiterr.                      */
/* Returns the number of hits.                  */
/* -------------------------------------------- */
int32_t ManberNoErr(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    int32_t   pos;
    patword_t smask, r;
    uint8_t   *data;
    StackiPtr *stkpos, *stkerr;
    int32_t   end;

    end = begin + length;
    end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);

    /* init. scan */
    data   = pseq->data + begin;
    stkpos = pseq->hitpos + patnum;
    EmptyStacki(stkpos[0]);
    stkerr = pseq->hiterr + patnum;
    EmptyStacki(stkerr[0]);

    /* the state needs patlen + 1 bits of a patword_t; a longer */
    /* pattern cannot be represented (and the shift below would */
    /* be undefined), so it simply yields no hit                */
    if ((ppat->patlen < 0)
        || (ppat->patlen >= (int32_t)(8 * sizeof(patword_t))))
        return 0;

    /* create local masks */
    /* shift in patword_t, not long: long may be only 32 bits wide */
    smask = r = (patword_t) 0x1 << ppat->patlen;

    /* loop on text data */

    for (pos = begin ; pos < end ; pos++) {

        r = (r >> 1) & ppat->smat[*data++];

        if (r & 0x1L) {
            PUSH(stkpos, pos - ppat->patlen + 1);
            PUSH(stkerr, 0);
        }

        /* re-arm the start-of-pattern bit for the next symbol */
        r |= smask;
    }

    return (*stkpos)->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* Substitution only                            */
/*                                              */
/* Note : r array is stored as :                */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ...      */
/*                                              */
/* Scans pseq->data from begin over length      */
/* symbols (clipped to seqlen + circular) for   */
/* occurrences of ppat with at most             */
/* ppat->maxerr substitutions (capped at        */
/* MAX_PAT_ERR). Positions set in ppat->omask   */
/* may not be substituted.                      */
/* The hit stacks of pattern patnum are emptied */
/* first; each hit pushes its start position on */
/* hitpos and its lowest error count on hiterr. */
/* Returns the number of hits.                  */
/* -------------------------------------------- */
int32_t ManberSub(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    int       e, emax, found;
    uint32_t  pos;
    /* state and masks use patword_t like smat/omask; 32-bit words */
    /* silently dropped every pattern position from the 32nd on    */
    patword_t smask, cmask, sindx;
    /* two leading zeros + two words per error level 0..MAX_PAT_ERR */
    patword_t *pr, r[2 * MAX_PAT_ERR + 4];
    uint8_t   *data;
    StackiPtr *stkpos, *stkerr;
    uint32_t  end;

    end = begin + length;
    end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);

    /* init. scan */
    data   = pseq->data + begin;
    stkpos = pseq->hitpos + patnum;
    EmptyStacki(stkpos[0]);
    stkerr = pseq->hiterr + patnum;
    EmptyStacki(stkerr[0]);

    /* the state needs patlen + 1 bits of a patword_t */
    if ((ppat->patlen < 0)
        || (ppat->patlen >= (int32_t)(8 * sizeof(patword_t))))
        return 0;

    /* create local masks */

    emax = ppat->maxerr;
    if (emax > MAX_PAT_ERR)     /* r[] has no room for more levels */
        emax = MAX_PAT_ERR;

    r[0] = r[1] = 0x0;

    cmask = smask = (patword_t) 0x1 << ppat->patlen;

    for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2)
        *pr = cmask;

    cmask = ~ ppat->omask;

    /* loop on text data */

    for (pos = begin ; pos < end ; pos++) {

        sindx = ppat->smat[*data++];

        for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) {

            pr[2] = pr[3] | smask;

            pr[3] = ((pr[0] >> 1) & cmask)      /* sub   */
                    | ((pr[2] >> 1) & sindx);   /* ident */

            if (pr[3] & 0x1L) {                 /* found */
                /* only the lowest error level is reported per position */
                if (! found) {
                    PUSH(stkpos, pos - ppat->patlen + 1);
                    PUSH(stkerr, e);
                }
                found++;
            }
        }
    }

    return (*stkpos)->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* Substitution + Indels                        */
/*                                              */
/* Note : r array is stored as :                */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ...      */
/*                                              */
/* Warning: may return shifted pos.             */
/*                                              */
/* Scans pseq->data from begin over length      */
/* symbols (clipped to seqlen + circular) for   */
/* occurrences of ppat with at most             */
/* ppat->maxerr substitutions, insertions or    */
/* deletions (capped at MAX_PAT_ERR). Positions */
/* set in ppat->omask accept no edit.           */
/* The hit stacks of pattern patnum are emptied */
/* first, as in ManberNoErr and ManberSub; each */
/* hit pushes a position on hitpos and its      */
/* lowest error count on hiterr.                */
/* Returns the number of hits.                  */
/* -------------------------------------------- */
int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    int       e, emax, found;
    uint32_t  pos;
    /* state and masks use patword_t like smat/omask; 32-bit words */
    /* silently dropped every pattern position from the 32nd on    */
    patword_t smask, cmask, sindx;
    /* two leading zeros + two words per error level 0..MAX_PAT_ERR */
    patword_t *pr, r[2 * MAX_PAT_ERR + 4];
    uint8_t   *data;
    StackiPtr *stkpos, *stkerr;
    uint32_t  end;

    end = begin + length;
    end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);

    /* init. scan */
    /* empty the stacks so that the returned count and the stored */
    /* hits describe this scan only, like the sibling functions   */
    data   = pseq->data + begin;
    stkpos = pseq->hitpos + patnum;
    EmptyStacki(stkpos[0]);
    stkerr = pseq->hiterr + patnum;
    EmptyStacki(stkerr[0]);

    /* the state needs patlen + 1 bits of a patword_t */
    if ((ppat->patlen < 0)
        || (ppat->patlen >= (int32_t)(8 * sizeof(patword_t))))
        return 0;

    /* create local masks */

    emax = ppat->maxerr;
    if (emax > MAX_PAT_ERR)     /* r[] has no room for more levels */
        emax = MAX_PAT_ERR;

    r[0] = r[1] = 0x0;

    cmask = smask = (patword_t) 0x1 << ppat->patlen;

    /* level e starts with its e + 1 top bits set */
    for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) {
        *pr = cmask;
        cmask = (cmask >> 1) | smask;
    }

    cmask = ~ ppat->omask;

    /* loop on text data */

    for (pos = begin ; pos < end ; pos++) {

        sindx = ppat->smat[*data++];

        for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) {

            pr[2] = pr[3] | smask;

            pr[3] = ((   pr[0]                  /* ins   */
                      | (pr[0] >> 1)            /* sub   */
                      | (pr[1] >> 1))           /* del   */
                     & cmask)
                    | ((pr[2] >> 1) & sindx);   /* ident */

            if (pr[3] & 0x1L) {                 /* found */
                /* only the lowest error level is reported per position */
                if (! found) {
                    PUSH(stkpos, pos - ppat->patlen + 1);
                    PUSH(stkerr, e);
                }
                found++;
            }
        }
    }

    return (*stkpos)->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* Single entry point: picks the search variant */
/* from the pattern settings.                   */
/*   maxerr == 0  -> ManberNoErr                */
/*   hasIndel     -> ManberIndel                */
/*   otherwise    -> ManberSub                  */
/* Returns whatever the selected variant        */
/* returns.                                     */
/* -------------------------------------------- */
int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    if (ppat->maxerr == 0)
        return ManberNoErr(pseq, ppat, patnum, begin, length);

    return (ppat->hasIndel
                ? ManberIndel(pseq, ppat, patnum, begin, length)
                : ManberSub(pseq, ppat, patnum, begin, length));
}
/* -------------------------------------------- */
/* NWS alignment                                */
/* used to edit (refine) the hits               */
/* (with mandatory substitution at the borders) */
/*                                              */
/* Fills a (patlen+1) x (seqlen+1) table of     */
/* edit distances between the pattern (rows j)  */
/* and the sequence (columns i), then scans the */
/* last row from i = seqlen down to 0: for each */
/* cell whose distance is <= nerr, i is stored  */
/* in reslen and the distance in reserr.        */
/* Returns the number of stored entries; the    */
/* caller's arrays need room for seqlen + 1.    */
/*                                              */
/* NOTE(review): sTab is a fixed static buffer  */
/* sized for seqlen <= MAX_PAT_LEN+MAX_PAT_ERR  */
/* and patlen <= MAX_PAT_LEN; neither is        */
/* checked here. Being static, it also makes    */
/* the function non-reentrant.                  */
/* NOTE(review): amask is uint32_t while        */
/* ONEMASK is a 64-bit constant, so             */
/* ONEMASK >> lpat looks like it truncates to 0 */
/* for lpat < 32, making KRONECK always 1 -     */
/* confirm against CreateS bit ordering.        */
/* -------------------------------------------- */
int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
Seq *pseq;
Pattern *ppat;
int32_t nerr, *reslen, *reserr;
{
    uint8_t *sseq, *px;
    int32_t i, j, lseq, lpat, npos, dindel, dsub,
            *pc, *pi, *pd, *ps;
    uint32_t amask;
    static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)];

    lseq = pseq->seqlen;

    /* pc = current cell, pi = left, pd = above, ps = above-left; */
    /* a row holds lseq + 1 cells, hence the stride of pd         */
    pc = sTab; /* |----|----| --> i */
    pi = pc - 1; /* | ps | pd | | */
    pd = pi - lseq; /* |----|----| | */
    ps = pd - 1; /* | pi | pc | v j */
    /* |---------| */

    lseq = pseq->seqlen;
    lpat = ppat->patlen;

    /* px is read only when i >= 1, where it points at data[i-1] */
    sseq = pseq->data - 1;

    amask = ONEMASK >> lpat;

    for (j = 0 ; j <= lpat ; j++) {

        for (i = 0 , px = sseq ; i <= lseq ; i++, px++) {

            if (i && j) {
                dindel = MIN(*pi, *pd) + 1;
                dsub = *ps + KRONECK(ppat->smat[*px], amask);
                *pc = MIN(dindel, dsub);
            }
            else if (i) /* j == 0 */
                *pc = *pi + 1;
            else if (j) /* i == 0 */
                *pc = *pd + 1;
            else /* root */
                *pc = 0;

            /* the four pointers advance together, cell by cell */
            pc++;
            pi++;
            pd++;
            ps++;
        }

        amask <<= 1;
    }

    /* step back onto the last cell (j = lpat, i = lseq) */
    pc--;

    for (i = lseq, npos = 0 ; i >= 0 ; i--, pc--) {
        if (*pc <= nerr) {
            *reslen++ = i;
            *reserr++ = *pc;
            npos++;
        }
    }

    return npos;
}

82
pkg/obiapat/ecoMalloc.c Normal file
View File

@ -0,0 +1,82 @@
#include "obiapat.h"
#include <stdlib.h>
/* Non-zero while the eco_malloc / eco_realloc / eco_free wrappers below
 * report every call on stderr. */
static int eco_log_malloc = 0;

/* Switch the allocation trace on.
 * (void) makes this a real prototype: an empty parameter list leaves the
 * arguments unchecked in pre-C23 C. */
void eco_trace_memory_allocation(void)
{
    eco_log_malloc = 1;
}

/* Switch the allocation trace off. */
void eco_untrace_memory_allocation(void)
{
    eco_log_malloc = 0;
}
/* Allocate chunksize bytes of zero-filled memory (calloc).
 *
 * error_message, filename, line, errno and errmsg are handed to ecoError()
 * with ECO_MEM_ERROR when the allocation fails; what ecoError does with them
 * is not visible from this file. When tracing is on, the call is reported on
 * stderr.
 *
 * Returns the new chunk (NULL after a failed allocation if ecoError returns).
 */
void *eco_malloc(int32_t chunksize,
                 const char *error_message,
                 const char *filename,
                 int32_t line,
                 int *errno, char **errmsg)
{
    void *block = calloc(1, chunksize);

    if (block == NULL)
        ecoError(ECO_MEM_ERROR, error_message, filename, line, errno, errmsg);

    if (eco_log_malloc != 0) {
        fprintf(stderr,
                "Memory segment located at %p of size %d is allocated (file : %s [%d])",
                block, chunksize, filename, line);
    }

    return block;
}
/* Resize chunk to newsize bytes (realloc).
 *
 * error_message, filename, line, errno and errmsg are handed to ecoError()
 * with ECO_MEM_ERROR when the reallocation fails; what ecoError does with
 * them is not visible from this file. When tracing is on, the old and new
 * addresses are reported on stderr.
 *
 * Returns the resized chunk (NULL after a failed reallocation if ecoError
 * returns).
 */
void *eco_realloc(void *chunk,
                  int32_t newsize,
                  const char *error_message,
                  const char *filename,
                  int32_t line,
                  int *errno, char **errmsg)
{
    void *resized = realloc(chunk, newsize);

    if (resized == NULL)
        ecoError(ECO_MEM_ERROR, error_message, filename, line, errno, errmsg);

    if (eco_log_malloc != 0) {
        fprintf(stderr,
                "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
                chunk, resized, newsize, filename, line);
    }

    return resized;
}
/* Release chunk (free).
 *
 * When tracing is on, the address is reported on stderr together with
 * error_message, filename and line. errno and errmsg are accepted but not
 * used by this function.
 */
void eco_free(void *chunk,
              const char *error_message,
              const char *filename,
              int32_t line,
              int *errno, char **errmsg)
{
    free(chunk);

    if (eco_log_malloc == 0)
        return;

    fprintf(stderr,
            "Memory segment %p is released => %s (file : %s [%d])",
            chunk, error_message, filename, line);
}

391
pkg/obiapat/libstki.c Normal file
View File

@ -0,0 +1,391 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: libstki.c */
/* Purpose: A library to deal with 'stacks' of */
/* integers */
/* Note: 'stacks' are dynamic (i.e. size is */
/* automatically readjusted when needed) */
/* History: */
/* 00/03/92 : <Gloup> first draft */
/* 15/08/93 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> some cleaning for 2020's */
/* ==================================================== */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// #include "Gtypes.h"
#include "libstki.h"
/* ============================ */
/* Local constants and macros   */
/* ============================ */
/* double / halve the capacity of the stack behind handle stkh; */
/* both expand to a ResizeStacki call and yield its result      */
#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1)
#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1)
/* last error recorded by this file; read (and optionally cleared) by StkiError */
static int16_t sStkiLastError = kStkiNoErr;
/* -------------------------------------------- */
/* error handling                               */
/* get/reset error flag                         */
/*                                              */
/* Returns the last recorded error code; when   */
/* reset is true the flag is put back to        */
/* kStkiNoErr afterwards.                       */
/*                                              */
/* @function: StkiError                         */
/* -------------------------------------------- */
int16_t StkiError(bool reset)
{
    const int16_t last = sStkiLastError;

    if (reset)
        sStkiLastError = kStkiNoErr;

    return last;

} /* end of StkiError */
/* -------------------------------------------- */
/* creation d'un stack */
/* */
/* @function: NewStacki */
/* -------------------------------------------- */
StackiPtr NewStacki(int32_t size)
{
StackiPtr stki;
if (! (stki = NEW(Stacki)))
return NULL;
stki->size = size;
stki->top = 0;
stki->cursor = 0;
if ( ! (stki->val = NEWN(int32_t, size))) {
sStkiLastError = kStkiMemErr;
return FreeStacki(stki);
}
return stki;
} /* end of NewStacki */
/* -------------------------------------------- */
/* release a stack                              */
/*                                              */
/* Frees the value array and the stack itself.  */
/* A NULL stack is accepted. Always returns     */
/* NULL, so the result can be assigned back to  */
/* the caller's pointer.                        */
/*                                              */
/* @function: FreeStacki                        */
/* -------------------------------------------- */
StackiPtr FreeStacki(StackiPtr stki)
{
    if (stki == NULL)
        return NULL;

    if (stki->val != NULL)
        FREE(stki->val);

    FREE(stki);

    return NULL;

} /* end of FreeStacki */
/* -------------------------------------------- */
/* create a vector of stacks                    */
/*                                              */
/* Allocates vectSize stacks of stackSize       */
/* values each. If any allocation fails, the    */
/* stacks built so far and the vector are       */
/* released and NULL is returned.               */
/*                                              */
/* @function: NewStackiVector                   */
/* -------------------------------------------- */
StackiHdle NewStackiVector(int32_t vectSize, int32_t stackSize)
{
    StackiHdle vect = NEWN(StackiPtr, vectSize);
    int32_t    built;

    if (vect == NULL) {
        sStkiLastError = kStkiMemErr;
        return NULL;
    }

    for (built = 0 ; built < vectSize ; built++) {
        vect[built] = NewStacki(stackSize);
        /* only the first 'built' entries exist at this point */
        if (vect[built] == NULL)
            return FreeStackiVector(vect, built);
    }

    return vect;

} /* end of NewStackiVector */
/* -------------------------------------------- */
/* release a vector of stacks                   */
/*                                              */
/* Frees the first vectSize stacks of stkh and  */
/* then the vector itself. A NULL vector is     */
/* accepted. Always returns NULL.               */
/*                                              */
/* @function: FreeStackiVector                  */
/* -------------------------------------------- */
StackiHdle FreeStackiVector(StackiHdle stkh, int32_t vectSize)
{
    int32_t slot = 0;

    if (stkh == NULL)
        return NULL;

    while (slot < vectSize) {
        (void) FreeStacki(stkh[slot]);
        slot++;
    }

    FREE(stkh);

    return NULL;

} /* end of FreeStackiVector */
/* -------------------------------------------- */
/* resize a stack                               */
/*                                              */
/* Changes the capacity of *stkh to size        */
/* values. Returns the new size, or 0 after     */
/* recording kStkiMemErr; on failure the stack  */
/* keeps its previous buffer and size.          */
/*                                              */
/* @function: ResizeStacki                      */
/* -------------------------------------------- */
int32_t ResizeStacki(StackiHdle stkh, int32_t size)
{
    int32_t *val;

    /* realloc(ptr, 0) may release the buffer and return NULL, which */
    /* would leave (*stkh)->val dangling; a size <= 0 (also produced */
    /* when doubling overflows) is refused without touching memory   */
    if (size <= 0) {
        sStkiLastError = kStkiMemErr;
        return 0;
    }

    if (! (val = REALLOC(int32_t, (*stkh)->val, size))) {
        sStkiLastError = kStkiMemErr;
        return 0;
    }

    (*stkh)->size = size;
    (*stkh)->val  = val;

    return size;

} /* end of ResizeStacki */
/* -------------------------------------------- */
/* push a value                                 */
/*                                              */
/* Appends val on top of the stack, doubling    */
/* the capacity first when the stack is full.   */
/* Returns false when that growth fails.        */
/*                                              */
/* @function: PushiIn                           */
/* -------------------------------------------- */
bool PushiIn(StackiHdle stkh, int32_t val)
{
    StackiPtr stk = *stkh;

    if (stk->top >= stk->size) {
        if (! ExpandStack(stkh))
            return false;
    }

    /* stk->val is read after the possible resize */
    stk->val[stk->top] = val;
    stk->top++;

    return true;

} /* end of PushiIn */
/* -------------------------------------------- */
/* pop a value                                  */
/*                                              */
/* Removes the top value and stores it in *val. */
/* Returns false when the stack is empty.       */
/* The capacity is halved when fewer than half  */
/* of the slots remain in use and more than     */
/* kMinStackiSize values are left.              */
/*                                              */
/* @function: PopiOut                           */
/* -------------------------------------------- */
bool PopiOut(StackiHdle stkh, int32_t *val)
{
    StackiPtr stk = *stkh;

    if (stk->top <= 0)
        return false;

    stk->top--;
    *val = stk->val[stk->top];

    /* a failed shrink is ignored: the value was popped anyway */
    if ((stk->top < (stk->size >> 1)) && (stk->top > kMinStackiSize))
        (void) ShrinkStack(stkh);

    return true;

} /* end of PopiOut */
/* -------------------------------------------- */
/* downward read                                */
/*                                              */
/* Moves the cursor one slot down and stores    */
/* the value found there in *val. Returns false */
/* (leaving *val untouched) when the cursor is  */
/* already at the bottom.                       */
/*                                              */
/* @function: ReadiDown                         */
/* -------------------------------------------- */
bool ReadiDown(StackiPtr stki, int32_t *val)
{
    if (stki->cursor > 0) {
        stki->cursor -= 1;
        *val = stki->val[stki->cursor];
        return true;
    }

    return false;

} /* end of ReadiDown */
/* -------------------------------------------- */
/* upward read                                  */
/*                                              */
/* Stores the value under the cursor in *val    */
/* and moves the cursor one slot up. Returns    */
/* false (leaving *val untouched) when the      */
/* cursor has reached the top.                  */
/*                                              */
/* @function: ReadiUp                           */
/* -------------------------------------------- */
bool ReadiUp(StackiPtr stki, int32_t *val)
{
    if (stki->cursor < stki->top) {
        *val = stki->val[stki->cursor];
        stki->cursor += 1;
        return true;
    }

    return false;

} /* end of ReadiUp */
/* -------------------------------------------- */
/* move the cursor to the top / to the bottom */
/* */
/* @function: CursiToTop */
/* @function: CursiToBottom */
/* -------------------------------------------- */
/* Place the cursor at top, i.e. one past the last stored value, */
/* so that the next ReadiDown yields the top value. */
void CursiToTop(StackiPtr stki)
{
stki->cursor = stki->top;
} /* end of CursiToTop */
/* Place the cursor on slot 0, so that the next ReadiUp yields the      */
/* bottom value. Written with a prototype: the old identifier-list form */
/* is obsolescent and no longer accepted by C23.                        */
void CursiToBottom(StackiPtr stki)
{
    stki->cursor = 0;
} /* end of CursiToBottom */
/* -------------------------------------------- */
/* swap the values at cursor <-> (top - 1)      */
/*                                              */
/* Does nothing when the stack is empty or when */
/* the cursor does not designate a stored value */
/* (cursor < 0 or cursor >= top).               */
/*                                              */
/* @function: CursiSwap                         */
/* -------------------------------------------- */
void CursiSwap(StackiPtr stki)
{
    int32_t tmp;

    /* cursor == top (e.g. right after CursiToTop) points one past */
    /* the last value, possibly outside the allocated array        */
    if ((stki->top <= 0) || (stki->cursor < 0)
        || (stki->cursor >= stki->top))
        return;

    tmp = stki->val[stki->cursor];
    stki->val[stki->cursor] = stki->val[stki->top - 1];
    stki->val[stki->top - 1] = tmp;

} /* end of CursiSwap */
/* -------------------------------------------- */
/* Search a value in the stack, reading         */
/* downward from the current cursor.            */
/* On success the cursor is left on the slot    */
/* where the value was found; otherwise it ends */
/* at the bottom.                               */
/* Returns true when sval was found.            */
/*                                              */
/* @function: SearchDownStacki                  */
/* -------------------------------------------- */
bool SearchDownStacki(StackiPtr stki, int32_t sval)
{
    int32_t current;

    for (;;) {
        if (! ReadiDown(stki, &current))
            return false;       /* bottom reached */
        if (current == sval)
            return true;
    }

} /* end of SearchDownStacki */
/* -------------------------------------------- */
/* Binary search of a value in the stack.       */
/* The stack is assumed to be sorted by         */
/* increasing values.                           */
/* On success the cursor is placed on the slot  */
/* where the value was found.                   */
/* Returns true when sval was found.            */
/*                                              */
/* @function: BinSearchStacki                   */
/* -------------------------------------------- */
bool BinSearchStacki(StackiPtr stki, int32_t sval)
{
    int32_t midd, low, high, cur;

    low  = 0;
    high = stki->top - 1;

    while (high >= low) {

        /* low + half the span cannot overflow, unlike (high + low) / 2 */
        midd = low + (high - low) / 2;

        cur = stki->val[midd];

        if (cur == sval) {
            stki->cursor = midd;
            return true;
        }

        /* compare directly: the difference cur - sval can overflow */
        /* and flip sign for values far apart                       */
        if (cur > sval)
            high = midd - 1;
        else
            low  = midd + 1;
    }

    return false;

} /* end of BinSearchStacki */
/* -------------------------------------------- */
/* test the *physical* equality of two stacks:  */
/* same number of values and the same values in */
/* the same order. Size and cursor are not      */
/* compared.                                    */
/*                                              */
/* @function: SameStacki                        */
/* -------------------------------------------- */
bool SameStacki(StackiPtr stki1, StackiPtr stki2)
{
    size_t nbytes;

    if (stki1->top == stki2->top) {
        nbytes = stki1->top * sizeof(int32_t);
        return (memcmp(stki1->val, stki2->val, nbytes) == 0);
    }

    return false;

} /* end of SameStacki */
/* -------------------------------------------- */
/* reverse the order of the values in a stack   */
/*                                              */
/* Returns false when the stack is empty, true  */
/* otherwise. The cursor is not modified.       */
/*                                              */
/* @function: ReverseStacki                     */
/* -------------------------------------------- */
bool ReverseStacki(StackiPtr stki)
{
    int32_t lo, hi, held;

    if (stki->top <= 0)
        return false;

    /* swap the two ends and walk toward the middle */
    for (lo = 0, hi = stki->top - 1 ; lo < hi ; lo++, hi--) {
        held          = stki->val[hi];
        stki->val[hi] = stki->val[lo];
        stki->val[lo] = held;
    }

    return true;

} /* end of ReverseStacki */
/* -------------------------------------------- */
/* Remove every value from a stack by moving */
/* back the top member to 0. */
/* */
/* The buffer and its size are kept, and the */
/* cursor is left as it was. Always returns */
/* true. */
/* */
/* @function: EmptyStacki */
/* -------------------------------------------- */
bool EmptyStacki(StackiPtr stki)
{
stki->top = 0;
return true;
}

81
pkg/obiapat/libstki.h Normal file
View File

@ -0,0 +1,81 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: libstki.h */
/* Purpose: library of dynamic stacks holding */
/* integer values */
/* History: */
/* 00/03/92 : <Gloup> first draft */
/* 07/07/93 : <Gloup> complete revision */
/* 10/03/94 : <Gloup> added xxxVector funcs */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> last some cleaning for 2020 */
/* ==================================================== */
#ifndef _H_libstki
#define _H_libstki
#include <stdint.h>
#include <stdbool.h>
#include "apat_mem.h"
/* ==================================================== */
/* Sizing constants */
/* ==================================================== */
#ifndef kMinStackiSize
#define kMinStackiSize 2 /* minimum stack size */
#endif
#define kStkiNoErr 0 /* ok */
#define kStkiMemErr 1 /* not enough memory */
/* NOTE(review): presumably the two values accepted by the `reset` */
/* argument of StkiError() - confirm against libstki.c. */
#define kStkiReset true
#define kStkiGet false
/* ==================================================== */
/* Types & data structures */
/* ==================================================== */
/* -------------------- */
/* structure : stack */
/* -------------------- */
typedef struct Stacki {
/* ---------------------*/
int32_t size; /* stack size */
int32_t top; /* current free pos. */
int32_t cursor; /* current cursor */
int32_t *val; /* values */
/* ---------------------*/
} Stacki, *StackiPtr, **StackiHdle;
/* ==================================================== */
/* Prototypes (generated by mproto) */
/* ==================================================== */
/* libstki.c */
int16_t StkiError (bool reset );
StackiPtr NewStacki (int32_t size );
StackiPtr FreeStacki (StackiPtr stki );
StackiHdle NewStackiVector (int32_t vectSize, int32_t stackSize );
StackiHdle FreeStackiVector (StackiHdle stkh , int32_t vectSize );
int32_t ResizeStacki (StackiHdle stkh , int32_t size );
bool PushiIn (StackiHdle stkh , int32_t val );
bool PopiOut (StackiHdle stkh , int32_t *val );
bool ReadiDown (StackiPtr stki , int32_t *val );
bool ReadiUp (StackiPtr stki , int32_t *val );
void CursiToTop (StackiPtr stki );
void CursiToBottom (StackiPtr stki );
void CursiSwap (StackiPtr stki );
/* reads values with ReadiDown() until sval is read; true when found */
bool SearchDownStacki (StackiPtr stki , int32_t sval );
/* binary search in a stack sorted by increasing values; sets the cursor on success */
bool BinSearchStacki (StackiPtr stki , int32_t sval );
/* true when both stacks hold the same number of values with identical content */
bool SameStacki (StackiPtr stki1 , StackiPtr stki2 );
/* reverses the stored values in place; false when the stack is empty */
bool ReverseStacki (StackiPtr stki );
/* resets top to 0; always returns true */
bool EmptyStacki (StackiPtr stki );
#endif /* _H_libstki */

417
pkg/obiapat/obiapat.c Normal file
View File

@ -0,0 +1,417 @@
#include <string.h>
#include <stdio.h>
#include "libstki.h"
#include "apat.h"
#include "obiapat.h"
static void EncodeSequence(SeqPtr seq);
static void UpperSequence(char *seq);
/*
 * Build an error report for the caller; despite its historical name this
 * function does not terminate the program.
 *
 * @param error      error number, stored into *errno
 * @param message    the text explaining what's going on
 * @param filename   the source file where the failure was detected
 * @param linenumber the line where it has failed
 *                   (filename and linenumber are written at pre-processing
 *                   time by the ECOERROR macro)
 * @param errno      output: receives the error number
 * @param error_msg  output: receives a newly allocated message that the
 *                   caller must release with free(); set to NULL when the
 *                   message could not be allocated
 * @return always NULL, so that callers can `return ecoError(...)`
 *
 * NOTE(review): the parameter is named `errno`; this only compiles as long
 * as no included header defines the standard errno macro.
 */
void* ecoError(int error,
               const char* message,
               const char * filename,
               int linenumber,
               int *errno,
               char **error_msg)
{
    int written;

    written = asprintf(error_msg,
                       "Error %d in file %s line %d : %s",
                       error,
                       filename,
                       linenumber,
                       message);

    /* On failure asprintf leaves *error_msg undefined: make it a safe NULL */
    /* so that the caller can still read and free() it.                     */
    if (written < 0)
        *error_msg = NULL;

    *errno = error;

    return NULL;
}
/*
* @doc: DNA alphabet (IUPAC)
*/
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
/*
* @doc: complementary DNA alphabet (IUPAC)
*/
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
static char sNuc[] = LX_BIO_DNA_ALPHA;
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
static char LXBioBaseComplement(char nucAc);
static char *LXBioSeqComplement(char *nucAcSeq);
static char *reverseSequence(char *str,char isPattern);
/* ---------------------------- */
/* Returns the complement of a single symbol: the symbol is looked up  */
/* in sNuc and the character at the same index in sAnuc is returned.   */
/* Symbols absent from sNuc are returned unchanged.                    */
char LXBioBaseComplement(char nucAc)
{
    const char *hit = strchr(sNuc, nucAc);

    if (hit == NULL)
        return nucAc;

    return sAnuc[hit - sNuc];
}
/* ---------------------------- */
/* Complements, in place, every symbol of a NUL-terminated string */
/* (no reversal is done here). Returns its argument.              */
char *LXBioSeqComplement(char *nucAcSeq)
{
    char *cursor = nucAcSeq;

    while (*cursor) {
        *cursor = LXBioBaseComplement(*cursor);
        cursor++;
    }

    return nucAcSeq;
}
char *reverseSequence(char *str,char isPattern)
{
char *sb, *se, c;
if (! str)
return str;
sb = str;
se = str + strlen(str) - 1;
while(sb <= se) {
c = *sb;
*sb++ = *se;
*se-- = c;
}
sb = str;
se = str + strlen(str) - 1;
if (isPattern)
for (;sb <= se; sb++)
{
if (*sb=='#')
{
if (*(sb+1) == '[') {
while(*sb !=']') {
*sb = *(sb+1);
sb++;
}
*sb='#';
} else {
if (((se - sb) > 2) && (*(sb+2)=='!'))
{
*sb='!';
sb+=2;
*sb='#';
}
else
{
*sb=*(sb+1);
sb++;
*sb='#';
}}
}
else if (*sb=='!')
{
*sb=*(sb-1);
*(sb-1)='!';
}
}
return str;
}
/* Reverse-complements a pattern string in place: every symbol is       */
/* complemented, then the string is reversed with pattern fix-ups on.   */
/* Returns its argument.                                                */
char *ecoComplementPattern(char *nucAcSeq)
{
    char *complemented = LXBioSeqComplement(nucAcSeq);

    return reverseSequence(complemented, 1);
}
/* Reverse-complements a sequence string in place: every symbol is      */
/* complemented, then the string is plainly reversed.                   */
/* Returns its argument.                                                */
char *ecoComplementSequence(char *nucAcSeq)
{
    char *complemented = LXBioSeqComplement(nucAcSeq);

    return reverseSequence(complemented, 0);
}
/*
 * Extract the subsequence [begin,end[ from nucAcSeq.
 *
 * When begin >= end the extraction wraps around the end of the string:
 * the result is nucAcSeq[begin..] followed by nucAcSeq[0..end[.
 *
 * The result lives in a function-static buffer that is reused (and grown
 * on demand) by every call: it is overwritten by the next call and must
 * not be freed by the caller.
 *
 * Returns NULL when the buffer cannot be (re)allocated; the previous
 * buffer, if any, is kept untouched in that case.
 */
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end,
                     int *errno, char **errmsg)
{
    static char   *buffer   = NULL;
    static int32_t buffSize = 0;
    int32_t        length;
    int            wrapped  = !(begin < end);

    if (wrapped)
        length = end + strlen(nucAcSeq) - begin;
    else
        length = end - begin;

    /* Make room for `length` symbols plus the terminating NUL. */
    if (length >= buffSize)
    {
        char *grown;

        if (buffer)
            grown = ECOREALLOC(buffer,length+1,
                               "Error in reallocating sub sequence buffer",errno,errmsg);
        else
            grown = ECOMALLOC(length+1,
                              "Error in allocating sub sequence buffer",errno,errmsg);

        if (!grown)
            return NULL;

        buffer   = grown;
        buffSize = length+1;
    }

    if (wrapped)
    {
        /* tail of the string first, then the first `end` symbols */
        strncpy(buffer,nucAcSeq+begin,length - end);
        strncpy(buffer+(length-end),nucAcSeq ,end);
    }
    else
        strncpy(buffer,nucAcSeq + begin,length);

    buffer[length]=0;

    return buffer;
}
/* -------------------------------------------- */
/* Converts, in place, every ASCII lowercase    */
/* letter of a NUL-terminated string to its     */
/* uppercase form; other characters are kept.   */
/* -------------------------------------------- */
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
#define TO_UPPER(c) ((c) - 'a' + 'A')
void UpperSequence(char *seq)
{
    char *cursor = seq;

    while (*cursor) {
        if (IS_LOWER(*cursor))
            *cursor = TO_UPPER(*cursor);
        cursor++;
    }
}
#undef IS_LOWER
#undef TO_UPPER
/* -------------------------------------------- */
/* Encodes seq->cseq into seq->data.            */
/* Each character has bit 5 cleared (ASCII      */
/* lowercase -> uppercase); letters are stored  */
/* as their offset from 'A', anything else as   */
/* 0. The first seq->circular symbols are then  */
/* encoded again right after the sequence, and  */
/* every hit/error stack is emptied.            */
/* IS_UPPER is slightly faster than isupper     */
/* -------------------------------------------- */
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
void EncodeSequence(SeqPtr seq)
{
    int         i;
    uint8_t    *out = seq->data;
    const char *in;
    char        symbol;

    /* the sequence itself */
    for (in = seq->cseq ; *in ; in++) {
        symbol = *in & (~32);
        *out++ = (IS_UPPER(symbol) ? symbol - 'A' : 0x0);
    }

    /* wrap-around copy of the sequence start */
    in = seq->cseq;
    for (i = 0 ; i < seq->circular ; i++) {
        symbol = in[i] & (~32);
        *out++ = (IS_UPPER(symbol) ? symbol - 'A' : 0x0);
    }

    /* forget the hits of any previous search */
    for (i = 0 ; i < MAX_PATTERN ; i++) {
        seq->hiterr[i]->top = 0;
        seq->hitpos[i]->top = 0;
    }
}
#undef IS_UPPER
/*
 * Prepare an apat sequence structure for the NUL-terminated string `in`.
 *
 * @param in       sequence text; the pointer is stored in out->cseq, it is
 *                 NOT copied, so it must outlive the returned structure
 * @param circular any non-zero value reserves MAX_PAT_LEN extra symbols
 *                 after the sequence for the circular wrap-around
 * @param seqlen   number of symbols in `in`
 * @param out      structure to recycle, or NULL to allocate a new one
 * @param errno    output: error number on failure
 * @param errmsg   output: error message on failure
 * @return the (possibly recycled) structure, or NULL on allocation failure
 *
 * NOTE(review): when a failure occurs after a fresh structure has been
 * allocated, that structure is not released.
 */
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
                   SeqPtr out,
                   int *errno, char **errmsg)
{
    int     i;
    int32_t needed;

    if (circular != 0) circular=MAX_PAT_LEN;

    if (!out)
    {
        out = ECOMALLOC(sizeof(Seq),
                        "Error in Allocation of a new Seq structure",errno,errmsg);
        if (!out)
            return NULL;

        /* The data buffer is tested below: it must not be read */
        /* uninitialised on a freshly allocated structure.      */
        out->data   = NULL;
        out->datsiz = 0;

        for (i = 0 ; i < MAX_PATTERN ; i++)
        {
            if (! (out->hitpos[i] = NewStacki(kMinStackiSize)))
                ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation",errno,errmsg);

            if (! (out->hiterr[i] = NewStacki(kMinStackiSize)))
                ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation",errno,errmsg);
        }
    }

    out->seqsiz = out->seqlen = seqlen;
    out->circular = circular;

    needed = out->seqlen + circular;

    if (!out->data)
    {
        out->data = ECOMALLOC(needed * sizeof(uint8_t),
                              "Error in Allocation of a new Seq data member",
                              errno,errmsg);
        /* a NULL result is only an error when some room was requested */
        if (!out->data && needed > 0)
            return NULL;
        out->datsiz = needed;
    }
    else if (needed >= out->datsiz)
    {
        uint8_t *grown = ECOREALLOC(out->data,needed * sizeof(uint8_t),
                                    "Error during Seq data buffer realloc",
                                    errno,errmsg);
        /* keep the previous buffer reachable when the resize fails */
        if (!grown && needed > 0)
            return NULL;
        out->data   = grown;
        out->datsiz = needed;
    }

    out->cseq = (char *)in;

    EncodeSequence(out);

    return out;
}
/*
 * Release an apat sequence structure: its data buffer, every hit and
 * error stack, then the structure itself. The text referenced by
 * pseq->cseq is not released here.
 *
 * Returns 0 when a structure was released, 1 when pseq is NULL.
 */
int32_t delete_apatseq(SeqPtr pseq,
                       int *errno, char **errmsg)
{
    int i;

    if (!pseq)
        return 1;

    if (pseq->data)
        ECOFREE(pseq->data,"Freeing sequence data buffer",
                errno,errmsg);

    for (i = 0 ; i < MAX_PATTERN ; i++) {
        if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]);
        if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]);
    }

    ECOFREE(pseq,"Freeing apat sequence structure",
            errno,errmsg);

    return 0;
}
/*
 * Compile the textual pattern `pat`, accepting at most `error_max` errors.
 *
 * The Pattern structure and its three arrays (cpat, patcode, smat) are
 * carved out of one single allocation, so the whole pattern is released
 * by freeing the returned pointer.
 *
 * @return the compiled pattern, or NULL on failure with *errno / *errmsg
 *         describing the problem
 *
 * NOTE(review): patcode starts right after the pattern text, so it is not
 * guaranteed to be aligned for uint32_t; the block is also not released
 * when checking, encoding or compiling fails.
 */
PatternPtr buildPattern(const char *pat, int32_t error_max,
                        int *errno, char **errmsg)
{
    PatternPtr pattern;
    int32_t    patlen;
    int32_t    patlen2;

    patlen  = strlen(pat);
    patlen2 = lenPattern(pat);

    pattern = ECOMALLOC(sizeof(Pattern) +               // Space for struct Pattern
                        sizeof(char)*patlen+1 +         // Space for cpat
                        sizeof(uint32_t) * patlen2 +    // Space for patcode
                        sizeof(patword_t) * ALPHA_LEN , // Space for smat
                        "Error in pattern allocation",
                        errno,errmsg);

    /* never dereference a failed allocation */
    if (!pattern)
        return NULL;

    pattern->ok      = true;
    pattern->hasIndel= false;
    pattern->maxerr  = error_max;

    pattern->cpat    = (char*)pattern + sizeof(Pattern);
    pattern->patcode = (uint32_t*)(pattern->cpat + patlen + 1);
    pattern->smat    = (patword_t*)(pattern->patcode + patlen2);

    strncpy(pattern->cpat,pat,patlen);
    pattern->cpat[patlen]=0;
    UpperSequence(pattern->cpat);

    if (!CheckPattern(pattern))
        ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking",errno,errmsg);

    if (! EncodePattern(pattern, dna))
        ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding",errno,errmsg);

    if (! CreateS(pattern, ALPHA_LEN))
        ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling",errno,errmsg);

    return pattern;
}
/*
 * Build the reverse complement of an already compiled pattern.
 *
 * A new Pattern is allocated as one single block (structure + cpat +
 * patcode + smat); maxerr, hasIndel and patlen are copied from `pat`,
 * the pattern text is copied then reverse-complemented, and the result
 * is checked, encoded and compiled again.
 *
 * @return the new pattern, or NULL on failure with *errno / *errmsg
 *         describing the problem
 *
 * NOTE(review): as in buildPattern, the block is not released when
 * checking, encoding or compiling fails.
 */
PatternPtr complementPattern(PatternPtr pat, int *errno,
                             char **errmsg)
{
    PatternPtr pattern;
    size_t     textlen = strlen(pat->cpat);

    pattern = ECOMALLOC(sizeof(Pattern) +
                        sizeof(char) * textlen + 1 +
                        sizeof(uint32_t) * pat->patlen +
                        sizeof(patword_t) * ALPHA_LEN,
                        "Error in pattern allocation",
                        errno,errmsg);

    /* never dereference a failed allocation */
    if (!pattern)
        return NULL;

    pattern->ok      = true;
    pattern->hasIndel= pat->hasIndel;
    pattern->maxerr  = pat->maxerr;
    pattern->patlen  = pat->patlen;

    pattern->cpat    = (char*)pattern + sizeof(Pattern);
    pattern->patcode = (uint32_t*)(pattern->cpat + textlen + 1);
    pattern->smat    = (patword_t*)(pattern->patcode + pat->patlen);

    strcpy(pattern->cpat,pat->cpat);

    ecoComplementPattern(pattern->cpat);

    if (!CheckPattern(pattern))
        ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking",errno,errmsg);

    if (! EncodePattern(pattern, dna))
        ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding",errno,errmsg);

    if (! CreateS(pattern, ALPHA_LEN))
        ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling",errno,errmsg);

    return pattern;
}

134
pkg/obiapat/obiapat.h Normal file
View File

@ -0,0 +1,134 @@
#ifndef __obiapat_h__
#define __obiapat_h__
#include <stdio.h>
#include <stdint.h>
#include "apat.h"
/*****************************************************
*
* Data type declarations
*
*****************************************************/
/*
*
* Sequence types
*
*/
/* NOTE(review): presumably the packed, on-disk layout of a sequence */
/* record, `data` being the first byte of a variable-length payload - */
/* confirm against the code reading it. */
typedef struct {
int32_t taxid;
char AC[20];
int32_t DE_length;
int32_t SQ_length;
int32_t CSQ_length;
char data[1];
} ecoseqformat_t;
/* NOTE(review): presumably the in-memory form of a sequence record */
/* (AC / DE / SQ held as separate strings) - confirm against callers. */
typedef struct {
int32_t taxid;
int32_t SQ_length;
char *AC;
char *DE;
char *SQ;
} ecoseq_t;
/*****************************************************
*
* Function declarations
*
*****************************************************/
/* Formats an error message into *error_msg, stores `error` into *errno */
/* and returns NULL. */
void* ecoError(int error,
const char* message,
const char * filename,
int linenumber,
int *errno,
char **error_msg);
/* Expands to a `return` statement: it reports the error through */
/* ecoError() with the current file and line, and makes the enclosing */
/* function return ecoError()'s result. */
#define ECOERROR(code,message,errno,errmsg) \
{ return ecoError((code),(message),__FILE__,__LINE__,errno,errmsg); }
/* Error codes passed as the `code` argument of ECOERROR. */
#define ECO_IO_ERROR (1)
#define ECO_MEM_ERROR (2)
#define ECO_ASSERT_ERROR (3)
#define ECO_NOTFOUND_ERROR (4)
/*
*
* Low level system functions
*
*/
int32_t is_big_endian();
int32_t swap_int32_t(int32_t);
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line,
int *errno, char **errmsg);
void *eco_realloc(void *chunk,
int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line,
int *errno, char **errmsg);
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line,
int *errno, char **errmsg);
void eco_trace_memory_allocation();
void eco_untrace_memory_allocation();
/* Wrappers around eco_malloc / eco_realloc / eco_free that fill in the */
/* current source file and line at pre-processing time. */
#define ECOMALLOC(size,error_message,errno,errmsg) \
eco_malloc((size),(error_message),__FILE__,__LINE__,errno,errmsg)
#define ECOREALLOC(chunk,size,error_message,errno,errmsg) \
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__,errno,errmsg)
#define ECOFREE(chunk,error_message,errno,errmsg) \
eco_free((chunk),(error_message),__FILE__,__LINE__,errno,errmsg)
ecoseq_t *new_ecoseq();
int32_t delete_ecoseq(ecoseq_t *);
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid
);
int32_t delete_apatseq(SeqPtr pseq,
int *errno, char **errmsg);
PatternPtr buildPattern(const char *pat, int32_t error_max, int *errno, char **errmsg);
PatternPtr complementPattern(PatternPtr pat, int *errno, char **errmsg);
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
SeqPtr out,
int *errno, char **errmsg);
char *ecoComplementPattern(char *nucAcSeq);
char *ecoComplementSequence(char *nucAcSeq);
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end,
int *errno, char **errmsg);
#endif /* __obiapat_h__ */

168
pkg/obiapat/pattern.go Normal file
View File

@ -0,0 +1,168 @@
package obiapat
/*
#cgo CFLAGS: -g -Wall
#include <stdlib.h>
#include "obiapat.h"
*/
import "C"
import (
"errors"
"unsafe"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// MAX_PAT_LEN is the Go view of the C constant of the same name.
var MAX_PAT_LEN = int(C.MAX_PAT_LEN)
// ApatPattern wraps a pointer to a C Pattern structure.
type ApatPattern struct {
pointer *C.Pattern
}
// ApatSequence wraps a pointer to a C Seq structure.
type ApatSequence struct {
pointer *C.Seq
}
// NilApatPattern is the ApatPattern holding a nil C pointer; it is the
// value returned by MakeApatPattern on failure.
var NilApatPattern = ApatPattern{nil}
// NilApatSequence is the ApatSequence holding a nil C pointer; it is the
// value returned by MakeApatSequence on failure.
var NilApatSequence = ApatSequence{nil}
// MakeApatPattern compiles the textual pattern with the C function
// buildPattern, accepting at most errormax errors.
//
// On failure it returns NilApatPattern and an error carrying the message
// produced by the C code. On success the returned pattern owns C memory
// that must be released with Free.
func MakeApatPattern(pattern string, errormax int) (ApatPattern, error) {
	var (
		errno  C.int32_t
		errmsg *C.char
	)

	cpattern := C.CString(pattern)
	defer C.free(unsafe.Pointer(cpattern))

	compiled := C.buildPattern(cpattern, C.int32_t(errormax), &errno, &errmsg)
	if compiled != nil {
		return ApatPattern{pointer: compiled}, nil
	}

	// The message was allocated on the C side: copy it, then release it.
	defer C.free(unsafe.Pointer(errmsg))

	return NilApatPattern, errors.New(C.GoString(errmsg))
}
// ReverseComplement builds, with the C function complementPattern, a new
// pattern that is the reverse complement of the receiver.
//
// On failure it returns a pattern holding a nil pointer and an error
// carrying the message produced by the C code. The returned pattern is
// independent from the receiver and must be released with Free.
func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
	var (
		errno  C.int32_t
		errmsg *C.char
	)

	complemented := C.complementPattern((*C.Pattern)(pattern.pointer), &errno, &errmsg)
	if complemented != nil {
		return ApatPattern{pointer: complemented}, nil
	}

	// The message was allocated on the C side: copy it, then release it.
	defer C.free(unsafe.Pointer(errmsg))

	return ApatPattern{pointer: nil}, errors.New(C.GoString(errmsg))
}
// String returns a Go copy of the pattern text stored on the C side.
func (pattern ApatPattern) String() string {
	text := pattern.pointer.cpat
	return C.GoString(text)
}
// Length returns the patlen field of the underlying C pattern.
func (pattern ApatPattern) Length() int {
	patlen := pattern.pointer.patlen
	return int(patlen)
}
// Free releases the C memory block holding the pattern. The receiver must
// not be used afterwards.
func (pattern ApatPattern) Free() {
	block := unsafe.Pointer(pattern.pointer)
	C.free(block)
}
// Print hands the pattern over to the C function PrintDebugPattern.
func (pattern ApatPattern) Print() {
	cpattern := C.PatternPtr(pattern.pointer)
	C.PrintDebugPattern(cpattern)
}
// MakeApatSequence converts a BioSequence into the C structure used by the
// apat pattern matcher.
//
// The sequence text is copied into a NUL-terminated C buffer which stays
// referenced by the C structure (field cseq). When circular is true, the C
// side reserves room for the wrap-around of the sequence start.
//
// An optional ApatSequence can be given in recycle: its C structure is then
// reused instead of allocating a new one, and the text buffer it referenced
// until now is released once the new one is in place.
//
// On failure NilApatSequence is returned together with an error carrying the
// message produced by the C code, and the freshly allocated text buffer is
// released.
func MakeApatSequence(sequence obiseq.BioSequence, circular bool, recycle ...ApatSequence) (ApatSequence, error) {
	var errno C.int32_t
	var errmsg *C.char

	seqlen := sequence.Length()
	p := C.malloc(C.size_t(seqlen) + 1)

	ic := 0
	if circular {
		ic = 1
	}

	// copy the data into the buffer, by converting it to a Go array
	cBuf := (*[1 << 30]byte)(p)
	copy(cBuf[:seqlen], sequence.Sequence())
	cBuf[seqlen] = 0

	var out *C.Seq
	var previous unsafe.Pointer

	if len(recycle) > 0 {
		out = recycle[0].pointer
		if out != nil {
			// Remember the buffer the recycled structure points to:
			// new_apatseq overwrites cseq without releasing it.
			previous = unsafe.Pointer(out.cseq)
		}
	}

	pseq := C.new_apatseq((*C.char)(p), C.int32_t(ic), C.int32_t(seqlen),
		(*C.Seq)(out),
		&errno, &errmsg)

	if pseq == nil {
		// Nothing references the new buffer: do not leak it.
		C.free(p)
		message := C.GoString(errmsg)
		C.free(unsafe.Pointer(errmsg))
		return NilApatSequence, errors.New(message)
	}

	if previous != nil && previous != p {
		C.free(previous)
	}

	return ApatSequence{pointer: pseq}, nil
}
// Length returns the seqlen field of the underlying C sequence.
func (sequence ApatSequence) Length() int {
	seqlen := sequence.pointer.seqlen
	return int(seqlen)
}
// Free releases the C memory attached to the sequence: the text buffer
// referenced by cseq, which MakeApatSequence allocates with C.malloc and
// which delete_apatseq does not release, then the structure itself through
// delete_apatseq.
//
// Calling Free on a sequence holding a nil pointer does nothing. The
// receiver is a copy, so the caller's value cannot be reset from here: it
// must simply not be used after Free.
func (sequence ApatSequence) Free() {
	if sequence.pointer == nil {
		return
	}

	var errno C.int32_t
	var errmsg *C.char

	C.free(unsafe.Pointer(sequence.pointer.cseq))

	C.delete_apatseq(sequence.pointer,
		&errno, &errmsg)
}
// FindAllIndex runs the C function ManberAll for the pattern on the
// sequence and returns one [3]int per hit: the start position read from the
// hit stack, that position plus the pattern length, and the value read from
// the error stack.
//
// limits optionally gives the first position to scan and the length of the
// region to scan; they default to 0 and the sequence length. C.MAX_PAT_LEN
// is added to the length handed to ManberAll. A nil slice is returned when
// ManberAll reports no hit.
func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (loc [][3]int) {
	begin, length := 0, sequence.Length()

	switch {
	case len(limits) > 1:
		begin, length = limits[0], limits[1]
	case len(limits) == 1:
		begin = limits[0]
	}

	nhits := int(C.ManberAll(sequence.pointer,
		pattern.pointer,
		0,
		C.int32_t(begin),
		C.int32_t(length+C.MAX_PAT_LEN)))

	if nhits == 0 {
		return nil
	}

	// View the C stacks of pattern 0 as Go arrays.
	positions := (*[1 << 30]int32)(unsafe.Pointer(sequence.pointer.hitpos[0].val))
	mismatches := (*[1 << 30]int32)(unsafe.Pointer(sequence.pointer.hiterr[0].val))
	patlen := int(pattern.pointer.patlen)

	for hit := 0; hit < nhits; hit++ {
		from := int(positions[hit])
		loc = append(loc, [3]int{from, from + patlen, int(mismatches[hit])})
	}

	return loc
}

370
pkg/obiapat/pcr.go Normal file
View File

@ -0,0 +1,370 @@
package obiapat
import (
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __options__ stores the parameters of a PCR simulation and of its
// batch processing.
type __options__ struct {
min_length int
max_length int
circular bool
forward_error int
reverse_error int
buffer_size int
batch_size int
parallel_workers int
}
// Options is a handle on a shared __options__ value: copies of an Options
// refer to the same underlying parameters.
type Options struct {
pointer *__options__
}
// WithOption is a function modifying the parameters referenced by an
// Options value.
type WithOption func(Options)
// MinLength returns the stored min_length parameter.
func (opt Options) MinLength() int {
	params := opt.pointer
	return params.min_length
}
// MaxLength returns the stored max_length parameter.
func (opt Options) MaxLength() int {
	params := opt.pointer
	return params.max_length
}
// ForwardError returns the stored forward_error parameter.
func (opt Options) ForwardError() int {
	params := opt.pointer
	return params.forward_error
}
// ReverseError returns the stored reverse_error parameter.
func (opt Options) ReverseError() int {
	params := opt.pointer
	return params.reverse_error
}
// Circular returns the stored circular flag.
func (opt Options) Circular() bool {
	params := opt.pointer
	return params.circular
}
// BufferSize returns the stored buffer_size parameter.
func (opt Options) BufferSize() int {
	params := opt.pointer
	return params.buffer_size
}
// BatchSize returns the stored batch_size parameter.
func (opt Options) BatchSize() int {
	params := opt.pointer
	return params.batch_size
}
// ParallelWorkers returns the stored parallel_workers parameter.
func (opt Options) ParallelWorkers() int {
	params := opt.pointer
	return params.parallel_workers
}
// MakeOptions builds an Options value initialised with the defaults (no
// length limit, no error allowed, linear sequences, 4 parallel workers,
// batches and buffers of 100) and then applies every setter in order.
func MakeOptions(setters []WithOption) Options {
	defaults := &__options__{
		min_length:       0,
		max_length:       0,
		forward_error:    0,
		reverse_error:    0,
		circular:         false,
		parallel_workers: 4,
		batch_size:       100,
		buffer_size:      100,
	}

	opt := Options{pointer: defaults}

	for i := range setters {
		setters[i](opt)
	}

	return opt
}
// OptionMinLength returns a setter storing min_length in the options.
func OptionMinLength(min_length int) WithOption {
	return func(opt Options) {
		opt.pointer.min_length = min_length
	}
}
// OptionMaxLength returns a setter storing max_length in the options.
func OptionMaxLength(max_length int) WithOption {
	return func(opt Options) {
		opt.pointer.max_length = max_length
	}
}
// OptionForwardError returns a setter storing max as the forward_error
// parameter of the options.
func OptionForwardError(max int) WithOption {
	return func(opt Options) {
		opt.pointer.forward_error = max
	}
}
// OptionReverseError returns a setter storing max as the reverse_error
// parameter of the options.
func OptionReverseError(max int) WithOption {
	return func(opt Options) {
		opt.pointer.reverse_error = max
	}
}
// OptionCircular returns a setter storing the circular flag in the options.
func OptionCircular(circular bool) WithOption {
	return func(opt Options) {
		opt.pointer.circular = circular
	}
}
// OptionBufferSize returns a setter storing size as the buffer_size
// parameter of the options.
func OptionBufferSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.buffer_size = size
	}
}
// OptionParallelWorkers returns a setter storing nworkers as the
// parallel_workers parameter of the options.
func OptionParallelWorkers(nworkers int) WithOption {
	return func(opt Options) {
		opt.pointer.parallel_workers = nworkers
	}
}
// OptionBatchSize returns a setter storing size as the batch_size
// parameter of the options.
func OptionBatchSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.batch_size = size
	}
}
// __pcr__ looks for every amplicon delimited by the two primers on a single
// sequence, on both strands.
//
// seq is the apat encoding of sequence; forward and reverse are the compiled
// primers, cfwd and crev their reverse complements.
//
// Direct strand: the forward primer is matched first, then crev is searched
// downstream. Reverse strand: the reverse primer is matched first, then cfwd
// is searched downstream, and the amplicon is reverse-complemented so that
// it is always reported in the forward -> reverse orientation.
//
// When a maximum length is set, the downstream search is restricted to the
// region that can still hold an amplicon: from the first upstream match to
// the end of the last upstream match plus the maximum length plus the length
// of the downstream primer, never beyond the end of the sequence. For
// circular sequences the whole sequence plus its wrap-around is searched.
//
// An amplicon is kept when its length is strictly positive (primers neither
// touching nor overlapping) and within the MinLength/MaxLength bounds, a
// bound of 0 meaning "no limit". Each amplicon receives a copy of the
// annotations of the source sequence plus the primers, the matched sites
// and their error counts.
func __pcr__(seq ApatSequence, sequence obiseq.BioSequence,
	forward, cfwd, reverse, crev ApatPattern,
	opt Options) obiseq.BioSequenceSlice {
	results := make(obiseq.BioSequenceSlice, 0, 10)

	// ---- direct strand: forward ... crev ----

	forward_matches := forward.FindAllIndex(seq)

	if forward_matches != nil {

		begin := forward_matches[0][0]
		length := seq.Length() - begin

		if opt.pointer.max_length > 0 {
			// Matches are {start, end, errors}: the window must be
			// measured from the END of the last match.
			last := forward_matches[len(forward_matches)-1]
			window := last[1] - begin + opt.MaxLength() + reverse.Length()
			if window < length {
				length = window
			}
		}

		if opt.Circular() {
			begin = 0
			length = seq.Length() + MAX_PAT_LEN
		}

		reverse_matches := crev.FindAllIndex(seq, begin, length)

		if reverse_matches != nil {
			for _, fm := range forward_matches {
				// Matches starting in the circular extension duplicate
				// matches of the sequence start.
				if fm[0] >= seq.Length() {
					continue
				}

				erri := fm[2]

				for _, rm := range reverse_matches {
					if rm[0] >= seq.Length() {
						continue
					}

					errj := rm[2]

					amplicon_length := 0
					if rm[1] > fm[0] {
						amplicon_length = rm[0] - fm[1]
					} else if opt.Circular() {
						amplicon_length = rm[0] + seq.Length() - fm[1]
					}

					if amplicon_length > 0 && // For when primers touch or overlap
						(opt.MinLength() == 0 || amplicon_length >= opt.MinLength()) &&
						(opt.MaxLength() == 0 || amplicon_length <= opt.MaxLength()) {
						amplicon, _ := sequence.Subsequence(fm[1], rm[0], opt.pointer.circular)
						annot := amplicon.Annotations()
						goutils.CopyMap(annot, sequence.Annotations())
						annot["forward_primer"] = forward.String()

						match, _ := sequence.Subsequence(fm[0], fm[1], opt.pointer.circular)
						annot["forward_match"] = match.String()
						match.Revoke()

						annot["forward_error"] = erri

						annot["reverse_primer"] = reverse.String()
						match, _ = sequence.Subsequence(rm[0], rm[1], opt.pointer.circular)
						match = match.ReverseComplement(true)
						annot["reverse_match"] = match.String()
						match.Revoke()

						annot["reverse_error"] = errj
						results = append(results, amplicon)
					}
				}
			}
		}
	}

	// ---- reverse strand: reverse ... cfwd ----

	upstream_matches := reverse.FindAllIndex(seq)

	if upstream_matches != nil {

		begin := upstream_matches[0][0]
		length := seq.Length() - begin

		if opt.pointer.max_length > 0 {
			// The downstream primer searched on this strand is cfwd.
			last := upstream_matches[len(upstream_matches)-1]
			window := last[1] - begin + opt.MaxLength() + forward.Length()
			if window < length {
				length = window
			}
		}

		if opt.Circular() {
			begin = 0
			length = seq.Length() + MAX_PAT_LEN
		}

		downstream_matches := cfwd.FindAllIndex(seq, begin, length)

		if downstream_matches != nil {
			for _, fm := range upstream_matches {
				if fm[0] >= seq.Length() {
					continue
				}

				erri := fm[2]

				for _, rm := range downstream_matches {
					if rm[0] >= seq.Length() {
						continue
					}

					errj := rm[2]

					amplicon_length := 0
					if rm[1] > fm[0] {
						amplicon_length = rm[0] - fm[1]
					} else if opt.Circular() {
						// fm is a match of the reverse primer: use its own
						// end rather than the forward primer length.
						amplicon_length = rm[0] + seq.Length() - fm[1]
					}

					if amplicon_length > 0 && // For when primers touch or overlap
						(opt.MinLength() == 0 || amplicon_length >= opt.MinLength()) &&
						(opt.MaxLength() == 0 || amplicon_length <= opt.MaxLength()) {
						amplicon, _ := sequence.Subsequence(fm[1], rm[0], opt.pointer.circular)
						amplicon = amplicon.ReverseComplement(true)

						annot := amplicon.Annotations()
						goutils.CopyMap(annot, sequence.Annotations())
						annot["forward_primer"] = forward.String()

						match, _ := sequence.Subsequence(rm[0], rm[1], opt.pointer.circular)
						match = match.ReverseComplement(true)
						annot["forward_match"] = match.String()
						match.Revoke()

						annot["forward_error"] = errj

						annot["reverse_primer"] = reverse.String()
						match, _ = sequence.Subsequence(fm[0], fm[1], opt.pointer.circular)
						annot["reverse_match"] = match.String()
						match.Revoke()

						annot["reverse_error"] = erri
						results = append(results, amplicon)
					}
				}
			}
		}
	}

	return results
}
// PCR simulates a PCR on a single sequence with the given primers and
// returns the amplicons found on both strands.
//
// The primers are apat patterns; the options select the error counts
// accepted on each primer, the amplicon length bounds and whether the
// sequence is circular.
//
// The function panics with the error reported by the apat layer when the
// sequence or one of the primers cannot be prepared: going on with a nil C
// pointer would crash inside the C code without any diagnostic. Every C
// resource allocated here is released before returning.
func PCR(sequence obiseq.BioSequence,
	forward, reverse string, options ...WithOption) obiseq.BioSequenceSlice {

	opt := MakeOptions(options)

	seq, err := MakeApatSequence(sequence, opt.Circular())
	if err != nil {
		panic(err)
	}
	defer seq.Free()

	fwd, err := MakeApatPattern(forward, opt.ForwardError())
	if err != nil {
		panic(err)
	}
	defer fwd.Free()

	rev, err := MakeApatPattern(reverse, opt.ReverseError())
	if err != nil {
		panic(err)
	}
	defer rev.Free()

	cfwd, err := fwd.ReverseComplement()
	if err != nil {
		panic(err)
	}
	defer cfwd.Free()

	crev, err := rev.ReverseComplement()
	if err != nil {
		panic(err)
	}
	defer crev.Free()

	return __pcr__(seq, sequence,
		fwd, cfwd, rev, crev,
		opt)
}
// PCRSlice simulates a PCR on every sequence of the slice with the given
// primers and returns all the amplicons, in the order of the input
// sequences.
//
// The primers are compiled once, and a single C sequence structure is
// recycled from one sequence to the next.
//
// The function panics with the error reported by the apat layer when a
// primer or a sequence cannot be prepared: going on with a nil C pointer
// would crash inside the C code without any diagnostic. Every C resource
// allocated here is released before returning.
func PCRSlice(sequences obiseq.BioSequenceSlice,
	forward, reverse string, options ...WithOption) obiseq.BioSequenceSlice {

	results := make(obiseq.BioSequenceSlice, 0, len(sequences))

	opt := MakeOptions(options)

	fwd, err := MakeApatPattern(forward, opt.ForwardError())
	if err != nil {
		panic(err)
	}
	defer fwd.Free()

	rev, err := MakeApatPattern(reverse, opt.ReverseError())
	if err != nil {
		panic(err)
	}
	defer rev.Free()

	cfwd, err := fwd.ReverseComplement()
	if err != nil {
		panic(err)
	}
	defer cfwd.Free()

	crev, err := rev.ReverseComplement()
	if err != nil {
		panic(err)
	}
	defer crev.Free()

	// seq starts with a nil pointer and is recycled at each iteration; the
	// closure releases whatever structure is current when the function ends.
	var seq ApatSequence
	defer func() { seq.Free() }()

	for _, sequence := range sequences {
		var next ApatSequence

		if seq.pointer == nil {
			next, err = MakeApatSequence(sequence, opt.Circular())
		} else {
			next, err = MakeApatSequence(sequence, opt.Circular(), seq)
		}

		if err != nil {
			panic(err)
		}

		seq = next

		amplicons := __pcr__(seq, sequence,
			fwd, cfwd, rev, crev,
			opt)

		if len(amplicons) > 0 {
			results = append(results, amplicons...)
		}
	}

	return results
}
// PCRSliceWorker returns a worker function that applies PCRSlice, with the
// given primers and options, to the slice of sequences it receives.
func PCRSliceWorker(forward, reverse string,
	options ...WithOption) obiseq.SeqSliceWorker {

	return func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
		amplicons := PCRSlice(sequences, forward, reverse, options...)
		return amplicons
	}
}

View File

@ -0,0 +1,238 @@
package obiformats
import (
"compress/gzip"
"encoding/csv"
"fmt"
"io"
"log"
"os"
"strconv"
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __ecopcr_file__ gathers the state needed while reading an ecoPCR result
// file: the underlying reader, the CSV reader built on top of it, the count
// of occurrences of every sequence name already met, and the values
// extracted from the file header (format version, mode, both primers).
type __ecopcr_file__ struct {
file io.Reader
csv *csv.Reader
names map[string]int
version int
mode string
forward_primer string
reverse_primer string
}
// __readline__ reads the stream one byte at a time up to the next '\n' and
// returns the bytes read before it, without the line terminator.
//
// Reading also stops at the first read error or when a read returns no
// byte: the bytes collected so far are returned, which is an empty string
// when the stream is already exhausted. Lines of any length are supported.
func __readline__(stream io.Reader) string {
	line := make([]byte, 0, 1024)
	char := make([]byte, 1)

	for {
		n, err := stream.Read(char)
		if err != nil || n != 1 || char[0] == '\n' {
			break
		}
		// append grows the buffer on demand: a line longer than the
		// initial capacity must not overflow it.
		line = append(line, char[0])
	}

	return string(line)
}
// __read_ecopcr_bioseq__ reads the next record of an ecoPCR result file and
// converts it into a BioSequence whose annotations carry the columns of the
// record.
//
// The sequence name is made unique across the file: the n-th repetition of
// a name receives the suffix "_n". Version 2 files hold two extra columns,
// the melting temperatures of both primers, which shift the following
// columns; a temperature that cannot be parsed is stored as -1.
//
// Integer columns that cannot be parsed are stored as 0. The error of the
// CSV reader (io.EOF at the end of the file) is returned unchanged together
// with obiseq.NilBioSequence.
func __read_ecopcr_bioseq__(file *__ecopcr_file__) (obiseq.BioSequence, error) {

	record, err := file.csv.Read()

	if err != nil {
		return obiseq.NilBioSequence, err
	}

	// field returns the i-th column without its surrounding spaces.
	field := func(i int) string {
		return strings.TrimSpace(record[i])
	}

	// number returns the i-th column as an integer, 0 when it is not one.
	number := func(i int) int {
		value, _ := strconv.Atoi(field(i))
		return value
	}

	// melting returns the i-th column as a temperature, -1 when it cannot
	// be parsed.
	melting := func(i int) interface{} {
		value, err := strconv.ParseFloat(field(i), 64)
		if err != nil {
			return -1
		}
		return value
	}

	name := field(0)

	// Ensure that sequence name is unique across a file.
	if val, ok := file.names[name]; ok {
		file.names[name]++
		name = fmt.Sprintf("%s_%d", name, val)
	} else {
		file.names[name] = 1
	}

	var sequence []byte
	var comment string

	if file.version == 2 {
		sequence = []byte(field(20))
		comment = field(21)
	} else {
		sequence = []byte(field(18))
		comment = field(19)
	}

	bseq := obiseq.MakeBioSequence(name, sequence, comment)
	annotation := bseq.Annotations()

	annotation["ac"] = name
	annotation["seq_length"] = number(1)
	annotation["taxid"] = number(2)
	annotation["rank"] = field(3)
	annotation["species_taxid"] = number(4)
	annotation["species_name"] = field(5)
	annotation["genus_taxid"] = number(6)
	annotation["genus_name"] = field(7)
	annotation["family_taxid"] = number(8)
	annotation["family_name"] = field(9)

	k_m_taxid := file.mode + "_taxid"
	k_m_name := file.mode + "_name"

	annotation[k_m_taxid] = number(10)
	annotation[k_m_name] = field(11)

	annotation["strand"] = field(12)
	annotation["forward_primer"] = file.forward_primer
	annotation["forward_match"] = field(13)
	annotation["forward_mismatch"] = number(14)

	// delta counts the extra version 2 columns already consumed.
	delta := 0

	if file.version == 2 {
		annotation["forward_tm"] = melting(15)
		delta++
	}

	annotation["reverse_primer"] = file.reverse_primer
	annotation["reverse_match"] = field(15 + delta)
	annotation["reverse_mismatch"] = number(16 + delta)

	if file.version == 2 {
		annotation["reverse_tm"] = melting(17 + delta)
		delta++
	}

	annotation["amplicon_length"] = number(17 + delta)

	return bseq, nil
}
// ReadEcoPCRBatch reads an ecoPCR result file from reader and returns an
// iterator over batches of sequences.
//
// The header is parsed synchronously: the format version is deduced from
// the first 11 bytes, then the primers and the output mode are extracted
// from the matching comment lines. The records are then read by a
// goroutine which sends batches of opt.BatchSize() sequences on the
// channel of the iterator; the channel is closed once that goroutine has
// called Done.
//
// NOTE(review): the three header loops never end when the expected line is
// missing, because __readline__ keeps returning an empty string at the end
// of the input; the field indexes used after strings.Split also assume the
// exact layout of those header lines.
func ReadEcoPCRBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceBatch {
// The version tag, when present, is made of the first 11 bytes of the file.
tag := make([]byte, 11)
n, _ := reader.Read(tag)
version := 1
if n == 11 && string(tag) == "#@ecopcr-v2" {
version = 2
}
line := __readline__(reader)
for !strings.HasPrefix(line, "# direct strand oligo1") {
line = __readline__(reader)
}
forward_primer := (strings.Split(line, " "))[6]
line = __readline__(reader)
for !strings.HasPrefix(line, "# reverse strand oligo2") {
line = __readline__(reader)
}
reverse_primer := (strings.Split(line, " "))[5]
line = __readline__(reader)
for !strings.HasPrefix(line, "# output in") {
line = __readline__(reader)
}
mode := (strings.Split(line, " "))[3]
// The records are '|' separated; the remaining '#' lines are comments.
file := csv.NewReader(reader)
file.Comma = '|'
file.Comment = '#'
file.TrimLeadingSpace = true
file.ReuseRecord = true
log.Printf("EcoPCR file version : %d Mode : %s\n", version, mode)
ecopcr := __ecopcr_file__{
file: reader,
csv: file,
names: make(map[string]int),
version: version,
mode: mode,
forward_primer: forward_primer,
reverse_primer: reverse_primer}
opt := MakeOptions(options)
new_iter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
// A single producer goroutine is registered.
new_iter.Add(1)
// Closes the channel once the producer has called Done.
go func() {
new_iter.Wait()
close(new_iter.Channel())
}()
// Producer: i is the order number of the batch, ii the number of
// sequences accumulated in the current batch.
go func() {
seq, err := __read_ecopcr_bioseq__(&ecopcr)
slice := make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
i := 0
ii := 0
for err == nil {
slice = append(slice, seq)
ii++
if ii >= opt.BatchSize() {
new_iter.Channel() <- obiseq.MakeBioSequenceBatch(i, slice...)
slice = make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
i++
ii = 0
}
seq, err = __read_ecopcr_bioseq__(&ecopcr)
}
// Sends the last, incomplete batch.
if len(slice) > 0 {
new_iter.Channel() <- obiseq.MakeBioSequenceBatch(i, slice...)
}
new_iter.Done()
// Any error other than the end of the file is fatal.
if err != nil && err != io.EOF {
log.Panicf("%+v", err)
}
}()
return new_iter
}
// ReadEcoPCR reads an ecoPCR result file from reader and returns an
// iterator over its sequences, built from the sorted batches produced by
// ReadEcoPCRBatch.
func ReadEcoPCR(reader io.Reader, options ...WithOption) obiseq.IBioSequence {
	batches := ReadEcoPCRBatch(reader, options...)
	sorted := batches.SortBatches()

	return sorted.IBioSequence()
}
// ReadEcoPCRBatchFromFile opens the named ecoPCR result file, transparently
// decompressing it when it is gzip compressed, and returns an iterator over
// batches of its sequences.
//
// An error is returned, with obiseq.NilIBioSequenceBatch, when the file
// cannot be opened or when it is neither gzip compressed nor seekable.
//
// NOTE(review): the file is read asynchronously by ReadEcoPCRBatch and is
// not closed once the reading is over.
func ReadEcoPCRBatchFromFile(filename string, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
	file, err := os.Open(filename)
	if err != nil {
		log.Printf("open file error: %+v", err)
		return obiseq.NilIBioSequenceBatch, err
	}

	var reader io.Reader = file

	// Test if the flux is compressed by gzip
	greader, gerr := gzip.NewReader(file)
	if gerr == nil {
		reader = greader
	} else {
		// gzip.NewReader has consumed the beginning of the file while
		// looking for a gzip header: go back to the start, otherwise
		// plain files would be read without their first bytes.
		if _, serr := file.Seek(0, io.SeekStart); serr != nil {
			file.Close()
			log.Printf("rewind file error: %+v", serr)
			return obiseq.NilIBioSequenceBatch, serr
		}
	}

	return ReadEcoPCRBatch(reader, options...), nil
}
// ReadEcoPCRFromFile opens the named ecoPCR result file and returns an
// iterator over its sequences.
//
// When the file cannot be opened, the error is returned together with the
// zero value of obiseq.IBioSequence: no iterator is built on top of the
// invalid batch iterator.
func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiseq.IBioSequence, error) {
	ib, err := ReadEcoPCRBatchFromFile(filename, options...)
	if err != nil {
		var none obiseq.IBioSequence
		return none, err
	}

	return ib.SortBatches().IBioSequence(), nil
}

246
pkg/obiformats/embl_read.go Normal file
View File

@ -0,0 +1,246 @@
package obiformats
import (
"bufio"
"bytes"
"compress/gzip"
"io"
"log"
"os"
"strconv"
"strings"
"time"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __FILE_CHUNK_SIZE__ is the size, in bytes, of the buffers allocated by
// __read_flat_file_chunk__ for every chunk after the first one (1 MiB).
var __FILE_CHUNK_SIZE__ = 1 << 20
// __slice_grow__ returns its argument unchanged: despite its name it
// currently does not grow anything.
func __slice_grow__(slice []string) []string {
return slice
}
// __embl_chunk__ associates a set of entries, each one being a slice of
// strings, with an order number.
type __embl_chunk__ struct {
entries [][]string
order int
}
// __file_chunk__ is a piece of a flat file: raw gives access to its bytes
// and order is its rank in the file, starting at 0.
type __file_chunk__ struct {
raw io.Reader
order int
}
// __end_of_last_entry__ locates the end of the last complete entry held in
// buff. An entry ends with a line containing only "//", that is with the
// byte pattern <LF>//<CR>?<LF>.
//
// The buffer is scanned backwards by a small state machine. The function
// returns the index of the byte following the final <LF> of the last
// terminator found, so that buff[:index] holds only complete entries, or -1
// when the buffer contains no terminator.
func __end_of_last_entry__(buff []byte) int {
	//     6    5  43  2    1
	//  <CR>?<LF>//<CR>?<LF>
	const found = 5

	state := 0
	start := 0

	for i := len(buff) - 1; i >= 0 && state < found; i-- {
		c := buff[i]

		switch state {
		case 0: // outside of the pattern
			if c == '\n' {
				state = 1
			}
		case 1: // a \n have been matched
			// buff[i+1] is that \n: the entry would end right after it.
			start = i + 2
			switch c {
			case '\r':
				state = 2
			case '/':
				state = 3
			case '\n':
				state = 1
			default:
				state = 0
			}
		case 2: // a \r have been matched
			switch c {
			case '/':
				state = 3
			case '\n':
				state = 1
			default:
				state = 0
			}
		case 3: // the first / have been matched
			switch c {
			case '/':
				state = 4
			case '\n':
				state = 1
			default:
				state = 0
			}
		case 4: // the second / have been matched
			if c == '\n' {
				state = found
			} else {
				state = 0
			}
		}
	}

	// The final state, not the position reached by the scan, tells whether
	// a terminator has been found: it may start at the very beginning of
	// the buffer.
	if state == found {
		return start
	}

	return -1
}
// __parse_embl_file__ is a worker: it consumes file chunks from input,
// parses the EMBL entries of every chunk and sends them on the channel of
// out as one batch per chunk, numbered with the order of the chunk. It
// calls out.Done() once input is closed and drained.
//
// Lines are dispatched on their prefix: the identifier, the scientific
// name, the definition (possibly spread over several lines), the feature
// table, the taxid found in the feature table and the sequence lines are
// accumulated until the "//" line which ends the entry.
//
// NOTE(review): id, scientific_name and taxid are not reset between two
// entries, so an entry lacking one of these lines inherits the value of
// the previous entry of the chunk; scanner.Err() is never checked; a
// sequence line split into fewer than 6 parts makes parts[i] panic. The
// fixed offsets (line[5:], line[37:]) assume the column layout of the EMBL
// format - confirm against the original file, where the spacing of the
// prefixes may differ from what is displayed here.
func __parse_embl_file__(input <-chan __file_chunk__, out obiseq.IBioSequenceBatch) {
for chunks := range input {
scanner := bufio.NewScanner(chunks.raw)
order := chunks.order
sequences := make(obiseq.BioSequenceSlice, 0, 100)
id := ""
scientific_name := ""
def_bytes := new(bytes.Buffer)
feat_bytes := new(bytes.Buffer)
seq_bytes := new(bytes.Buffer)
// default taxid used until a taxon cross-reference is met
taxid := 1
for scanner.Scan() {
line := scanner.Text()
switch {
case strings.HasPrefix(line, "ID "):
// the identifier is the text preceding the first ';'
id = strings.SplitN(line[5:], ";", 2)[0]
case strings.HasPrefix(line, "OS "):
scientific_name = strings.TrimSpace(line[5:])
case strings.HasPrefix(line, "DE "):
// the lines of a multi-line definition are joined with a space
if def_bytes.Len() > 0 {
def_bytes.WriteByte(' ')
}
def_bytes.WriteString(strings.TrimSpace(line[5:]))
case strings.HasPrefix(line, "FH "):
feat_bytes.WriteString(line)
case line == "FH":
feat_bytes.WriteByte('\n')
feat_bytes.WriteString(line)
case strings.HasPrefix(line, "FT "):
feat_bytes.WriteByte('\n')
feat_bytes.WriteString(line)
if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
// the taxid is the text preceding the closing quote
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
}
case strings.HasPrefix(line, " "):
// sequence line: only the first 6 space-separated parts are kept
parts := strings.SplitN(line[5:], " ", 7)
for i := 0; i < 6; i++ {
seq_bytes.WriteString(parts[i])
}
case line == "//":
// end of the entry: build the sequence and start fresh buffers
sequence := obiseq.MakeBioSequence(id,
seq_bytes.Bytes(),
def_bytes.String())
sequence.SetFeatures(feat_bytes.String())
annot := sequence.Annotations()
annot["scientific_name"] = scientific_name
annot["taxid"] = taxid
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
sequences = append(sequences, sequence)
def_bytes = new(bytes.Buffer)
feat_bytes = new(bytes.Buffer)
seq_bytes = new(bytes.Buffer)
}
}
out.Channel() <- obiseq.MakeBioSequenceBatch(order, sequences...)
}
out.Done()
}
// __read_flat_file_chunk__ reads reader to exhaustion and splits its content
// into chunks that each end on an entry boundary as reported by
// __end_of_last_entry__. Every chunk is sent on readers with a sequential
// order number; readers is closed once the input is exhausted.
//
// When the buffer holds no entry boundary (__end_of_last_entry__ returns a
// negative value) the buffer is doubled and filled further instead of being
// sliced with a negative index. At end of input whatever is buffered is sent
// as a last chunk, and nothing is sent for an empty input.
func __read_flat_file_chunk__(reader io.Reader, readers chan __file_chunk__) {
	var err error
	buff := make([]byte, 1<<20)
	filled := 0
	order := 0

	for err == nil {
		// Fill the buffer completely, or up to the first read error / EOF.
		for err == nil && filled < len(buff) {
			var n int
			n, err = reader.Read(buff[filled:])
			filled += n
		}

		data := buff[:filled]
		end := __end_of_last_entry__(data)

		if end < 0 {
			if err == nil {
				// No complete entry yet: enlarge the buffer and keep reading.
				grown := make([]byte, 2*len(buff))
				copy(grown, data)
				buff = grown
				continue
			}
			// Input exhausted without a terminator: flush what is buffered.
			end = len(data)
		}

		if end > 0 {
			readers <- __file_chunk__{bytes.NewBuffer(data[:end]), order}
			order++
		}

		// Carry the beginning of the next, still incomplete, entry over to a
		// fresh buffer; the chunk just sent keeps the old backing array.
		remains := data[end:]
		next := make([]byte, __FILE_CHUNK_SIZE__)
		if len(next) <= len(remains) {
			next = make([]byte, 2*len(remains))
		}
		filled = copy(next, remains)
		buff = next
	}

	close(readers)
}
// 6 5 43 2 1
// <CR>?<LF>//<CR>?<LF>
// ReadEMBLBatch parses the EMBL flat file available on reader and returns an
// iterator over batches of sequences. One goroutine cuts the input into
// chunks, two goroutines parse them, and a last one closes the iterator
// channel once both parsers are done and the channel has been drained.
func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceBatch {
	// Number of parsing goroutines (not yet driven by opt.ParallelWorkers()).
	const nparsers = 2

	opt := MakeOptions(options)
	chunks := make(chan __file_chunk__, opt.BufferSize())
	batches := obiseq.MakeIBioSequenceBatch(opt.BufferSize())

	batches.Add(nparsers)

	closer := func() {
		batches.Wait()
		for len(batches.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(batches.Channel())
	}
	go closer()

	for j := 0; j < nparsers; j++ {
		go __parse_embl_file__(chunks, batches)
	}

	go __read_flat_file_chunk__(reader, chunks)

	return batches
}
// ReadEMBL parses the EMBL flat file available on reader and returns an
// iterator over single sequences, built from the sorted batches produced by
// ReadEMBLBatch.
func ReadEMBL(reader io.Reader, options ...WithOption) obiseq.IBioSequence {
	batches := ReadEMBLBatch(reader, options...)
	sorted := batches.SortBatches()
	return sorted.IBioSequence()
}
// ReadEMBLBatchFromFile opens filename and returns an iterator over the
// batches of sequences it contains. Gzip-compressed files are detected from
// their two-byte magic number and decompressed transparently.
//
// The detection peeks at the stream through a bufio.Reader, so no byte is
// consumed when the file turns out to be plain text. An error is returned
// when the file cannot be opened or when it starts with the gzip magic number
// but carries an invalid gzip header.
func ReadEMBLBatchFromFile(filename string, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
	file, err := os.Open(filename)
	if err != nil {
		log.Printf("open file error: %+v", err)
		return obiseq.NilIBioSequenceBatch, err
	}

	buffered := bufio.NewReader(file)
	var reader io.Reader = buffered

	// Test if the stream is compressed by gzip (magic number 0x1f 0x8b).
	if magic, perr := buffered.Peek(2); perr == nil && magic[0] == 0x1f && magic[1] == 0x8b {
		greader, gerr := gzip.NewReader(buffered)
		if gerr != nil {
			file.Close()
			log.Printf("open file error: %+v", gerr)
			return obiseq.NilIBioSequenceBatch, gerr
		}
		reader = greader
	}

	return ReadEMBLBatch(reader, options...), nil
}
// ReadEMBLFromFile opens the EMBL flat file filename and returns an iterator
// over its sequences. When the file cannot be opened the error is returned
// together with the zero value of the iterator; the nil batch iterator is
// never sorted or converted.
func ReadEMBLFromFile(filename string, options ...WithOption) (obiseq.IBioSequence, error) {
	ib, err := ReadEMBLBatchFromFile(filename, options...)
	if err != nil {
		var none obiseq.IBioSequence
		return none, err
	}
	return ib.SortBatches().IBioSequence(), nil
}

View File

@ -0,0 +1,30 @@
package obiformats
import (
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// ParseGuessedFastSeqHeader parses the annotations stored in the definition
// of sequence, choosing the parser from the first character: a definition
// starting with "{" goes to the JSON parser, anything else to the OBI parser.
func ParseGuessedFastSeqHeader(sequence obiseq.BioSequence) {
	parse := ParseFastSeqOBIHeader
	if strings.HasPrefix(sequence.Definition(), "{") {
		parse = ParseFastSeqJsonHeader
	}
	parse(sequence)
}
// IParseFastSeqHeaderBatch applies the header parser selected by the options
// to every sequence of iterator and returns the resulting batch iterator.
// The number of workers and the buffer size are taken from the options.
func IParseFastSeqHeaderBatch(iterator obiseq.IBioSequenceBatch, options ...WithOption) obiseq.IBioSequenceBatch {
	opt := MakeOptions(options)
	worker := obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader())
	nworkers := opt.ParallelWorkers()
	buffsize := opt.BufferSize()
	return iterator.MakeIWorker(worker, nworkers, buffsize)
}
// IParseFastSeqHeader is the single-sequence counterpart of
// IParseFastSeqHeaderBatch: the iterator is grouped into batches, parsed,
// and the sorted batches are converted back to a sequence iterator.
func IParseFastSeqHeader(iterator obiseq.IBioSequence, options ...WithOption) obiseq.IBioSequence {
	opt := MakeOptions(options)
	batched := iterator.IBioSequenceBatch(opt.BatchSize(), opt.BufferSize())
	parsed := IParseFastSeqHeaderBatch(batched, options...)
	return parsed.SortBatches().IBioSequence()
}

View File

@ -0,0 +1,5 @@
package obiformats
import "git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
// FormatHeader is the signature of the functions that turn the annotations
// of a sequence into the text written in the header of its formatted record.
type FormatHeader func(sequence obiseq.BioSequence) string

View File

@ -0,0 +1,66 @@
package obiformats
import (
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
"github.com/goccy/go-json"
)
// _parse_json_header_ extracts the first top-level JSON object found in
// header, decodes it into annotations and returns the text following the
// object, trimmed of surrounding white space.
//
// Braces located inside double-quoted JSON strings are ignored when looking
// for the end of the object. The header is returned unchanged when it holds
// no complete object or when the object cannot be decoded, so no text is
// lost on malformed input.
func _parse_json_header_(header string, annotations obiseq.Annotation) string {
	start := -1
	stop := -1
	level := 0
	inString := false
	escaped := false

	for i := 0; i < len(header) && stop < 0; i++ {
		c := header[i]

		if inString {
			switch {
			case escaped:
				escaped = false
			case c == '\\':
				escaped = true
			case c == '"':
				inString = false
			}
			continue
		}

		switch c {
		case '"':
			// Strings only matter once inside the object.
			if level > 0 {
				inString = true
			}
		case '{':
			if level == 0 {
				start = i
			}
			level++
		case '}':
			if level > 0 {
				level--
				if level == 0 {
					stop = i
				}
			}
		}
	}

	if start < 0 || stop < 0 {
		return header
	}

	stop++

	// Unmarshal needs a pointer; the existing map is filled in place.
	if err := json.Unmarshal([]byte(header)[start:stop], &annotations); err != nil {
		return header
	}

	return strings.TrimSpace(header[stop:])
}
// ParseFastSeqJsonHeader moves the JSON annotations found in the definition
// of sequence into its annotation map and keeps the remaining text as the
// new definition.
func ParseFastSeqJsonHeader(sequence obiseq.BioSequence) {
	remaining := _parse_json_header_(sequence.Definition(), sequence.Annotations())
	sequence.SetDefinition(remaining)
}
// FormatFastSeqJsonHeader returns the annotations of sequence encoded as a
// JSON object, or the empty string when the sequence has no annotation map.
// It panics when the annotations cannot be encoded.
func FormatFastSeqJsonHeader(sequence obiseq.BioSequence) string {
	if sequence.Annotations() == nil {
		return ""
	}

	encoded, err := json.Marshal(sequence.Annotations())
	if err != nil {
		panic(err)
	}

	return string(encoded)
}

View File

@ -0,0 +1,288 @@
package obiformats
import (
"bytes"
"fmt"
"regexp"
"strconv"
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
"github.com/goccy/go-json"
)
// __obi_header_value_string_pattern__ matches a quoted string value at the
// beginning of the text: optional blanks, a value enclosed in single or in
// double quotes, optional blanks, then the terminating ';'.
var __obi_header_value_string_pattern__ = regexp.MustCompile(`^\s*('[^']*'|"[^"]*")\s*;`)

// __obi_header_value_numeric_pattern__ matches a numeric value at the
// beginning of the text: optional blanks, an optionally signed decimal
// number with optional fraction and exponent, optional blanks, then ';'.
var __obi_header_value_numeric_pattern__ = regexp.MustCompile(`^\s*([+-]?\.\d+|[+-]?\d+(\.\d*)?([eE][+-]?\d+)?)\s*;`)
// __match__dict__ looks for a dictionary value at the beginning of text:
// optional blanks, a brace-delimited block (braces may nest and are ignored
// inside single- or double-quoted strings), optional blanks, then ';'.
//
// It returns {start, end} where start is the index of the opening brace and
// end the index following the ';', or an empty slice when text does not
// start with such a value.
func __match__dict__(text []byte) []int {
	const (
		before = iota // the opening brace has not been seen yet
		inside        // within the main dictionary
		after         // main dictionary closed, waiting for ';'
	)

	phase := before
	depth := 0
	begin := 0
	quote := byte(0) // quote character of the string being read, 0 if none

	for pos, c := range text {
		blank := c == ' ' || c == '\t'

		switch phase {
		case before:
			if c == '{' {
				depth++
				phase = inside
				begin = pos
			} else if !blank {
				// It's not a dict
				return []int{}
			}

		case inside:
			switch {
			case quote != 0:
				// Inside a string only the matching quote is meaningful.
				if c == quote {
					quote = 0
				}
			case c == '"' || c == '\'':
				quote = c
			case c == '{':
				depth++
			case c == '}':
				depth--
				if depth == 0 {
					phase = after
				}
			}

		case after:
			if c == ';' {
				return []int{begin, pos + 1}
			}
			if !blank {
				// Unexpected character between '}' and ';'
				return []int{}
			}
		}
	}

	return []int{}
}
// __match__key__ looks for a "key=" pattern at the beginning of text:
// optional blanks, a key starting with an ASCII letter and continuing with
// letters, digits, '_', '-' or '.', optional blanks, then '='.
//
// It returns {start, end} where start is the index of the first character of
// the key and end the index following the '=', or an empty slice when text
// does not start with a key.
func __match__key__(text []byte) []int {
	isLetter := func(c byte) bool {
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
	}

	const (
		leading  = iota // skipping the blanks preceding the key
		inKey           // reading the key itself
		trailing        // skipping the blanks between the key and '='
	)

	phase := leading
	begin := 0

	for pos, c := range text {
		blank := c == ' ' || c == '\t'

		switch phase {
		case leading:
			if isLetter(c) {
				phase = inKey
				begin = pos
			} else if !blank {
				// It's not a key
				return []int{}
			}

		case inKey:
			switch {
			case c == '=':
				return []int{begin, pos + 1}
			case blank:
				phase = trailing
			case isLetter(c), c >= '0' && c <= '9', c == '_', c == '-', c == '.':
				// Still inside the key
			default:
				// Character not allowed in a key
				return []int{}
			}

		case trailing:
			if c == '=' {
				return []int{begin, pos + 1}
			}
			if !blank {
				// Only blanks are allowed between the key and '='
				return []int{}
			}
		}
	}

	return []int{} // Not a key
}
// __match__general__ matches any value terminated by ';'. It returns
// {0, end} where end is the index following the first ';' of text, or an
// empty slice when text holds no ';'.
func __match__general__(text []byte) []int {
	if end := bytes.IndexByte(text, ';'); end >= 0 {
		return []int{0, end + 1}
	}
	return []int{} // Not generic value
}
// Accepted spellings of the boolean literals in OBI headers.
var (
	__false__ = []byte("false")
	__False__ = []byte("False")
	__FALSE__ = []byte("FALSE")

	__true__ = []byte("true")
	__True__ = []byte("True")
	__TRUE__ = []byte("TRUE")
)

// __is_true__ reports whether text is one of "t", "T", "true", "True" or
// "TRUE".
func __is_true__(text []byte) bool {
	if len(text) == 1 {
		return text[0] == 't' || text[0] == 'T'
	}
	for _, literal := range [][]byte{__true__, __True__, __TRUE__} {
		if bytes.Equal(text, literal) {
			return true
		}
	}
	return false
}

// __is_false__ reports whether text is one of "f", "F", "false", "False" or
// "FALSE".
func __is_false__(text []byte) bool {
	if len(text) == 1 {
		return text[0] == 'f' || text[0] == 'F'
	}
	for _, literal := range [][]byte{__false__, __False__, __FALSE__} {
		if bytes.Equal(text, literal) {
			return true
		}
	}
	return false
}
// ParseFastSeqOBIHeader parses the "key=value;" annotations located at the
// beginning of the definition of sequence, stores them in its annotation map
// and keeps the remaining text, trimmed, as the new definition.
//
// Each value is interpreted, in this order, as:
//   - a number, stored as float64;
//   - a quoted string, stored without its quotes;
//   - a dictionary, decoded as JSON after replacing single quotes by double
//     quotes (map[string]int for keys starting with "merged_" or ending with
//     "_count", map[string]interface{} otherwise, the raw text when decoding
//     fails);
//   - any other text up to the next ';', stored as a bool when it spells a
//     boolean and as a string otherwise.
//
// Parsing stops at the first key that has no ';' terminated value. The scan
// resumes right after the ';' of each value, so a header ending on ';' or a
// key directly following a ';' is handled without slicing out of range.
func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) {
	annotations := sequence.Annotations()
	d := []byte(sequence.Definition())

	for m := __match__key__(d); len(m) > 0; m = __match__key__(d) {
		var value interface{}
		var vm []int

		key := string(bytes.TrimSpace(d[m[0]:(m[1] - 1)]))
		part := d[m[1]:]

		if vm = __obi_header_value_numeric_pattern__.FindIndex(part); len(vm) > 0 {
			// numeric value; the pattern only lets well-formed numbers
			// through, so the conversion error is not checked
			bvalue := bytes.TrimSpace(part[vm[0]:(vm[1] - 1)])
			value, _ = strconv.ParseFloat(string(bvalue), 64)
		} else if vm = __obi_header_value_string_pattern__.FindIndex(part); len(vm) > 0 {
			// string value: drop the surrounding quotes
			bvalue := bytes.TrimSpace(part[vm[0]:(vm[1] - 1)])
			value = string(bvalue[1:(len(bvalue) - 1)])
		} else if vm = __match__dict__(part); len(vm) > 0 {
			// dict value
			bvalue := bytes.TrimSpace(part[vm[0]:(vm[1] - 1)])
			j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`))
			var err error
			if strings.HasPrefix(key, "merged_") ||
				strings.HasSuffix(key, "_count") {
				dict := make(map[string]int)
				err = json.Unmarshal(j, &dict)
				value = dict
			} else {
				dict := make(map[string]interface{})
				err = json.Unmarshal(j, &dict)
				value = dict
			}
			if err != nil {
				value = string(bvalue)
			}
		} else if vm = __match__general__(part); len(vm) > 0 {
			// generic value
			bvalue := bytes.TrimSpace(part[vm[0]:(vm[1] - 1)])
			switch {
			case __is_false__(bvalue):
				value = false
			case __is_true__(bvalue):
				value = true
			default:
				value = string(bvalue)
			}
		} else {
			// no value: the rest of the text is the definition
			break
		}

		annotations[key] = value

		// vm[1] is the index following the ';'. The blanks before the next
		// key are skipped by __match__key__ itself.
		d = part[vm[1]:]
	}

	sequence.SetDefinition(string(bytes.TrimSpace(d)))
}
// FormatFastSeqOBIHeader returns the annotations of sequence formatted as a
// list of "key=value; " items, or the empty string when the sequence has no
// annotation map. Items follow the iteration order of the map.
func FormatFastSeqOBIHeader(sequence obiseq.BioSequence) string {
	annotations := sequence.Annotations()
	if annotations == nil {
		return ""
	}

	var header strings.Builder
	for key, value := range annotations {
		if text, isString := value.(string); isString {
			fmt.Fprintf(&header, "%s=%s; ", key, text)
			continue
		}
		fmt.Fprintf(&header, "%s=%v; ", key, value)
	}

	return header.String()
}

View File

@ -0,0 +1,104 @@
#include "fastseq_read.h"
/**
 * @brief wrap an already opened gzFile into a fast* reader
 *
 * Takes ownership of fp: when the reader cannot be allocated the file is
 * closed before returning, so nothing is leaked on failure.
 *
 * @param fp    the gzFile to read from, may be Z_NULL
 * @param shift the quality shift stored in the reader
 * @return a newly allocated reader, or NULL when fp is Z_NULL or when an
 *         allocation fails
 */
static fast_kseq_t* _open_fast_sek(gzFile fp, int shift) {
    fast_kseq_t* iterator;

    if (fp == Z_NULL)
        return NULL;

    iterator = (fast_kseq_t*)malloc(sizeof(fast_kseq_t));
    if (iterator == NULL) {
        gzclose(fp);
        return NULL;
    }

    iterator->filez = fp;
    iterator->finished = false;
    iterator->shift = shift;
    iterator->seq = kseq_init(fp);

    if (iterator->seq == NULL) {
        free(iterator);
        gzclose(fp);
        return NULL;
    }

    return iterator;
}
/**
 * @brief open a FastA or FastQ file, gzipped or not
 *
 * @param filename path of the fast* file, opened with gzopen in read mode
 * @param shift    the quality shift stored in the reader
 * @return fast_kseq_t* a pointer to the reader, or NULL on failure
 */
fast_kseq_t* open_fast_sek_file(const char* filename, int shift) {
    return _open_fast_sek(gzopen(filename, "r"), shift);
}
/**
 * @brief open a fast* reader on a file descriptor
 *
 * @param fd        the file descriptor to read from
 * @param keep_open when true the reader works on a duplicate of fd, so that
 *                  fd itself is not the descriptor handed to gzdopen
 * @param shift     the quality shift stored in the reader
 * @return a pointer to the reader, or NULL on failure
 */
fast_kseq_p open_fast_sek_fd(int fd, bool keep_open, int shift) {
    int source = keep_open ? dup(fd) : fd;

    return _open_fast_sek(gzdopen(source, "r"), shift);
}
/**
 * @brief open a fast* reader on the standard input
 *
 * The reader works on a duplicate of the stdin descriptor.
 *
 * @param shift the quality shift stored in the reader
 * @return a pointer to the reader, or NULL on failure
 */
fast_kseq_p open_fast_sek_stdin(int shift) {
    int fd = fileno(stdin);

    return open_fast_sek_fd(fd, true, shift);
}
/**
 * @brief read the next record of the fast* stream
 *
 * The record itself is available through iterator->seq. The finished flag
 * of the reader is raised as soon as kseq_read reports the end of the
 * stream or an error (negative value); an empty sequence (length 0) does
 * not mark the stream as finished.
 *
 * @param iterator a reader created by one of the open_fast_sek_* functions
 * @return the value of kseq_read, except that a strictly positive length is
 *         replaced by the current offset in the compressed file
 *         (gzoffset); -3 when called with a NULL reader
 */
int64_t next_fast_sek(fast_kseq_t* iterator) {
    int64_t l;

    if (iterator == NULL || iterator->seq == NULL)
        return -3;

    l = kseq_read(iterator->seq);
    iterator->finished = l < 0;

    if (l > 0)
        l = gzoffset(iterator->filez);

    return l;
}
/**
 * @brief restart the reading from the beginning of the file
 *
 * The underlying gzFile is rewound before the kseq buffer is reset;
 * resetting the buffer alone would let the next read continue from the
 * current file position.
 *
 * @param iterator a reader created by one of the open_fast_sek_* functions
 * @return 0 on success, -1 when the file cannot be rewound, -3 when called
 *         with a NULL reader
 */
int rewind_fast_sek(fast_kseq_t* iterator) {
    if (iterator == NULL || iterator->seq == NULL)
        return -3;

    if (iterator->filez != Z_NULL && gzrewind(iterator->filez) != 0)
        return -1;

    kseq_rewind(iterator->seq);
    iterator->finished = false;

    return 0;
}
/**
 * @brief release a fast* reader and close its file
 *
 * The members are copied to locals before the reader is freed, so the freed
 * structure is never read.
 *
 * @param iterator a reader created by one of the open_fast_sek_* functions
 * @return the value returned by gzclose, or -3 when iterator is NULL or
 *         holds no file
 */
int close_fast_sek(fast_kseq_t* iterator) {
    gzFile fp;
    kseq_t *seq;
    int rep = -3;

    if (iterator == NULL)
        return rep;

    fp = iterator->filez;
    seq = iterator->seq;

    free(iterator);

    if (seq != NULL)
        kseq_destroy(seq);

    if (fp != Z_NULL)
        rep = gzclose(fp);

    return rep;
}

View File

@ -0,0 +1,153 @@
package obiformats
// #cgo CFLAGS: -g -Wall
// #cgo LDFLAGS: -lz
// #include <stdlib.h>
// #include "fastseq_read.h"
import "C"
import (
"errors"
"fmt"
"log"
"os"
"time"
"unsafe"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/cutils"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __fastseq_reader__ reads every record available from seqfile through the C
// reader and sends them to iterator in batches of batch_size sequences,
// numbered from 0. The last batch may be smaller. Quality scores are stored
// after subtracting the shift kept in seqfile. iterator.Done() is called
// when the C reader stops returning a strictly positive value.
func __fastseq_reader__(seqfile C.fast_kseq_p,
	iterator obiseq.IBioSequenceBatch,
	batch_size int) {

	order := 0
	pending := make(obiseq.BioSequenceSlice, 0, batch_size)

	for status := int64(C.next_fast_sek(seqfile)); status > 0; status = int64(C.next_fast_sek(seqfile)) {
		record := seqfile.seq

		// The C buffers are reused by the next read: copy them to Go memory.
		nucleotides := C.GoBytes(unsafe.Pointer(record.seq.s),
			C.int(record.seq.l))
		name := C.GoString(record.name.s)

		comment := ""
		if record.comment.l > C.ulong(0) {
			comment = C.GoString(record.comment.s)
		}

		rep := obiseq.MakeBioSequence(name, nucleotides, comment)

		if record.qual.l > C.ulong(0) {
			raw := cutils.ByteSlice(unsafe.Pointer(record.qual.s), int(record.qual.l))
			scores := make(obiseq.Quality, record.qual.l)
			shift := uint8(seqfile.shift)
			for j := range scores {
				scores[j] = uint8(raw[j]) - shift
			}
			rep.SetQualities(scores)
		}

		pending = append(pending, rep)

		if len(pending) >= batch_size {
			iterator.Channel() <- obiseq.MakeBioSequenceBatch(order, pending...)
			pending = make(obiseq.BioSequenceSlice, 0, batch_size)
			order++
		}
	}

	if len(pending) > 0 {
		iterator.Channel() <- obiseq.MakeBioSequenceBatch(order, pending...)
	}

	iterator.Done()
}
// ReadFastSeqBatchFromFile opens the FastA/FastQ file filename, gzipped or
// not, and returns an iterator over batches of its sequences. When the
// options provide a header parser, it is applied to every sequence.
//
// An error is returned only when the file cannot be opened. The file size
// is logged for information, and a failure to obtain it is not reported to
// the caller.
func ReadFastSeqBatchFromFile(filename string, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
	opt := MakeOptions(options)

	name := C.CString(filename)
	defer C.free(unsafe.Pointer(name))

	pointer := C.open_fast_sek_file(name, C.int32_t(opt.QualityShift()))

	if pointer == nil {
		err := errors.New(fmt.Sprintf("Cannot open file %s", filename))
		return obiseq.NilIBioSequenceBatch, err
	}

	if fi, serr := os.Stat(filename); serr == nil {
		log.Printf("File size of %s is %d bytes\n", filename, fi.Size())
	}

	new_iter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		for len(new_iter.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(new_iter.Channel())
		log.Println("End of the fastq file reading")
	}()

	log.Println("Start of the fastq file reading")
	go __fastseq_reader__(pointer, new_iter, opt.BatchSize())

	parser := opt.ParseFastSeqHeader()
	if parser != nil {
		return IParseFastSeqHeaderBatch(new_iter, options...), nil
	}

	return new_iter, nil
}
// ReadFastSeqFromFile opens the FastA/FastQ file filename and returns an
// iterator over its sequences. On error the zero value of the iterator is
// returned with the error; the nil batch iterator is never sorted or
// converted.
func ReadFastSeqFromFile(filename string, options ...WithOption) (obiseq.IBioSequence, error) {
	ib, err := ReadFastSeqBatchFromFile(filename, options...)
	if err != nil {
		var none obiseq.IBioSequence
		return none, err
	}
	return ib.SortBatches().IBioSequence(), nil
}
// ReadFastSeqBatchFromStdin returns an iterator over batches of the
// FastA/FastQ sequences read from the standard input. The iterator channel
// is closed once the reading goroutine is done.
func ReadFastSeqBatchFromStdin(options ...WithOption) obiseq.IBioSequenceBatch {
	opt := MakeOptions(options)
	batches := obiseq.MakeIBioSequenceBatch(opt.BufferSize())

	batches.Add(1)
	go func() {
		batches.Wait()
		close(batches.Channel())
	}()

	source := C.open_fast_sek_stdin(C.int32_t(opt.QualityShift()))
	go __fastseq_reader__(source, batches, opt.BatchSize())

	return batches
}
// ReadFastSeqFromStdin returns an iterator over the FastA/FastQ sequences
// read from the standard input, built from the sorted batches produced by
// ReadFastSeqBatchFromStdin.
func ReadFastSeqFromStdin(options ...WithOption) obiseq.IBioSequence {
	batches := ReadFastSeqBatchFromStdin(options...)
	sorted := batches.SortBatches()
	return sorted.IBioSequence()
}

View File

@ -0,0 +1,41 @@
#ifndef _READ_H
#define _READ_H
#include <zlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "kseq/kseq.h"
KSEQ_INIT(gzFile, gzread)
/**
 * @brief state of a fast* (FastA/FastQ) reader
 */
typedef struct {
kseq_t *seq; /* kseq parser; holds the last record read */
bool finished; /* flag updated by next_fast_sek after each read */
int16_t shift; /* quality shift given when the reader was opened */
gzFile filez; /* the underlying (possibly gzipped) file */
} fast_kseq_t, *fast_kseq_p;
/* Open a reader on a file path, on a file descriptor or on stdin.
 * Each returns NULL on failure. */
fast_kseq_t* open_fast_sek_file(const char* filename, int shift);
fast_kseq_t* open_fast_sek_fd(int fd, bool keep_open, int shift);
fast_kseq_t* open_fast_sek_stdin(int shift);
/**
 * @brief read the next sequence on the fast* stream
 *
 * @param iterator a fast_kseq_t* created using one of the open_fast_sek_*
 * functions
 * @return int64_t if greater than 0, the current offset in the
 * compressed file (the positive length returned by kseq_read
 * is replaced by gzoffset), otherwise the value of kseq_read
 * or an error
 * - 0 : a record with an empty sequence was read
 * - -1 : no more sequence in the stream
 * - -2 : too short quality sequence
 * - -3 : called with NULL pointer
 */
int64_t next_fast_sek(fast_kseq_t* iterator);
/* Free the reader and close its file. Returns the value of gzclose, or -3
 * when iterator is NULL or holds no file. */
int close_fast_sek(fast_kseq_t* iterator);
/* Reset the reader state. Returns 0, or -3 when called with a NULL
 * pointer. */
int rewind_fast_sek(fast_kseq_t* iterator);
#endif

View File

@ -0,0 +1,164 @@
package obiformats
import (
"bytes"
"fmt"
"io"
"log"
"os"
"strings"
"sync"
"time"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// min returns the smaller of the two integers x and y.
func min(x, y int) int {
	smallest := y
	if x < y {
		smallest = x
	}
	return smallest
}
// FormatFasta returns seq formatted as a FastA record without trailing end
// of line: a ">id info definition" title line followed by the sequence
// folded in lines of 60 symbols.
//
// info is the text produced by formater, or the empty string when formater
// is nil. An empty sequence yields a record with an empty sequence part.
func FormatFasta(seq obiseq.BioSequence, formater FormatHeader) string {
	var folded strings.Builder

	s := seq.Sequence()
	l := len(s)

	folded.Grow(l + int(l/60) + 10)

	for i := 0; i < l; i += 60 {
		// Lines are separated, not terminated, by an end of line.
		if i > 0 {
			folded.WriteByte('\n')
		}
		to := min(i+60, l)
		folded.WriteString(string(s[i:to]))
	}

	info := ""
	if formater != nil {
		info = formater(seq)
	}

	return fmt.Sprintf(">%s %s %s\n%s",
		seq.Id(), info,
		seq.Definition(),
		folded.String())
}
// FormatFastaBatch returns the FastA records of every sequence of batch,
// each one terminated by an end of line.
func FormatFastaBatch(batch obiseq.BioSequenceBatch, formater FormatHeader) []byte {
	var text bytes.Buffer

	sequences := batch.Slice()
	for i := range sequences {
		text.WriteString(FormatFasta(sequences[i], formater))
		text.WriteByte('\n')
	}

	return text.Bytes()
}
// WriteFasta writes every sequence of iterator to file in FastA format,
// using the header formatter selected by the options. It always returns
// nil.
func WriteFasta(iterator obiseq.IBioSequence, file io.Writer, options ...WithOption) error {
	header_format := MakeOptions(options).FormatFastSeqHeader()

	for iterator.Next() {
		record := FormatFasta(iterator.Get(), header_format)
		fmt.Fprintln(file, record)
	}

	return nil
}
// WriteFastaToFile writes every sequence of iterator to the file filename
// in FastA format. The file is created, or truncated when it already
// exists, and is closed before returning. The program is terminated through
// log.Fatalf when the file cannot be created.
//
// The returned error is the one of WriteFasta or, failing that, the one
// reported when closing the file.
func WriteFastaToFile(iterator obiseq.IBioSequence,
	filename string,
	options ...WithOption) error {

	file, err := os.Create(filename)

	if err != nil {
		log.Fatalf("open file error: %v", err)
		return err
	}

	err = WriteFasta(iterator, file, options...)

	// WriteFasta is synchronous: the file can be closed as soon as it returns.
	if cerr := file.Close(); err == nil {
		err = cerr
	}

	return err
}
// WriteFastaToStdout writes every sequence of iterator to the standard
// output in FastA format.
func WriteFastaToStdout(iterator obiseq.IBioSequence, options ...WithOption) error {
	var stdout io.Writer = os.Stdout
	return WriteFasta(iterator, stdout, options...)
}
// WriteFastaBatch writes every batch of iterator to file in FastA format.
// The batches are formatted by four goroutines and written by the calling
// goroutine in increasing batch order. Chunks arriving early are kept in
// memory until their turn comes.
//
// The function returns once every batch has been consumed and written. The
// first write error is returned; after an error the remaining batches are
// still consumed, so the formatting goroutines always terminate, but
// nothing more is written.
func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options ...WithOption) error {
	opt := MakeOptions(options)
	nwriters := 4

	chunkchan := make(chan FileChunck)
	chunkwait := sync.WaitGroup{}

	header_format := opt.FormatFastSeqHeader()

	chunkwait.Add(nwriters)

	// Close the chunk channel once every formatter is done, following the
	// same wait-then-close sequence as WriteFastqBatch.
	go func() {
		chunkwait.Wait()
		for len(chunkchan) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(chunkchan)
	}()

	ff := func(iterator obiseq.IBioSequenceBatch) {
		defer chunkwait.Done()
		for iterator.Next() {
			batch := iterator.Get()
			chunkchan <- FileChunck{
				FormatFastaBatch(batch, header_format),
				batch.Order(),
			}
		}
	}

	for i := 0; i < nwriters; i++ {
		go ff(iterator.Split())
	}

	var werr error
	write := func(chunk FileChunck) {
		if werr == nil {
			_, werr = file.Write(chunk.text)
		}
	}

	next_to_send := 0
	received := make(map[int]FileChunck, 100)

	for chunk := range chunkchan {
		if chunk.order != next_to_send {
			// Arrived too early: keep it until its predecessors are written.
			received[chunk.order] = chunk
			continue
		}

		write(chunk)
		next_to_send++

		// Flush the chunks that were waiting for the one just written.
		for pending, ok := received[next_to_send]; ok; pending, ok = received[next_to_send] {
			write(pending)
			delete(received, next_to_send)
			next_to_send++
		}
	}

	return werr
}
// WriteFastaBatchToStdout writes every batch of iterator to the standard
// output in FastA format.
func WriteFastaBatchToStdout(iterator obiseq.IBioSequenceBatch, options ...WithOption) error {
	var stdout io.Writer = os.Stdout
	return WriteFastaBatch(iterator, stdout, options...)
}
// WriteFastaBatchToFile writes every batch of iterator to the file filename
// in FastA format. The file is created, or truncated when it already
// exists. The program is terminated through log.Fatalf when the file cannot
// be created.
func WriteFastaBatchToFile(iterator obiseq.IBioSequenceBatch,
	filename string,
	options ...WithOption) error {

	output, err := os.Create(filename)

	if err == nil {
		return WriteFastaBatch(iterator, output, options...)
	}

	log.Fatalf("open file error: %v", err)
	return err
}

View File

@ -0,0 +1,168 @@
package obiformats
import (
"bytes"
"fmt"
"io"
"log"
"os"
"time"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// FormatFastq returns seq formatted as a four-line FastQ record without
// trailing end of line. Each quality score is encoded as the byte
// score + quality_shift. info is the text produced by formater, or the
// empty string when formater is nil.
func FormatFastq(seq obiseq.BioSequence, quality_shift int, formater FormatHeader) string {
	n := seq.Length()
	scores := seq.Qualities()
	encoded := make([]byte, seq.Length())

	for pos := 0; pos < n; pos++ {
		encoded[pos] = uint8(scores[pos]) + uint8(quality_shift)
	}

	var info string
	if formater != nil {
		info = formater(seq)
	}

	return fmt.Sprintf("@%s %s %s\n%s\n+\n%s",
		seq.Id(), info,
		seq.Definition(),
		string(seq.Sequence()),
		string(encoded),
	)
}
// FormatFastqBatch returns the FastQ records of every sequence of batch,
// each one terminated by an end of line.
func FormatFastqBatch(batch obiseq.BioSequenceBatch, quality_shift int,
	formater FormatHeader) []byte {
	var text bytes.Buffer

	sequences := batch.Slice()
	for i := range sequences {
		text.WriteString(FormatFastq(sequences[i], quality_shift, formater))
		text.WriteByte('\n')
	}

	return text.Bytes()
}
// WriteFastq writes every sequence of iterator to file in FastQ format,
// using the header formatter and the quality shift selected by the options.
// It always returns nil.
func WriteFastq(iterator obiseq.IBioSequence, file io.Writer, options ...WithOption) error {
	opt := MakeOptions(options)
	header_format := opt.FormatFastSeqHeader()
	shift := opt.QualityShift()

	for iterator.Next() {
		record := FormatFastq(iterator.Get(), shift, header_format)
		fmt.Fprintln(file, record)
	}

	return nil
}
// WriteFastqToFile writes every sequence of iterator to the file filename
// in FastQ format. The file is created, or truncated when it already
// exists, and is closed before returning. The program is terminated through
// log.Fatalf when the file cannot be created.
//
// The returned error is the one of WriteFastq or, failing that, the one
// reported when closing the file.
func WriteFastqToFile(iterator obiseq.IBioSequence,
	filename string,
	options ...WithOption) error {

	file, err := os.Create(filename)

	if err != nil {
		log.Fatalf("open file error: %v", err)
		return err
	}

	err = WriteFastq(iterator, file, options...)

	// WriteFastq is synchronous: the file can be closed as soon as it returns.
	if cerr := file.Close(); err == nil {
		err = cerr
	}

	return err
}
// WriteFastqToStdout writes every sequence of iterator to the standard
// output in FastQ format.
func WriteFastqToStdout(iterator obiseq.IBioSequence, options ...WithOption) error {
	var stdout io.Writer = os.Stdout
	return WriteFastq(iterator, stdout, options...)
}
// FileChunck is a block of formatted text waiting to be written, together
// with the order of the batch it was formatted from.
type FileChunck struct {
// text is the formatted content of the batch.
text []byte
// order is the rank of the batch, used to write the chunks in sequence.
order int
}
// WriteFastqBatch writes every batch of iterator to file in FastQ format
// and returns a new iterator over the same batches.
//
// Four goroutines each consume a split of iterator: they format the batch,
// send the text to the writing goroutine and forward the batch to the
// returned iterator. A single goroutine writes the chunks in increasing
// batch order, keeping the chunks that arrive early in a map. The function
// itself returns immediately; the returned error is always nil.
func WriteFastqBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
buffsize := iterator.BufferSize()
new_iter := obiseq.MakeIBioSequenceBatch(buffsize)
opt := MakeOptions(options)
// Number of formatting goroutines.
nwriters := 4
chunkchan := make(chan FileChunck)
header_format := opt.FormatFastSeqHeader()
quality := opt.QualityShift()
new_iter.Add(nwriters)
// Once every formatter has called Done, close the chunk channel, which
// ends the writing goroutine, then the channel of the returned iterator.
go func() {
new_iter.Wait()
for len(chunkchan) > 0 {
time.Sleep(time.Millisecond)
}
close(chunkchan)
for len(new_iter.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
close(new_iter.Channel())
}()
// ff is the body of a formatting goroutine.
ff := func(iterator obiseq.IBioSequenceBatch) {
for iterator.Next() {
batch := iterator.Get()
chunkchan <- FileChunck{
FormatFastqBatch(batch, quality, header_format),
batch.Order(),
}
// Forward the batch only after its text was handed to the writer.
new_iter.Channel() <- batch
}
new_iter.Done()
}
log.Println("Start of the fastq file reading")
for i := 0; i < nwriters; i++ {
go ff(iterator.Split())
}
// Both variables below are only accessed by the writing goroutine.
next_to_send := 0
received := make(map[int]FileChunck, 100)
go func() {
for chunk := range chunkchan {
if chunk.order == next_to_send {
file.Write(chunk.text)
next_to_send++
// Flush the chunks that were waiting for the one just written.
chunk, ok := received[next_to_send]
for ok {
file.Write(chunk.text)
delete(received, next_to_send)
next_to_send++
chunk, ok = received[next_to_send]
}
} else {
// Arrived too early: keep it until its predecessors are written.
received[chunk.order] = chunk
}
}
}()
return new_iter, nil
}
// WriteFastqBatchToStdout writes every batch of iterator to the standard
// output in FastQ format and returns the iterator produced by
// WriteFastqBatch.
func WriteFastqBatchToStdout(iterator obiseq.IBioSequenceBatch, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
	var stdout io.Writer = os.Stdout
	return WriteFastqBatch(iterator, stdout, options...)
}
// WriteFastqBatchToFile writes every batch of iterator to the file filename
// in FastQ format and returns the iterator produced by WriteFastqBatch. The
// file is created, or truncated when it already exists. The program is
// terminated through log.Fatalf when the file cannot be created.
func WriteFastqBatchToFile(iterator obiseq.IBioSequenceBatch,
	filename string,
	options ...WithOption) (obiseq.IBioSequenceBatch, error) {

	output, err := os.Create(filename)

	if err == nil {
		return WriteFastqBatch(iterator, output, options...)
	}

	log.Fatalf("open file error: %v", err)
	return obiseq.NilIBioSequenceBatch, err
}

View File

@ -0,0 +1,5 @@
# Neither target produces a file bearing its own name.
.PHONY: all clean

# Build the kseq_test example program (linked against zlib).
all: kseq.h kseq_test.c
	$(CC) -g -O2 kseq_test.c -o kseq_test -lz

# Remove the build products.
clean:
	rm -f *.o kseq_test

223
pkg/obiformats/kseq/kseq.h Normal file
View File

@ -0,0 +1,223 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <lh3@sanger.ac.uk> */
/* Last Modified: 12APR2009 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
#define KS_SEP_MAX 1
/* Declare kstream_t, a buffered reader over a file handle of type type_t:
   buf is the read buffer, [begin, end) the part of it not consumed yet,
   is_eof is set once a read returned fewer bytes than requested, and f is
   the underlying file handle. */
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
char *buf; \
int begin, end, is_eof; \
type_t f; \
} kstream_t;
/* True once the end of file was reached and the buffer is fully consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Reset the buffer state of the stream; the file handle itself is not
   touched. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
/* Define ks_init, which allocates a zeroed stream bound to the file handle f
   with a read buffer of __bufsize bytes, and ks_destroy, which frees the
   buffer and the stream (a NULL stream is accepted). The file handle is not
   closed by ks_destroy. Allocation results are not checked. */
#define __KS_BASIC(type_t, __bufsize) \
static inline kstream_t *ks_init(type_t f) \
{ \
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
ks->f = f; \
ks->buf = (char*)malloc(__bufsize); \
return ks; \
} \
static inline void ks_destroy(kstream_t *ks) \
{ \
if (ks) { \
free(ks->buf); \
free(ks); \
} \
}
/* Define ks_getc, which returns the next byte of the stream as a value in
   0..255, or -1 at end of file. The buffer is refilled with __read when it
   is exhausted; a read error (negative result) is handled like an end of
   file. The byte is converted through unsigned char so that bytes above
   0x7f are never negative and 0xff is not mistaken for the -1 marker. */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* nothing read, or read error */ \
				ks->end = 0; \
				return -1; \
			} \
		} \
		return (int)(unsigned char)ks->buf[ks->begin++]; \
	}
/* kstring_t is a growable string: l is its current length, m the size of
   the allocated buffer and s the buffer itself. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
/* Round x up, in place, to the next power of two. The shifts stop at 16,
   so only the low 32 bits of x are covered. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/* Define ks_getuntil, which reads the stream up to, and not including, the
   next delimiter and stores the text read, null terminated, in str.
   delimiter is KS_SEP_SPACE (any isspace character), KS_SEP_TAB (any
   isspace character except ' ') or a single character greater than
   KS_SEP_MAX. When dret is not NULL it receives the delimiter found, or 0
   when the end of file was reached first. Returns the length of the text
   read, or -1 when the stream is already at end of file. */
#define __KS_GETUNTIL(__read, __bufsize) \
static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
{ \
if (dret) *dret = 0; \
str->l = 0; \
if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
if (ks->end < __bufsize) ks->is_eof = 1; \
if (ks->end == 0) break; \
} else break; \
} \
if (delimiter > KS_SEP_MAX) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == delimiter) break; \
} else if (delimiter == KS_SEP_SPACE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i])) break; \
} else if (delimiter == KS_SEP_TAB) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
} else i = 0; /* never come to here! */ \
if (str->m - str->l < i - ks->begin + 1) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
str->s = (char*)realloc(str->s, str->m); \
} \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
if (i < ks->end) { \
if (dret) *dret = ks->buf[i]; \
break; \
} \
} \
if (str->l == 0) { \
str->m = 1; \
str->s = (char*)calloc(1, 1); \
} \
str->s[str->l] = '\0'; \
return str->l; \
}
/* Instantiate the kstream_t type and its functions (ks_init, ks_destroy,
   ks_getc, ks_getuntil) for the file handle type type_t, read through
   __read with a buffer of __bufsize bytes. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
__KS_BASIC(type_t, __bufsize) \
__KS_GETC(__read, __bufsize) \
__KS_GETUNTIL(__read, __bufsize)
/* Define kseq_init, which allocates a zeroed kseq_t reading from the file
   handle fd, kseq_rewind, which resets the parser and buffer state without
   touching the file handle, and kseq_destroy, which frees the strings, the
   stream and the kseq_t itself (a NULL pointer is accepted). The file handle
   is not closed by kseq_destroy. */
#define __KSEQ_BASIC(type_t) \
static inline kseq_t *kseq_init(type_t fd) \
{ \
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
return s; \
} \
static inline void kseq_rewind(kseq_t *ks) \
{ \
ks->last_char = 0; \
ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
} \
static inline void kseq_destroy(kseq_t *ks) \
{ \
if (!ks) return; \
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
ks_destroy(ks->f); \
free(ks); \
}
/* Define kseq_read, which reads the next FASTA/FASTQ record of the stream
   into seq (name, comment, seq and, for FASTQ, qual).
   Return value:
     >=0  length of the sequence (normal)
     -1   end-of-file
     -2   truncated quality string
   The sequence buffer is always allocated before being null terminated, so
   a record with an empty sequence is returned with length 0 even when it is
   the first record read.
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the first header char has been read */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (isgraph(c)) { /* printable non-space character */ \
				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
					seq->seq.m = seq->seq.l + 2; \
					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
				} \
				seq->seq.s[seq->seq.l++] = (char)c; \
			} \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* room for the terminator, also when the sequence is empty */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return seq->seq.l; \
	}
/* Declare kseq_t, the FASTA/FASTQ parser state: the name, comment, sequence
   and quality strings of the last record read, the header character ('>'
   or '@') already consumed for the next record (0 if none), and the
   underlying stream. */
#define __KSEQ_TYPE(type_t) \
typedef struct { \
kstring_t name, comment, seq, qual; \
int last_char; \
kstream_t *f; \
} kseq_t;
/* Instantiate the whole parser (stream with a 4096-byte buffer, kseq_t,
   kseq_init, kseq_rewind, kseq_destroy and kseq_read) for the file handle
   type type_t read through __read, e.g. KSEQ_INIT(gzFile, gzread). */
#define KSEQ_INIT(type_t, __read) \
KSTREAM_INIT(type_t, __read, 4096) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(type_t) \
__KSEQ_READ
#endif

BIN
pkg/obiformats/kseq/kseq_test Executable file

Binary file not shown.

View File

@ -0,0 +1,27 @@
#include <zlib.h>
#include <stdio.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
/* Print the name, comment, sequence and quality of every record of the
   FASTA/FASTQ file given as first argument, followed by the last value
   returned by kseq_read. Returns 1 when no file is given or when the file
   cannot be opened, 0 otherwise. */
int main(int argc, char *argv[])
{
	gzFile fp;
	kseq_t *seq;
	int l;
	if (argc == 1) {
		fprintf(stderr, "Usage: %s <in.seq>\n", argv[0]);
		return 1;
	}
	fp = gzopen(argv[1], "r");
	if (fp == NULL) { /* kseq_read must not be called on a missing file */
		fprintf(stderr, "Cannot open file %s\n", argv[1]);
		return 1;
	}
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		printf("name: %s\n", seq->name.s);
		if (seq->comment.l) printf("comment: %s\n", seq->comment.s);
		printf("seq: %s\n", seq->seq.s);
		if (seq->qual.l) printf("qual: %s\n", seq->qual.s);
	}
	printf("return value: %d\n", l);
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>English</string>
<key>CFBundleIdentifier</key>
<string>com.apple.xcode.dsym.kseq_test</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundlePackageType</key>
<string>dSYM</string>
<key>CFBundleSignature</key>
<string>????</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleVersion</key>
<string>1</string>
</dict>
</plist>

View File

@ -0,0 +1,291 @@
>HWI-D00393:103:C6KCUANXX:2:2309:18209:70743_CONS_SUB_SUB reverse_score=72.0; count=2; direction=forward; experiment=australie; seq_a_mismatch=0; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; status=full; seq_a_deletion=0; seq_length=98; start=aaaac; merged_sample={'AN5-30_b': 1, 'AML-33_b': 1}; seq_a_insertion=0; mode=alignment; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
aaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagagtt
ggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2209:18639:37342_CONS_SUB_SUB_CMP ali_length=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-45_b': 1, 'AN2-30_a': 1}; forward_score=72.0; seq_b_mismatch=0; start=ataaa; experiment=australie; seq_a_single=33; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ataaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2115:3400:66119_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-03_a': 1}; forward_score=72.0; score=367.38470594; seq_a_mismatch=0; forward_tag=gtacgact; seq_b_mismatch=0; start=ccaaa; experiment=australie; mid_quality=62.0797101449; avg_quality=58.9050632911; seq_a_single=33; score_norm=3.99331202109; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=gtgtacat; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=03_03A; seq_b_single=33;
ccaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2310:20070:75862_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=23.1; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-12_b': 1}; forward_score=72.0; score=360.189837214; seq_a_mismatch=0; forward_tag=catcagtc; seq_b_mismatch=0; start=ccaaa; experiment=australie; mid_quality=53.0507246377; avg_quality=49.6265822785; seq_a_single=33; score_norm=3.91510692624; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgagtcgt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=28.9; position=11_11D; seq_b_single=33;
ccaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggaataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2114:5633:21903_CONS_SUB_SUB merged_sample={'ABR-15_a': 1, 'AN5-12_b': 1, 'AW2-35_b': 1}; forward_score=72.0; seq_b_insertion=0; seq_a_insertion=0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=3; seq_length=100; start=ccaaa; experiment=australie; reverse_score=72.0; mode=alignment; status=full; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ccaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1112:5602:81492_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-12_b': 1}; forward_score=72.0; score=367.699098517; seq_a_mismatch=0; forward_tag=catcagtc; seq_b_mismatch=0; start=ccaaa; experiment=australie; mid_quality=62.2463768116; avg_quality=59.0506329114; seq_a_single=33; score_norm=3.9967293317; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgagtcgt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_11D; seq_b_single=33;
ccaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtcttgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1101:4074:21538_CONS_SUB_SUB merged_sample={'AN4-36_b': 51, 'AW2-04_b': 11, 'AW2-04_a': 10, 'AN4-38_b': 158, 'AN5-43_b': 2, 'ADR-14_b': 2, 'AW2-34_b': 10, 'AW2-34_a': 1, 'AN3-38_b': 1, 'ABR-30_b': 1, 'AW2-36_b': 17, 'AN2-15_a': 11, 'AN4-34_b': 44, 'AN5-45_b': 5, 'AW2-27_b': 1, 'AW2-27_a': 3, 'ADR-25_a': 1, 'AN4-19_b': 30, 'ABR-44_a': 2, 'ABR-31_b': 1, 'AN2-20_b': 1, 'AN1-43_a': 2, 'AN2-13_b': 21, 'AN5-11_b': 2, 'AN4-22_b': 16, 'AN3-14_b': 1, 'AN3-14_a': 3, 'AW2-35_b': 101, 'AN2-19_b': 1, 'ABR-33_a': 1, 'ABR-29_a': 11, 'AW1-40_b': 9, 'ADR-13_a': 5, 'ADR-06_b': 1, 'AN4-26_a': 1, 'AN5-15_b': 20, 'AN2-01_a': 5, 'ADR-21_b': 1, 'AW2-23_b': 29, 'AW2-23_a': 3, 'ADR-13_b': 1, 'AW2-21_a': 1, 'AWO-27_b': 1, 'AW2-21_b': 20, 'AN2-10_a': 9, 'AN5-13_b': 19, 'AN4-20_b': 1, 'AN2-30_a': 1, 'AN3-22_b': 4, 'AN3-25_b': 3, 'AN3-25_a': 5, 'AN4-24_b': 26, 'ABR-15_a': 34, 'AW2-36_a': 5, 'AW2-12_b': 7, 'AW2-12_a': 5, 'AN5-48_a': 1, 'AW1-06_a': 6, 'AN5-21_a': 1, 'AN5-21_b': 20, 'AN2-09_b': 11, 'ABR-13_b': 2, 'ABR-13_a': 23, 'AN2-09_a': 7, 'AN3-20_b': 2, 'AN2-07_a': 4, 'AN2-07_b': 8, 'AN4-44_b': 26, 'AN4-21_b': 19, 'AW2-03_a': 28, 'AN4-42_b': 30, 'AW2-03_b': 47, 'ADR-26_b': 1, 'AN1-03_a': 6, 'AML-47_a': 1, 'AWO-30_b': 1, 'AN1-03_b': 3, 'ADR-30_b': 1, 'AN4-37_b': 40, 'AW2-07_b': 43, 'AN4-40_a': 1, 'AN3-44_b': 1, 'AN4-40_b': 36, 'AW2-01_b': 32, 'AW2-01_a': 13, 'AN5-10_b': 64, 'AW1-11_a': 29, 'AN2-43_a': 1, 'AN2-43_b': 2, 'AN1-17_a': 6, 'AN5-37_b': 1, 'blk-12_b': 1, 'ABR-35_b': 1, 'blk-02_b': 1, 'AN2-45_b': 8, 'AN3-40_a': 1, 'AW1-17_a': 5, 'AN2-45_a': 7, 'AW2-17_b': 4, 'ABR-25_a': 1, 'ABR-08_b': 1, 'AN5-14_b': 6, 'AWO-10_b': 1, 'AN2-38_a': 3, 'AN5-33_b': 15, 'AN2-12_a': 7, 'blk-04_b': 1, 'AN3-08_b': 4, 'AN3-01_b': 2, 'AN5-12_b': 106, 'ABR-23_a': 2, 'AN3-42_b': 19, 'AN4-23_b': 96, 'AN2-35_b': 1, 'ABR-02_a': 7, 'AN5-35_a': 1, 'ABR-17_a': 5, 'AML-42_a': 1, 'ADR-10_b': 1, 'ABR-39_a': 14, 'AN3-20_a': 3, 'pos-01_a': 1, 'AN5-03_b': 13, 'AN3-05_a': 1, 'AWO-42_b': 1, 
'ABR-14_b': 1, 'AW2-13_b': 2, 'AW2-13_a': 5, 'blk-06_b': 2, 'AML-24_a': 1, 'ADR-12_b': 1, 'ABR-09_a': 3, 'ABR-12_b': 1, 'AN3-03_b': 3, 'AN3-03_a': 4, 'ABR-12_a': 34, 'ABR-27_b': 1, 'AN4-11_b': 15, 'AN5-22_b': 11, 'AN3-47_b': 2, 'AN3-47_a': 1, 'AN4-13_b': 35, 'AN2-41_b': 1, 'AN4-41_b': 14, 'AN4-22_a': 1, 'AN2-02_b': 5, 'ADR-37_b': 1, 'AN4-15_b': 2, 'AN5-20_a': 1, 'AN5-20_b': 18, 'AN3-45_b': 4, 'AW2-40_b': 20, 'AN5-09_b': 2, 'AN5-09_a': 1, 'AW2-40_a': 5, 'AN4-47_b': 11, 'AWO-19_b': 1, 'ABR-41_a': 12, 'AW1-32_b': 1, 'AW1-30_a': 15, 'AN4-05_a': 1, 'AN3-41_a': 1, 'AN3-46_b': 2, 'AN5-24_b': 35, 'ABR-43_a': 1, 'AW1-34_b': 1, 'AN5-30_b': 20, 'AN5-30_a': 1, 'AW2-28_a': 10, 'AW2-28_b': 5, 'AN5-25_b': 16, 'AN4-01_a': 1, 'AN2-04_b': 2, 'AW2-30_a': 22, 'AML-19_a': 1, 'AN2-34_a': 2, 'AN2-38_b': 3, 'pos-06_a': 1, 'AN5-19_b': 4, 'blk-07_b': 1, 'ABR-47_a': 1, 'AN2-02_a': 7, 'AN3-43_b': 12, 'AW1-20_a': 2, 'AW2-39_a': 3, 'AW2-43_a': 1, 'AWO-41_a': 10, 'AN5-38_a': 1, 'AW2-43_b': 3, 'AN4-17_b': 12, 'AW2-07_a': 48, 'AN3-04_b': 2, 'AN4-35_b': 86, 'AW1-26_a': 8, 'AWO-34_b': 1, 'ABR-14_a': 13, 'AN2-13_a': 4, 'AN4-39_a': 1, 'ABR-01_a': 78, 'AN5-44_b': 1, 'AN4-39_b': 85, 'AW1-30_b': 4, 'AN2-31_b': 7, 'AN3-37_a': 1, 'AN4-12_b': 142, 'AN3-35_b': 12, 'ABR-42_a': 9, 'ABR-03_b': 1, 'AN3-17_b': 19, 'AML-08_a': 2, 'AW1-29_a': 4, 'AN2-05_b': 3, 'AN4-46_b': 39, 'AN2-05_a': 5, 'AN4-14_b': 4, 'AN5-23_b': 24, 'AN4-25_a': 1, 'AML-12_a': 1, 'AN3-34_a': 1, 'AN5-28_b': 31, 'AN3-34_b': 2, 'AN5-27_b': 3, 'ABR-32_b': 1, 'AWO-15_b': 1, 'ABR-46_a': 8, 'AW1-18_a': 76, 'AN3-13_b': 1, 'AN4-18_b': 2, 'AN4-24_a': 1, 'AWO-06_b': 1, 'AN5-42_b': 14, 'ABR-28_a': 2, 'AN2-40_a': 2, 'AW1-40_a': 9, 'AW2-35_a': 70, 'ABR-40_b': 2, 'AN2-10_b': 3, 'AN3-27_b': 3, 'ABR-44_b': 1, 'ADR-38_b': 1, 'AN3-19_b': 2, 'ABR-40_a': 22, 'AN4-06_b': 17, 'ADR-05_b': 1, 'AN2-12_b': 1, 'ABR-08_a': 6, 'AN5-41_b': 10, 'AWO-37_b': 1, 'AN4-29_b': 40, 'AW2-42_b': 3, 'AW1-27_a': 6, 'ADR-34_b': 1, 'AN4-45_b': 115, 'AWO-13_b': 1, 'AN5-34_b': 73, 
'ABR-22_b': 2, 'AN5-39_b': 2, 'AW2-42_a': 1, 'AN2-04_a': 4, 'AN4-43_b': 19, 'AW1-08_b': 1, 'AW1-08_a': 1, 'AW1-11_b': 1, 'AN4-48_a': 1, 'AML-04_a': 1, 'AML-41_a': 1, 'AW1-35_a': 2}; count=3190; seq_b_insertion=0; status=full; seq_a_deletion=0; seq_length=100; start=ctaaa; experiment=australie; seq_a_insertion=0; mode=alignment; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1102:20365:63690_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'ABR-13_a': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=tagtcgca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgctctcg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=01_09E; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgctcgttga
>HWI-D00393:103:C6KCUANXX:2:1109:5898:71477_CONS_SUB_SUB_CMP merged_sample={'AW2-04_a': 1, 'AN4-38_b': 1, 'AW1-07_b': 1, 'AN4-34_b': 5, 'AN4-19_b': 1, 'AN2-20_b': 1, 'AN2-13_b': 3, 'AN5-11_b': 1, 'AN3-14_a': 1, 'ABR-29_a': 1, 'AW2-23_b': 3, 'AW2-23_a': 1, 'AN4-28_b': 1, 'AW2-21_b': 3, 'AN4-24_b': 1, 'AN5-21_b': 1, 'ABR-13_a': 2, 'AN3-20_b': 1, 'ADR-11_b': 1, 'AN4-44_b': 1, 'AN3-46_b': 2, 'AN4-42_b': 1, 'AW2-03_b': 2, 'AN1-03_a': 1, 'AN4-37_b': 2, 'AW2-07_b': 3, 'AN4-40_b': 3, 'AW2-01_a': 1, 'AW1-11_a': 5, 'AN2-43_b': 2, 'pos-09_a': 1, 'ABR-25_a': 1, 'AN2-38_b': 1, 'AN3-08_a': 1, 'AN3-01_b': 1, 'AN5-12_b': 2, 'AN3-42_b': 3, 'AN4-23_b': 1, 'ABR-02_a': 1, 'ABR-17_a': 1, 'ABR-39_a': 1, 'ABR-14_a': 3, 'AW2-13_b': 1, 'ABR-12_a': 4, 'AN4-13_b': 2, 'AN2-41_b': 1, 'AN4-15_b': 1, 'AN5-20_b': 1, 'AW2-40_b': 3, 'ABR-41_a': 1, 'AN3-35_a': 1, 'AN3-43_b': 1, 'AN5-24_b': 2, 'AN5-30_b': 1, 'AN5-34_b': 3, 'AN5-19_b': 1, 'AWO-41_a': 1, 'AN5-38_a': 1, 'AW2-07_a': 1, 'AN4-35_b': 5, 'ABR-01_a': 4, 'AN4-39_b': 2, 'AN2-31_b': 1, 'AW1-30_a': 1, 'AN4-12_b': 2, 'ABR-42_a': 2, 'AN3-17_a': 1, 'AN3-17_b': 1, 'AN2-05_b': 1, 'AN4-46_b': 1, 'AN4-14_b': 4, 'AN5-23_b': 4, 'ABR-32_a': 1, 'ABR-46_a': 2, 'AW1-18_a': 4, 'AN4-18_b': 1, 'AW1-35_a': 1, 'AW2-35_b': 6, 'AW1-40_b': 3, 'AW2-35_a': 2, 'AN2-10_a': 2, 'AN2-02_a': 1, 'AN4-29_b': 1, 'AN4-45_b': 3, 'AW2-44_b': 1, 'ABR-22_b': 1, 'AW1-28_b': 1, 'AW2-42_b': 1, 'AN4-43_b': 2, 'AW1-27_a': 1}; count=159; status=full; seq_a_deletion=0; seq_length=100; start=ctaaa; experiment=australie; seq_a_insertion=0; mode=alignment; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1116:11515:15328_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-44_b': 1, 'AN4-39_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; head_quality=37.0; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttactaggttaaggtctcgtttgttaa
>HWI-D00393:103:C6KCUANXX:2:1206:17870:33853_CONS_SUB_SUB ali_length=92; seq_ab_match=92; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW1-30_a': 7}; forward_score=72.0; seq_a_mismatch=0; forward_tag=cgctctcg; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=gactgatg; goodAli=Alignement; count=7; seq_length=100; status=full; mode=alignment; position=02_12A; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcttaactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2106:7652:12042_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-40_b': 1, 'AW2-35_b': 1, 'ABR-01_a': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcgtaactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2115:4325:51738_CONS_SUB_SUB_CMP status=full; merged_sample={'AW1-29_a': 1, 'ABR-41_a': 1, 'AN2-02_a': 1, 'AN4-39_b': 1, 'AN2-07_a': 1}; seq_b_insertion=0; seq_a_insertion=0; seq_a_mismatch=0; forward_score=72.0; tail_quality=37.0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=5; seq_length=100; start=ctaaa; experiment=australie; reverse_score=72.0; mode=alignment; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgctaa
>HWI-D00393:103:C6KCUANXX:2:2211:17027:90962_CONS_SUB_SUB status=full; merged_sample={'AML-46_a': 1, 'AN4-46_b': 7}; seq_b_insertion=0; seq_a_insertion=0; seq_a_mismatch=0; forward_score=72.0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=8; seq_length=100; start=ctaaa; experiment=australie; reverse_score=72.0; mode=alignment; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcattcgttaa
>HWI-D00393:103:C6KCUANXX:2:1113:14380:43631_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-34_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=atgatcgc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_08B; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagtatgtaaaggtctcgatcgttat
>HWI-D00393:103:C6KCUANXX:2:1116:10928:36920_CONS_SUB_SUB_CMP ali_length=92; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1, 'AN5-25_b': 1, 'AW2-01_a': 1}; forward_score=72.0; seq_a_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgtttgttaa
>HWI-D00393:103:C6KCUANXX:2:1204:11153:34132_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1, 'AN4-39_b': 1, 'AN2-31_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; status=full; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; mode=alignment; head_quality=37.0; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactattcagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2106:2429:2922_CONS_SUB_SUB_CMP ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1}; forward_score=72.0; score=371.700731814; seq_a_mismatch=0; forward_tag=tctactga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.7299270073; avg_quality=59.4522292994; seq_a_single=32; score_norm=3.99678206251; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=99; status=full; mode=alignment; head_quality=37.0; position=11_08F; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2201:17654:97503_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=34.6; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-34_b': 1}; forward_score=72.0; score=307.231253197; seq_a_mismatch=0; forward_tag=tcagtgtc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=53.8115942029; avg_quality=51.4113924051; seq_a_single=33; score_norm=3.33947014344; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tcagtgtc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.1; position=12_02B; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgtgaa
>HWI-D00393:103:C6KCUANXX:2:2206:15790:49247_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-24_b': 1, 'AWO-41_a': 1, 'AW2-35_a': 1, 'AN4-35_b': 1, 'AW2-07_a': 1, 'AN3-04_b': 1, 'AN2-13_a': 1, 'AN4-22_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=8; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttagggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2301:12930:40102_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'ABR-15_a': 1, 'AN4-39_b': 1}; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttaacaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2315:17692:90977_CONS_SUB_SUB_CMP ali_length=93; seq_a_deletion=0; reverse_score=72.0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1, 'ABR-41_a': 1, 'AN3-17_b': 1}; forward_score=72.0; seq_a_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=32; status=full; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; goodAli=Alignement; count=3; seq_length=99; mode=alignment; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactattagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1207:9047:41800_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'ABR-13_a': 2, 'AN5-19_a': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgctcgttaa
>HWI-D00393:103:C6KCUANXX:2:2210:14421:16016_CONS_SUB_SUB_CMP reverse_score=66.0; count=7; direction=reverse; seq_b_insertion=0; experiment=australie; seq_a_mismatch=0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; seq_length=99; start=ctaaa; merged_sample={'AW2-12_b': 1, 'AN4-41_b': 1, 'AW2-07_a': 1, 'AN5-21_b': 1, 'AN2-15_a': 1, 'AN4-23_b': 1, 'AN4-37_b': 1}; seq_a_insertion=0; mode=alignment; status=full; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgtta
>HWI-D00393:103:C6KCUANXX:2:2211:13367:14752_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=32.6; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW1-18_a': 1}; forward_score=72.0; score=367.610399661; seq_a_mismatch=0; forward_tag=gtcacgtc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=61.0434782609; avg_quality=57.6329113924; seq_a_single=33; score_norm=3.9957652137; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=gtcgtaga; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.6; position=02_10C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaagatctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2307:6880:68904_CONS_SUB_SUB_CMP ali_length=81; seq_ab_match=56; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-37_b': 1}; forward_score=72.0; score=59.9384545346; seq_a_mismatch=1; forward_tag=atcagtca; seq_b_mismatch=22; start=ctaaa; experiment=australie; mid_quality=52.6666666667; avg_quality=50.6835443038; seq_a_single=33; score_norm=0.73998092018; reverse_score=72.0; direction=reverse; seq_b_insertion=2; seq_b_deletion=13; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_08E; seq_b_single=44;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaagtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1312:11682:89893_CONS_SUB_SUB_CMP ali_length=92; seq_a_deletion=0; reverse_score=72.0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-19_b': 1, 'AN5-10_b': 1, 'AN5-15_b': 1, 'AW2-34_b': 1, 'AN5-45_b': 1, 'AN4-40_b': 1}; forward_score=72.0; seq_a_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; status=full; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=6; seq_length=100; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcattaa
>HWI-D00393:103:C6KCUANXX:2:2109:18223:83128_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'ABR-15_a': 1, 'ABR-13_a': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgctctcg; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; head_quality=37.0; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttggcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2204:19781:26262_CONS_SUB_SUB_CMP ali_length=95; seq_ab_match=95; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'ABR-01_a': 1}; forward_score=72.0; score=379.691935178; seq_a_mismatch=0; forward_tag=gatcgcga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=63.5703703704; avg_quality=60.1419354839; seq_a_single=30; score_norm=3.9967572124; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=155; reverse_tag=actagatc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=01_07H; seq_b_single=30;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaagatctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2215:4598:28592_CONS_SUB_SUB_CMP status=full; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-40_b': 1, 'AW2-35_b': 1, 'AN5-03_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; goodAli=Alignement; count=3; seq_length=100; mode=alignment; head_quality=37.0;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggcctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2304:13327:3228_CONS_SUB_SUB merged_sample={'AN4-36_b': 1, 'AN4-29_b': 1, 'AW2-01_b': 1, 'AW2-35_a': 1, 'ABR-46_a': 1, 'AW1-18_a': 1, 'AW1-40_b': 1, 'AN4-24_b': 1, 'AW2-07_b': 2}; forward_score=72.0; seq_b_insertion=0; seq_a_insertion=0; seq_a_mismatch=0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=10; seq_length=100; start=ctaaa; experiment=australie; reverse_score=72.0; mode=alignment; status=full; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtcccgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1208:20375:7918_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; seq_a_deletion=0; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-37_b': 1, 'AW2-30_a': 1, 'AN4-35_b': 1}; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; status=full; mode=alignment; head_quality=37.0; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcatagctatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1215:20045:28654_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-12_b': 1, 'AN5-42_b': 1, 'AN4-23_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcctaactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1302:19280:14816_CONS_SUB_SUB_CMP status=full; merged_sample={'ABR-01_a': 1, 'AWO-46_b': 1, 'AN4-22_b': 1}; seq_b_insertion=0; seq_a_mismatch=0; forward_score=72.0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=3; seq_length=100; start=ctaaa; experiment=australie; seq_a_insertion=0; mode=alignment; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgtacgttaa
>HWI-D00393:103:C6KCUANXX:2:2214:8779:29491_CONS_SUB_SUB_CMP status=full; merged_sample={'AN5-24_b': 1, 'ADR-13_a': 2, 'AW2-35_a': 1}; seq_b_insertion=0; seq_a_insertion=0; forward_score=72.0; reverse_match=tttgtctgcttaattgcg; seq_b_mismatch=0; seq_a_deletion=0; count=4; seq_length=100; start=ctaaa; experiment=australie; reverse_score=72.0; mode=alignment; seq_b_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; head_quality=37.0; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggctaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2104:3449:54357_CONS_SUB_SUB_CMP status=full; merged_sample={'AN4-13_b': 1, 'AN4-42_b': 1, 'AN4-38_b': 1, 'AN5-21_b': 1, 'AN4-46_b': 1, 'AN4-12_b': 6}; seq_b_insertion=0; seq_a_mismatch=0; forward_score=72.0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=11; seq_length=100; start=ctaaa; experiment=australie; seq_a_insertion=0; mode=alignment; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttag
>HWI-D00393:103:C6KCUANXX:2:1102:14455:71108_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-23_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=actagatc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=ctatgcta; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_05G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgtcag
>HWI-D00393:103:C6KCUANXX:2:1110:14996:73808_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-43_b': 1, 'AW2-40_b': 1, 'AW2-07_b': 2}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=4; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagtaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1204:14078:1983_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-13_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=tagtcgca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=atatagcg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_04E; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttactaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1212:8502:98185_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-37_b': 1, 'AW2-35_a': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcatgactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1305:7874:81774_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-07_a': 1, 'AN4-34_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtcttgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1103:10874:79989_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-35_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=gtgtacat; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agcacagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_07C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttcaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1207:7817:59077_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1}; forward_score=72.0; score=366.948065081; seq_a_mismatch=0; forward_tag=tctactga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=60.9420289855; avg_quality=57.9113924051; seq_a_single=33; score_norm=3.9885659248; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_08F; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgatcgttat
>HWI-D00393:103:C6KCUANXX:2:1308:2120:40513_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN3-42_b': 1}; forward_score=72.0; score=367.703923809; seq_a_mismatch=0; forward_tag=atgatcgc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3043478261; avg_quality=59.1012658228; seq_a_single=33; score_norm=3.99678178053; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=actctgct; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_03B; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcattcgttat
>HWI-D00393:103:C6KCUANXX:2:2206:6280:74968_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=acacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-40_a': 1}; forward_score=66.0; score=367.706336455; seq_a_mismatch=0; forward_tag=gatcgcga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=catcagtc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=05_08H; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttgaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1215:13979:31122_CONS_SUB_SUB_CMP ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgttgcttaattgcga; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-34_b': 1}; forward_score=72.0; score=371.70314446; seq_a_mismatch=0; forward_tag=atgatcgc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.6861313869; avg_quality=59.4140127389; seq_a_single=32; score_norm=3.99680800494; reverse_score=66.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=99; status=full; mode=alignment; head_quality=37.0; position=11_08B; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtaagggtctcgttcgtta
>HWI-D00393:103:C6KCUANXX:2:2208:15903:75212_CONS_SUB_SUB ali_length=92; seq_ab_match=91; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-01_b': 1}; forward_score=72.0; score=359.811368897; seq_a_mismatch=1; forward_tag=actagatc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.0072463768; avg_quality=58.8417721519; seq_a_single=33; score_norm=3.91099314019; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tcagtgtc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_02G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagtaggttaaggtctcgtttgttaa
>HWI-D00393:103:C6KCUANXX:2:1203:16969:99847_CONS_SUB_SUB ali_length=92; seq_ab_match=91; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-40_b': 1}; forward_score=72.0; score=359.526419012; seq_a_mismatch=0; forward_tag=ctgcgtac; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=60.9782608696; avg_quality=57.9430379747; seq_a_single=33; score_norm=3.90789585883; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_08H; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataacaatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2213:12449:50581_CONS_SUB_SUB_CMP status=full; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-45_b': 1, 'AN5-25_b': 1, 'AN4-22_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; goodAli=Alignement; count=3; seq_length=100; mode=alignment; head_quality=37.0;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggtataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2310:15600:75905_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-03_a': 1}; forward_score=72.0; score=367.701511163; seq_a_mismatch=0; forward_tag=gtacgact; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.2753623188; avg_quality=59.0759493671; seq_a_single=33; score_norm=3.99675555612; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=gtgtacat; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=03_03A; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtaagggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1214:16466:5047_CONS_SUB_SUB status=full; merged_sample={'AN2-07_b': 1, 'AN1-17_a': 1, 'AN3-25_b': 1, 'AN4-23_b': 1, 'AW2-35_a': 1}; seq_b_insertion=0; seq_a_insertion=0; seq_a_mismatch=0; forward_score=72.0; seq_b_deletion=0; seq_a_deletion=0; count=5; seq_length=100; start=ctaaa; experiment=australie; reverse_score=72.0; mode=alignment; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcgggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1306:6166:18686_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=24.8; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-39_b': 1}; forward_score=72.0; score=200.442729055; seq_a_mismatch=0; forward_tag=gatgatct; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=40.5289855072; avg_quality=38.6898734177; seq_a_single=33; score_norm=2.17872531582; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=27.2; position=11_08G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggctagagtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2312:19984:25051_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-03_b': 1}; forward_score=72.0; score=367.703923809; seq_a_mismatch=0; forward_tag=acacacac; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3043478261; avg_quality=59.1012658228; seq_a_single=33; score_norm=3.99678178053; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=actctgct; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_03A; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttagagtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1209:10424:5048_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-43_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=acgacgag; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agtgctac; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_09C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggataaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1306:10017:29340_CONS_SUB_SUB_CMP merged_sample={'AN4-36_b': 1, 'AN4-12_b': 1, 'AW2-07_a': 1}; forward_score=72.0; direction=reverse; seq_b_insertion=0; seq_a_insertion=0; seq_a_mismatch=0; seq_b_deletion=0; status=full; seq_a_deletion=0; count=3; seq_length=100; start=ctaaa; experiment=australie; reverse_score=72.0; mode=alignment; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttagcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1116:1609:19254_CONS_SUB_SUB ali_length=93; seq_ab_match=93; tail_quality=34.4; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-45_b': 1}; forward_score=72.0; score=365.502727331; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=55.8905109489; avg_quality=52.9936305732; seq_a_single=32; score_norm=3.93013685302; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=agtgctac; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=31.9; position=11_09E; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgtttt
>HWI-D00393:103:C6KCUANXX:2:1116:18814:29744_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-35_a': 1}; forward_score=72.0; score=367.048675141; seq_a_mismatch=0; forward_tag=acgacgag; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=61.7101449275; avg_quality=58.582278481; seq_a_single=33; score_norm=3.9896595124; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=actagatc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=03_07C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcagattaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1301:18017:5670_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=91; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-29_b': 1}; forward_score=72.0; score=307.503017136; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=48.0362318841; avg_quality=46.0632911392; seq_a_single=33; score_norm=3.3424240993; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agcacagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=27.9; position=11_07E; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagccggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2304:14057:23271_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'ADR-47_b': 1, 'AN3-35_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataattatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1105:1482:86390_CONS_SUB_SUB_CMP ali_length=94; seq_ab_match=94; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-13_b': 1}; forward_score=72.0; score=375.690301881; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=63.0735294118; avg_quality=59.7307692308; seq_a_single=31; score_norm=3.99670533916; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=156; reverse_tag=cgagtcgt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_11E; seq_b_single=31;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgtaaa
>HWI-D00393:103:C6KCUANXX:2:1214:1498:16792_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=36.4; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-01_a': 1}; forward_score=72.0; score=367.691974388; seq_a_mismatch=0; forward_tag=gatgatct; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.2608695652; avg_quality=59.0253164557; seq_a_single=33; score_norm=3.99665189552; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=acagcaca; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=03_02G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggctgaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2112:6296:18402_CONS_SUB_SUB_CMP reverse_score=72.0; count=2; direction=reverse; seq_b_insertion=0; experiment=australie; seq_b_deletion=0; status=full; seq_a_deletion=0; seq_length=100; start=ctaaa; merged_sample={'AN4-11_b': 1, 'AN4-17_b': 1}; seq_a_insertion=0; mode=alignment; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggaataactatttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2202:15115:92220_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-35_b': 1}; forward_score=72.0; score=367.703923809; seq_a_mismatch=0; forward_tag=gtgtacat; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3043478261; avg_quality=59.1012658228; seq_a_single=33; score_norm=3.99678178053; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agcacagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_07C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtccgggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2308:15218:31319_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=35.1; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-30_a': 1}; forward_score=72.0; score=367.379880648; seq_a_mismatch=0; forward_tag=gcgtcagc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=61.7463768116; avg_quality=58.4936708861; seq_a_single=33; score_norm=3.99325957226; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agactatg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=06_01F; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactagttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1303:8962:10788_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=91; tail_quality=35.6; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-40_b': 1}; forward_score=72.0; score=328.577396734; seq_a_mismatch=0; forward_tag=ctgcgtac; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=51.2898550725; avg_quality=49.2721518987; seq_a_single=33; score_norm=3.57149344276; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.1; position=11_08H; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactttttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1309:18593:53694_CONS_SUB_SUB_CMP ali_length=92; seq_a_deletion=0; reverse_score=72.0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-43_b': 1, 'ABR-41_a': 1, 'AW2-35_a': 1}; forward_score=72.0; seq_a_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; status=full; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=3; seq_length=100; mode=alignment; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactacttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1208:4517:87099_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-03_a': 1}; forward_score=72.0; score=367.701511163; seq_a_mismatch=0; forward_tag=gtacgact; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.2753623188; avg_quality=59.0759493671; seq_a_single=33; score_norm=3.99675555612; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=gtgtacat; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=03_03A; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagaaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1115:14275:7184_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-45_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agtgctac; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_09E; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttagagtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1209:11513:82442_CONS_SUB_SUB_CMP ali_length=94; seq_ab_match=94; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-39_a': 1}; forward_score=72.0; score=375.685590398; seq_a_mismatch=0; forward_tag=gatgatct; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=63.0294117647; avg_quality=59.6923076923; seq_a_single=31; score_norm=3.996655217; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=156; reverse_tag=actagatc; goodAli=Alignement; count=1; seq_length=98; status=full; mode=alignment; head_quality=37.0; position=03_07G; seq_b_single=31;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1214:6399:72570_CONS_SUB_SUB ali_length=92; seq_ab_match=88; tail_quality=24.5; reverse_match=tttgtctgcttaattacg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-30_a': 1}; forward_score=72.0; score=255.892896628; seq_a_mismatch=3; forward_tag=tctactga; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=40.4057971014; avg_quality=38.5253164557; seq_a_single=33; score_norm=2.78144452857; reverse_score=66.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tactatac; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=26.6; position=03_06F; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgtgcgttaa
>HWI-D00393:103:C6KCUANXX:2:1305:3473:19356_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-07_b': 1}; forward_score=72.0; score=367.635869076; seq_a_mismatch=0; forward_tag=tactatac; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=61.9782608696; avg_quality=58.8164556962; seq_a_single=33; score_norm=3.99604205518; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=actctgct; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_03F; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcgtaactatttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2302:5940:68325_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-37_b': 1}; forward_score=72.0; score=367.706336455; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3333333333; avg_quality=59.1265822785; seq_a_single=33; score_norm=3.99680800494; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_08E; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtcgaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2309:13769:62059_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=91; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-35_a': 1}; forward_score=72.0; score=359.470664833; seq_a_mismatch=0; forward_tag=acgacgag; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=61.7536231884; avg_quality=58.6202531646; seq_a_single=33; score_norm=3.90728983514; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=actagatc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=03_07C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaagttctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2315:8947:22476_CONS_SUB_SUB ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN3-25_a': 1}; forward_score=72.0; score=368.159719936; seq_a_mismatch=0; forward_tag=acacacac; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=58.4744525547; avg_quality=55.7388535032; seq_a_single=32; score_norm=3.95870666598; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=agactatg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=05_01A; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactgtttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1301:19819:95569_CONS_SUB_SUB_CMP ali_length=94; seq_ab_match=94; tail_quality=37.0; reverse_match=tttgtctttaattccgat; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-34_b': 1}; forward_score=72.0; score=375.699952465; seq_a_mismatch=0; forward_tag=tcagtgtc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=63.1911764706; avg_quality=59.8333333333; seq_a_single=31; score_norm=3.99680800494; reverse_score=60.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=156; reverse_tag=tcagtgtc; goodAli=Alignement; count=1; seq_length=98; status=full; mode=alignment; head_quality=37.0; position=12_02B; seq_b_single=31;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgtt
>HWI-D00393:103:C6KCUANXX:2:1313:4613:34271_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=32.3; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW1-11_a': 1}; forward_score=72.0; score=313.551003229; seq_a_mismatch=0; forward_tag=gtcacgtc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=56.6594202899; avg_quality=53.1518987342; seq_a_single=33; score_norm=3.40816307858; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgctctcg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=25.6; position=02_09C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtcaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2113:10988:9799_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=30.2; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-13_b': 1}; forward_score=72.0; score=361.020650934; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=55.6086956522; avg_quality=52.4936708861; seq_a_single=33; score_norm=3.92413751015; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgagtcgt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=31.8; position=11_11E; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaatgtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2113:5686:82397_CONS_SUB_SUB_CMP ali_length=93; seq_ab_match=93; tail_quality=36.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'blk-12_b': 1}; forward_score=72.0; score=371.343139227; seq_a_mismatch=0; forward_tag=ctatgcta; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.1532846715; avg_quality=58.7643312102; seq_a_single=32; score_norm=3.99293698094; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=ctgcgtac; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.1; position=08_12E; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggatagggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2311:15976:57660_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=91; tail_quality=30.1; reverse_match=tttgtctggttaattccg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-39_b': 1}; forward_score=72.0; score=346.55597612; seq_a_mismatch=0; forward_tag=gatgatct; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=56.8550724638; avg_quality=53.8037974684; seq_a_single=33; score_norm=3.76691278391; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.4; position=11_08G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtgaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2205:20527:78743_CONS_SUB_SUB ali_length=95; seq_ab_match=93; tail_quality=29.1; reverse_match=tttgtctgcttaatggcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-12_b': 1}; forward_score=72.0; score=351.353534518; seq_a_mismatch=1; forward_tag=catcagtc; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=50.2074074074; avg_quality=47.8709677419; seq_a_single=30; score_norm=3.69845825808; reverse_score=66.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=155; reverse_tag=ctatgcta; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.1; position=11_05D; seq_b_single=30;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtcgcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1104:5688:6100_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-46_b': 1}; forward_score=72.0; score=367.370471331; seq_a_mismatch=0; forward_tag=tctactga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.0362318841; avg_quality=58.8670886076; seq_a_single=33; score_norm=3.99315729707; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agtgctac; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_09F; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtcccattcgttaa
>HWI-D00393:103:C6KCUANXX:2:1111:18652:56309_CONS_SUB_SUB ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN2-12_a': 1}; forward_score=72.0; score=371.700731814; seq_a_mismatch=0; forward_tag=atatagcg; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.7299270073; avg_quality=59.4522292994; seq_a_single=32; score_norm=3.99678206251; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=tatgtcag; goodAli=Alignement; count=1; seq_length=99; status=full; mode=alignment; head_quality=37.0; position=04_04D; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttgttaa
>HWI-D00393:103:C6KCUANXX:2:1211:20612:10566_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1}; forward_score=72.0; score=367.703923809; seq_a_mismatch=0; forward_tag=tctactga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3043478261; avg_quality=59.1012658228; seq_a_single=33; score_norm=3.99678178053; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_08F; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataaccatttagctggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2113:19027:63568_CONS_SUB_SUB_CMP ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-38_b': 1}; forward_score=72.0; score=371.700731814; seq_a_mismatch=0; forward_tag=tctactga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.7299270073; avg_quality=59.4267515924; seq_a_single=32; score_norm=3.99678206251; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=36.6; position=11_08F; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagtaggctagagtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2201:15565:85738_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=32.1; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-23_b': 1}; forward_score=72.0; score=365.670520501; seq_a_mismatch=0; forward_tag=actagatc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=59.5289855072; avg_quality=56.3670886076; seq_a_single=33; score_norm=3.97467957066; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=ctatgcta; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_05G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcatatctatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2215:13270:16769_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-01_b': 1}; forward_score=72.0; score=367.703923809; seq_a_mismatch=0; forward_tag=actagatc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3043478261; avg_quality=59.1012658228; seq_a_single=33; score_norm=3.99678178053; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tcagtgtc; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=09_02G; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgtccgttaa
>HWI-D00393:103:C6KCUANXX:2:1102:18567:11128_CONS_SUB_SUB_CMP ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN2-10_b': 1}; forward_score=72.0; score=370.995083415; seq_a_mismatch=0; forward_tag=gtcgtaga; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=61.8248175182; avg_quality=58.5732484076; seq_a_single=32; score_norm=3.98919444533; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=atatagcg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=35.6; position=10_04B; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttgacataactatttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:1106:7644:40758_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=90; tail_quality=34.2; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-35_b': 1}; forward_score=72.0; score=296.115079326; seq_a_mismatch=2; forward_tag=acgacgag; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=54.1449275362; avg_quality=51.6075949367; seq_a_single=33; score_norm=3.21864216659; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=tagctagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=34.0; position=11_08C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttac
>HWI-D00393:103:C6KCUANXX:2:1111:17524:16890_CONS_SUB_SUB ali_length=94; seq_ab_match=94; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN2-12_b': 1}; forward_score=72.0; score=375.680651297; seq_a_mismatch=0; forward_tag=gactgatg; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.9264705882; avg_quality=59.6025641026; seq_a_single=31; score_norm=3.99660267337; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=156; reverse_tag=atatagcg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=10_04D; seq_b_single=31;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttgtcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2109:4640:21588_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN5-10_b': 1}; forward_score=72.0; score=367.701511163; seq_a_mismatch=0; forward_tag=atgatcgc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.2173913043; avg_quality=59.0253164557; seq_a_single=33; score_norm=3.99675555612; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgagtcgt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_11B; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcggtcgttaa
>HWI-D00393:103:C6KCUANXX:2:2212:14824:56282_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW1-11_a': 1}; forward_score=72.0; score=367.703923809; seq_a_mismatch=0; forward_tag=gtcacgtc; seq_b_mismatch=0; start=ctaaa; experiment=australie; mid_quality=62.3043478261; avg_quality=59.1012658228; seq_a_single=33; score_norm=3.99678178053; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=cgctctcg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=02_09C; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggtcaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2306:10336:10418_CONS_SUB_SUB ali_length=92; seq_ab_match=92; tail_quality=37.0; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW2-35_b': 1, 'AN4-23_b': 1}; forward_score=72.0; seq_a_mismatch=0; seq_b_mismatch=0; start=ctaaa; experiment=australie; seq_a_single=33; reverse_score=72.0; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; goodAli=Alignement; count=2; seq_length=100; status=full; mode=alignment; head_quality=37.0; seq_b_single=33;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataaccatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2216:11206:55822_CONS_SUB_SUB_CMP ali_length=93; seq_ab_match=92; tail_quality=29.8; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AW1-30_a': 1}; forward_score=72.0; score=357.173125277; seq_a_mismatch=0; forward_tag=cgctctcg; seq_b_mismatch=1; start=ctaaa; experiment=australie; mid_quality=54.6642335766; avg_quality=51.4777070064; seq_a_single=32; score_norm=3.84057123954; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=gactgatg; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=29.5; position=02_12A; seq_b_single=32;
ctaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcttaactatttagcaggttaaggtctcgttcgttat
>HWI-D00393:103:C6KCUANXX:2:2303:4215:99753_CONS_SUB_SUB ali_length=93; seq_ab_match=93; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgtattgcc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-41_b': 1}; forward_score=66.0; score=371.70314446; seq_a_mismatch=0; forward_tag=gtacgact; seq_b_mismatch=0; start=gaaaa; experiment=australie; mid_quality=62.7591240876; avg_quality=59.4522292994; seq_a_single=32; score_norm=3.99680800494; reverse_score=72.0; direction=forward; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=157; reverse_tag=agtgctac; goodAli=Alignement; count=1; seq_length=99; status=full; mode=alignment; head_quality=36.6; position=11_09A; seq_b_single=32;
gaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagagt
tggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2209:18822:35652_CONS_SUB_SUB_CMP ali_length=92; seq_ab_match=92; tail_quality=37.0; reverse_match=tttgtctgcttaattgcg; seq_a_deletion=0; forward_match=tcacagacctgttattgc; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; merged_sample={'AN4-29_b': 1}; forward_score=72.0; score=366.357546614; seq_a_mismatch=0; forward_tag=atcagtca; seq_b_mismatch=0; start=gtaaa; experiment=australie; mid_quality=60.7898550725; avg_quality=57.7784810127; seq_a_single=33; score_norm=3.98214724581; reverse_score=72.0; direction=reverse; seq_b_insertion=0; seq_b_deletion=0; seq_a_insertion=0; seq_length_ori=158; reverse_tag=agcacagt; goodAli=Alignement; count=1; seq_length=100; status=full; mode=alignment; head_quality=37.0; position=11_07E; seq_b_single=33;
gtaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:2212:12163:84900_CONS_SUB_SUB merged_sample={'AN4-36_b': 1, 'AN4-39_b': 1, 'AN4-12_b': 1, 'AW2-03_b': 1, 'ABR-42_a': 1, 'AN4-46_b': 1, 'AN4-23_b': 1, 'AN4-37_b': 1, 'AN5-23_b': 1, 'AN5-41_b': 1, 'AW1-18_a': 1, 'AN4-35_b': 1, 'AW2-35_b': 1, 'AW2-35_a': 1}; reverse_score=72.0; seq_b_insertion=0; seq_a_mismatch=0; seq_b_deletion=0; seq_b_mismatch=0; seq_a_deletion=0; count=14; seq_length=99; start=taaaa; experiment=australie; seq_a_insertion=0; mode=alignment; status=full; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; goodAli=Alignement;
taaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagagt
tggcataactatttagcaggttaaggtctcgttcgttaa
>HWI-D00393:103:C6KCUANXX:2:1210:8971:79507_CONS_SUB_SUB_CMP merged_sample={'AN5-41_b': 1, 'AW2-01_a': 3, 'AN5-25_b': 2, 'AW2-07_b': 4, 'AN5-23_b': 3, 'AW2-01_b': 1}; ali_length=92; seq_b_insertion=0; seq_b_deletion=0; status=full; seq_a_deletion=0; count=14; seq_length=100; start=ttaaa; experiment=australie; seq_a_insertion=0; mode=alignment; seq_length_ori=158; forward_primer=tcacagacctgttattgc; reverse_primer=tttgtctgsttaattscg; seq_a_single=33; seq_b_single=33; goodAli=Alignement;
ttaaaacttccgtcggctaatcgccgacagtccctctaagaagttgactaccaacgagag
ttagcataactattcagtaggttaaggtctcgttcgttaa

View File

@ -0,0 +1,12 @@
>1 {"taxid" : 1234, "specie_name" : "Lupus lupus"}
acgtacgtacgtagc
>2 {"taxid" : 3243, "specie_name" : "Gallus gallus"} test
acgatcgatc
@3 {"taxid" : 3243, "specie_name" : "Gallus gallus"} test2
cgctagcatagc
cgatatgactta
+
78wo82usd980
d88fau
238ud8

View File

@ -0,0 +1,141 @@
package ncbitaxdump
import (
"bufio"
"encoding/csv"
"errors"
"fmt"
"io"
"log"
"os"
"path"
"strconv"
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitax"
)
// loadNodeTable reads '|' separated node records from reader and registers
// each of them in taxonomy.
//
// The first three fields of a record are used as the taxid, the parent
// taxid and the rank. Every record is handed to taxonomy.AddNewTaxa and
// taxonomy.ReindexParent is called once reading stops.
//
// Records with fewer than three fields are skipped instead of triggering an
// index-out-of-range panic. Reading stops at the first read error; any error
// other than io.EOF is logged so that a truncated load is not silent.
func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) {
	file := csv.NewReader(reader)
	file.Comma = '|'
	file.Comment = '#'
	file.TrimLeadingSpace = true
	file.ReuseRecord = true

	for {
		record, err := file.Read()
		if err != nil {
			if err != io.EOF {
				log.Printf("Error while reading taxonomy nodes: %v", err)
			}
			break
		}

		if len(record) < 3 {
			continue
		}

		// Conversion errors are deliberately ignored, as before: an
		// unparsable identifier is registered as 0.
		taxid, _ := strconv.Atoi(strings.TrimSpace(record[0]))
		parent, _ := strconv.Atoi(strings.TrimSpace(record[1]))
		rank := strings.TrimSpace(record[2])

		taxonomy.AddNewTaxa(taxid, parent, rank, true, true)
	}

	taxonomy.ReindexParent()
}
// loadNameTable reads '|' separated name records, one per line, from reader
// and registers them in taxonomy.
//
// The first, second and fourth fields of a line are used as the taxid, the
// name and the name class. When onlysn is true only the names whose class
// is "scientific name" are registered.
//
// Lines are split by hand rather than with encoding/csv
// (NOTE(review): presumably because of quote characters occurring in
// names; confirm).
//
// It returns the number of names registered, or -1 when a line does not fit
// in the read buffer. Lines with fewer than four fields (for instance a
// blank line) are skipped instead of triggering an index-out-of-range panic.
func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int {
	file := bufio.NewReader(reader)

	n := 0
	for line, prefix, err := file.ReadLine(); err == nil; line, prefix, err = file.ReadLine() {
		// prefix is set when the line was too long for the buffer.
		if prefix {
			return -1
		}

		record := strings.Split(string(line), "|")
		if len(record) < 4 {
			continue
		}

		// A conversion error is deliberately ignored, as before: an
		// unparsable identifier is handled as 0.
		taxid, _ := strconv.Atoi(strings.TrimSpace(record[0]))
		name := strings.TrimSpace(record[1])
		classname := strings.TrimSpace(record[3])

		if !onlysn || classname == "scientific name" {
			n++
			taxonomy.AddNewName(taxid, &name, &classname)
		}
	}

	return n
}
// loadMergedTable reads '|' separated records from reader and registers
// each of them in taxonomy as an alias.
//
// The first field of a record is the old taxid and the second one the new
// taxid; they are passed to taxonomy.AddNewAlias as (new, old).
//
// It returns the number of aliases registered. Records with fewer than two
// fields are skipped instead of triggering an index-out-of-range panic.
// Reading stops at the first read error; any error other than io.EOF is
// logged so that a truncated load is not silent.
func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int {
	file := csv.NewReader(reader)
	file.Comma = '|'
	file.Comment = '#'
	file.TrimLeadingSpace = true
	file.ReuseRecord = true

	n := 0
	for {
		record, err := file.Read()
		if err != nil {
			if err != io.EOF {
				log.Printf("Error while reading merged taxa: %v", err)
			}
			break
		}

		if len(record) < 2 {
			continue
		}

		// Conversion errors are deliberately ignored, as before.
		oldtaxid, _ := strconv.Atoi(strings.TrimSpace(record[0]))
		newtaxid, _ := strconv.Atoi(strings.TrimSpace(record[1]))
		n++

		taxonomy.AddNewAlias(newtaxid, oldtaxid)
	}

	return n
}
// LoadNCBITaxDump builds a taxonomy from the dump files found in directory.
//
// Three files are read, in this order: `nodes.dmp`, `names.dmp` and
// `merged.dmp`. When onlysn is true only the names whose class is
// "scientific name" are loaded. Progress is reported through the standard
// logger.
//
// It returns a nil taxonomy and an error when one of the three files cannot
// be opened, or when the names file contains a line too long to be read.
func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) {
	taxonomy := obitax.NewTaxonomy()

	//
	// Load the Taxonomy nodes
	//

	log.Printf("Loading Taxonomy nodes\n")

	nodefile, err := os.Open(path.Join(directory, "nodes.dmp"))
	if err != nil {
		return nil, fmt.Errorf("Cannot open nodes file from '%s': %w",
			directory, err)
	}
	defer nodefile.Close()

	loadNodeTable(bufio.NewReader(nodefile), taxonomy)
	log.Printf("%d Taxonomy nodes read\n", taxonomy.Length())

	//
	// Load the Taxon names
	//

	log.Printf("Loading Taxon names\n")

	namefile, err := os.Open(path.Join(directory, "names.dmp"))
	if err != nil {
		return nil, fmt.Errorf("Cannot open names file from '%s': %w",
			directory, err)
	}
	defer namefile.Close()

	n := loadNameTable(namefile, taxonomy, onlysn)
	if n < 0 {
		// loadNameTable signals a line longer than its read buffer with a
		// negative count; the names are then only partially loaded.
		return nil, errors.New(fmt.Sprintf("Cannot read names file from '%s': line too long",
			directory))
	}
	log.Printf("%d taxon names read\n", n)

	//
	// Load the merged taxa
	//

	log.Printf("Loading Merged taxa\n")

	aliasfile, err := os.Open(path.Join(directory, "merged.dmp"))
	if err != nil {
		return nil, fmt.Errorf("Cannot open merged file from '%s': %w",
			directory, err)
	}
	defer aliasfile.Close()

	n = loadMergedTable(bufio.NewReader(aliasfile), taxonomy)
	log.Printf("%d merged taxa read\n", n)

	return taxonomy, nil
}

158
pkg/obiformats/options.go Normal file
View File

@ -0,0 +1,158 @@
package obiformats
import "git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
// __options__ holds the option values shared by every copy of an Options
// handle. The defaults are set by MakeOptions.
type __options__ struct {
// annotator returned by Options.ParseFastSeqHeader
fastseq_header_parser obiseq.SeqAnnotator
// header formatter returned by Options.FormatFastSeqHeader
fastseq_header_writer func(obiseq.BioSequence) string
// value returned by Options.ProgressBar
with_progress_bar bool
// value returned by Options.BufferSize
buffer_size int
// value returned by Options.BatchSize
batch_size int
// ascii code corresponding to a quality of 0 in fastq encoded scores
quality_shift int
// value returned by Options.ParallelWorkers
parallel_workers int
}
// Options is a handle on a set of option values. It only stores a pointer,
// so every copy of an Options value refers to the same underlying values.
type Options struct {
pointer *__options__
}
// WithOption is a function applied to an Options value to set one of its
// option values.
type WithOption func(Options)
// MakeOptions creates an Options value initialised with the default option
// values, then applies every setter to it, in the order of the slice.
func MakeOptions(setters []WithOption) Options {
	opt := Options{
		pointer: &__options__{
			fastseq_header_parser: ParseGuessedFastSeqHeader,
			fastseq_header_writer: FormatFastSeqJsonHeader,
			with_progress_bar:     false,
			buffer_size:           2,
			quality_shift:         33,
			parallel_workers:      4,
			batch_size:            5000,
		},
	}

	for i := range setters {
		setters[i](opt)
	}

	return opt
}
// QualityShift returns the quality shift stored in the options.
func (opt Options) QualityShift() int {
return opt.pointer.quality_shift
}
// BufferSize returns the buffer size stored in the options.
func (opt Options) BufferSize() int {
return opt.pointer.buffer_size
}
// BatchSize returns the batch size stored in the options.
func (opt Options) BatchSize() int {
return opt.pointer.batch_size
}
// ParallelWorkers returns the number of parallel workers stored in the
// options.
func (opt Options) ParallelWorkers() int {
return opt.pointer.parallel_workers
}
// ParseFastSeqHeader returns the header parser stored in the options. It
// is nil when OptionFastSeqDoNotParseHeader has been applied.
func (opt Options) ParseFastSeqHeader() obiseq.SeqAnnotator {
return opt.pointer.fastseq_header_parser
}
// FormatFastSeqHeader returns the header formatting function stored in the
// options.
func (opt Options) FormatFastSeqHeader() func(obiseq.BioSequence) string {
return opt.pointer.fastseq_header_writer
}
// ProgressBar reports whether the progress bar option is set.
func (opt Options) ProgressBar() bool {
return opt.pointer.with_progress_bar
}
// OptionsBufferSize returns a WithOption setting the buffer size to size.
func OptionsBufferSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.buffer_size = size
	}
}
// OptionsQualityShift returns a WithOption setting the quality shift, that
// is the ascii code corresponding to a quality of 0 in fastq encoded
// quality scores.
func OptionsQualityShift(shift int) WithOption {
	return func(opt Options) {
		opt.pointer.quality_shift = shift
	}
}
// OptionsQualitySanger returns a WithOption setting a quality shift of 33,
// corresponding to FastQ qualities encoded following the Sanger
// convention. This corresponds to Illumina produced FastQ
// files.
func OptionsQualitySanger() WithOption {
return OptionsQualityShift(33)
}
// OptionsQualitySolexa returns a WithOption setting a quality shift of 64,
// corresponding to FastQ qualities encoded following the Solexa
// convention.
func OptionsQualitySolexa() WithOption {
return OptionsQualityShift(64)
}
// OptionsFastSeqHeaderParser returns a WithOption setting parser as the
// header parser of the options.
func OptionsFastSeqHeaderParser(parser obiseq.SeqAnnotator) WithOption {
	return func(opt Options) {
		opt.pointer.fastseq_header_parser = parser
	}
}
// OptionFastSeqDoNotParseHeader returns a WithOption setting the header
// parser of the options to nil.
func OptionFastSeqDoNotParseHeader() WithOption {
return OptionsFastSeqHeaderParser(nil)
}
// OptionsFastSeqDefaultHeaderParser returns a WithOption setting
// ParseGuessedFastSeqHeader as the header parser of the options.
func OptionsFastSeqDefaultHeaderParser() WithOption {
return OptionsFastSeqHeaderParser(ParseGuessedFastSeqHeader)
}
// OptionsFastSeqHeaderFormat returns a WithOption specifying the function
// used to format the header of the FASTA and FASTQ sequences written.
func OptionsFastSeqHeaderFormat(format func(obiseq.BioSequence) string) WithOption {
	return func(opt Options) {
		opt.pointer.fastseq_header_writer = format
	}
}
// OptionsParallelWorkers returns a WithOption setting the number of
// parallel workers to nworkers.
func OptionsParallelWorkers(nworkers int) WithOption {
	return func(opt Options) {
		opt.pointer.parallel_workers = nworkers
	}
}
// OptionsBatchSize returns a WithOption setting the batch size to size.
func OptionsBatchSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.batch_size = size
	}
}
// OptionsWithProgressBar returns a WithOption switching the progress bar
// option on.
func OptionsWithProgressBar() WithOption {
	return func(opt Options) {
		opt.pointer.with_progress_bar = true
	}
}
// OptionsWithoutProgressBar returns a WithOption switching the progress
// bar option off.
func OptionsWithoutProgressBar() WithOption {
	return func(opt Options) {
		opt.pointer.with_progress_bar = false
	}
}

View File

@ -0,0 +1,93 @@
package obiformats
import (
"bufio"
"compress/gzip"
"io"
"log"
"os"
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// GuessSeqFileType guesses the format of a sequence file from the
// beginning of its content.
//
// The prefixes of firstline are tested in a fixed order and the name of the
// first matching format is returned: "ecopcr", "fasta", "fastq", "embl" or
// "genebank". When no prefix matches, "unknown" is returned.
func GuessSeqFileType(firstline string) string {
	// Ordered list of (prefix, format) pairs; the first match wins.
	signatures := []struct {
		prefix string
		format string
	}{
		{"#@ecopcr-v2", "ecopcr"},
		{"#", "ecopcr"},
		{">", "fasta"},
		{"@", "fastq"},
		{"ID ", "embl"},
		{"LOCUS ", "genebank"},
	}

	for _, sig := range signatures {
		if strings.HasPrefix(firstline, sig.prefix) {
			return sig.format
		}
	}

	return "unknown"
}
// ReadSequencesBatchFromFile opens filename, guesses its format from its
// first bytes and returns an iterator over the batches of sequences it
// contains.
//
// The file is first tested for gzip compression; when a gzip header is
// found the decompressed stream is used for the format detection. The
// format is guessed with GuessSeqFileType from up to the first 30 bytes.
// "fasta" and "fastq" files are closed and handed, by name, to
// ReadFastSeqBatchFromFile; "ecopcr" and "embl" streams are handed to
// ReadEcoPCRBatch and ReadEMBLBatch.
//
// The program is terminated through log.Fatalf when the file cannot be
// opened or when the guessed format is not implemented. The error reported
// by ReadFastSeqBatchFromFile is returned to the caller instead of being
// discarded.
func ReadSequencesBatchFromFile(filename string, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
	var file *os.File
	var reader io.Reader
	var greader io.Reader
	var err error

	file, err = os.Open(filename)
	if err != nil {
		log.Fatalf("open file error: %v", err)
		return obiseq.NilIBioSequenceBatch, err
	}

	reader = file

	// Test if the stream is compressed by gzip
	greader, err = gzip.NewReader(reader)
	if err != nil {
		// Not gzip: rewind past the bytes consumed by the header test.
		file.Seek(0, 0)
	} else {
		log.Printf("File %s is gz compressed ", filename)
		reader = greader
	}

	breader := bufio.NewReader(reader)

	// A short read is fine here: whatever is available is used as the tag.
	tag, _ := breader.Peek(30)

	filetype := GuessSeqFileType(string(tag))
	log.Printf("File guessed format : %s (tag: %s)",
		filetype, (strings.Split(string(tag), "\n"))[0])
	reader = breader

	switch filetype {
	case "fastq", "fasta":
		// The fasta/fastq reader reopens the file by name.
		file.Close()
		is, rerr := ReadFastSeqBatchFromFile(filename, options...)
		return is, rerr
	case "ecopcr":
		return ReadEcoPCRBatch(reader, options...), nil
	case "embl":
		return ReadEMBLBatch(reader, options...), nil
	default:
		log.Fatalf("File %s has guessed format %s which is not yet implemented",
			filename, filetype)
	}

	return obiseq.NilIBioSequenceBatch, nil
}
// ReadSequencesFromFile reads filename with ReadSequencesBatchFromFile and
// converts the resulting batch iterator, after sorting its batches, into a
// sequence iterator.
//
// When ReadSequencesBatchFromFile reports an error, the error is returned
// with the zero value of obiseq.IBioSequence: the batch iterator is not
// used in that case.
func ReadSequencesFromFile(filename string, options ...WithOption) (obiseq.IBioSequence, error) {
	ib, err := ReadSequencesBatchFromFile(filename, options...)
	if err != nil {
		var none obiseq.IBioSequence
		return none, err
	}

	return ib.SortBatches().IBioSequence(), nil
}

View File

@ -0,0 +1,79 @@
package obiformats
import (
"fmt"
"io"
"log"
"os"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// WriteSequences writes the sequences provided by iterator to file.
//
// The output format is chosen from the first sequence: when it has
// qualities the sequences are written with FormatFastq / WriteFastq,
// otherwise with FormatFasta / WriteFasta. Nothing is written when the
// iterator is empty. The returned error is always nil.
func WriteSequences(iterator obiseq.IBioSequence,
	file io.Writer,
	options ...WithOption) error {

	opts := MakeOptions(options)
	formatHeader := opts.FormatFastSeqHeader()
	shift := opts.QualityShift()

	if !iterator.Next() {
		return nil
	}

	// The first sequence is written here, the remaining ones by the
	// format specific writer.
	first := iterator.Get()
	if !first.HasQualities() {
		fmt.Fprintln(file, FormatFasta(first, formatHeader))
		WriteFasta(iterator, file, options...)
		return nil
	}

	fmt.Fprintln(file, FormatFastq(first, shift, formatHeader))
	WriteFastq(iterator, file, options...)

	return nil
}
// WriteSequencesToFile creates (or truncates) the file named filename and
// writes the sequences provided by iterator to it using WriteSequences.
//
// The program is terminated through log.Fatalf when the file cannot be
// created.
//
// NOTE(review): the created file is never closed here; confirm whether the
// writers are synchronous before adding a Close.
func WriteSequencesToFile(iterator obiseq.IBioSequence,
	filename string,
	options ...WithOption) error {

	out, cerr := os.Create(filename)
	if cerr != nil {
		log.Fatalf("open file error: %v", cerr)
		return cerr
	}

	werr := WriteSequences(iterator, out, options...)
	return werr
}
// WriteSequencesToStdout writes the sequences provided by iterator to the
// standard output using WriteSequences.
func WriteSequencesToStdout(iterator obiseq.IBioSequence, options ...WithOption) error {
return WriteSequences(iterator, os.Stdout, options...)
}
// func WriteSequenceBatch(iterator obiseq.IBioSequenceBatch,
// file io.Writer,
// options ...WithOption) error {
// opts := MakeOptions(options)
// header_format := opts.FormatFastSeqHeader()
// quality := opts.QualityShift()
// ok := iterator.Next()
// if ok {
// batch := iterator.Get()
// if batch.Slice()[0].HasQualities() {
// file.Write()
// fmt.Fprintln(file, FormatFastq(seq, quality, header_format))
// WriteFastq(iterator, file, options...)
// } else {
// fmt.Fprintln(file, FormatFasta(seq, header_format))
// WriteFasta(iterator, file, options...)
// }
// }
// return nil
// }

View File

@ -0,0 +1,117 @@
package obikmer
import "git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
// __single_base_code__ maps the five low bits of a letter (letter & 31,
// so that 'A' and 'a' both give 1) to its two bits nucleotide code.
// C is coded 1, G is coded 2, T and U are coded 3; every other entry,
// A included, is 0. The table has 32 entries.
var __single_base_code__ = []byte{0,
// A, B, C, D,
0, 0, 1, 0,
// E, F, G, H,
0, 0, 2, 0,
// I, J, K, L,
0, 0, 0, 0,
// M, N, O, P,
0, 0, 0, 0,
// Q, R, S, T,
0, 0, 0, 3,
// U, V, W, X,
3, 0, 0, 0,
// Y, Z, ., .,
0, 0, 0, 0,
0, 0, 0,
}
// Encode4mer transforms an obiseq.BioSequence into the sequence of its
// overlapping kmers of length 4. Each letter of the sequence not belonging
// to A, C, G, T, U is considered as an A. A kmer is encoded as a byte
// value ranging from 0 to 255, each nucleotide being represented by
// two bits. The values 0, 1, 2, 3 correspond respectively to A, C, G,
// and T. U is encoded by 3 like T. Therefore AAAA has the code 0 and
// TTTT the code 255, while ACGT is encoded by 00011011 in binary, 0x1B
// in hexadecimal and 27 in decimal.
//
// If the buffer parameter is not nil, the slice it points to is emptied
// and used to store the result, otherwise a new slice is created.
//
// The result contains one code per position where a 4mer starts. nil is
// returned when the sequence is shorter than four letters; this includes
// sequences of exactly three letters, which used to trigger an
// index-out-of-range panic.
func Encode4mer(seq obiseq.BioSequence, buffer *[]byte) []byte {
	slength := seq.Length()
	length := slength - 3
	rawseq := seq.Sequence()

	// At least one complete 4mer is required.
	if length < 1 {
		return nil
	}

	if buffer == nil {
		b := make([]byte, 0, length)
		buffer = &b
	} else {
		*buffer = (*buffer)[:0]
	}

	var code byte
	i := 0

	// Code of the first 4mer.
	for ; i < 4; i++ {
		code <<= 2
		code |= __single_base_code__[rawseq[i]&31]
	}

	*buffer = append((*buffer), code)

	// Each following 4mer is obtained by shifting the previous code by
	// one nucleotide; the oldest nucleotide falls off the byte.
	for ; i < slength; i++ {
		code <<= 2
		code |= __single_base_code__[rawseq[i]&31]
		*buffer = append((*buffer), code)
	}

	return *buffer
}
// Index4mer builds the index of the 4mers of seq: a slice of 256 entries
// where entry c lists, in increasing order, the positions at which the
// 4mer of code c (as computed by Encode4mer) starts in seq.
//
// If index is not nil and the slice it points to has a capacity of at
// least 256, that slice is reused: it is extended to 256 entries when it is
// shorter and its first 256 entries are emptied. Otherwise a new slice is
// created; the caller's slice is then left untouched, so the returned value
// must be used. buffer is passed to Encode4mer.
func Index4mer(seq obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int {
	codes := Encode4mer(seq, buffer)

	switch {
	case index == nil || cap(*index) < 256:
		fresh := make([][]int, 256)
		index = &fresh
	case len(*index) < 256:
		// Enough capacity but too short: make the 256 entries addressable
		// instead of panicking on the first missing one.
		*index = (*index)[:256]
	}

	for i := 0; i < 256; i++ {
		(*index)[i] = (*index)[i][:0]
	}

	for pos, code := range codes {
		(*index)[code] = append((*index)[code], pos)
	}

	return *index
}
// FastShiftFourMer looks for the shift between seq and an indexed
// sequence that is supported by the largest number of shared 4mers.
//
// index is used as a table of positions per 4mer code
// (presumably one built by Index4mer; it must have an entry for every code
// occurring in seq). For every 4mer of seq at position pos and every
// position refpos listed for its code in index, the shift refpos - pos
// gets one vote. buffer is passed to Encode4mer.
//
// It returns the shift with the most votes and its number of votes. When
// several shifts are tied, the smallest one is returned, so that the result
// does not depend on the map iteration order. When no 4mer is shared, it
// returns (0, -1).
func FastShiftFourMer(index [][]int, seq obiseq.BioSequence, buffer *[]byte) (int, int) {
	codes := Encode4mer(seq, buffer)

	shifts := make(map[int]int, 3*seq.Length())

	for pos, code := range codes {
		for _, refpos := range index[code] {
			shifts[refpos-pos]++
		}
	}

	maxshift := 0
	maxcount := -1

	for shift, count := range shifts {
		if count > maxcount || (count == maxcount && shift < maxshift) {
			maxshift = shift
			maxcount = count
		}
	}

	return maxshift, maxcount
}

48
pkg/obioptions/options.go Normal file
View File

@ -0,0 +1,48 @@
package obioptions
import (
"fmt"
"os"
"github.com/DavidGamba/go-getoptions"
)
// __debug__ is the debug mode flag; it is bound to the "debug" command line
// option by GenerateOptionParser and toggled by DebugOn and DebugOff.
var __debug__ = false
// __profiling__ is only referenced by a commented-out "profile" option in
// GenerateOptionParser.
var __profiling__ = ""
// ArgumentParser is a function parsing a list of command line arguments.
// It returns the option set, the arguments remaining after parsing and the
// parsing error, if any.
type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error)
// GenerateOptionParser builds an ArgumentParser.
//
// The parser always knows the boolean options "help" (aliases "h" and "?")
// and "debug"; each function of optionset is then called with the option
// set so that it can declare its own options.
//
// The returned function skips the first element of its argument (when
// called with os.Args, the program name) and parses the others. When the
// "help" option was used, the help text is printed on the standard error
// and the program exits with status 1.
func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser {
	options := getoptions.New()
	options.Bool("help", false, options.Alias("h", "?"))
	options.BoolVar(&__debug__, "debug", false)
	// options.StringVar(&__profiling__, "profile", "")

	for _, o := range optionset {
		o(options)
	}

	return func(args []string) (*getoptions.GetOpt, []string, error) {
		// Guard against an empty argument list, which args[1:] cannot slice.
		if len(args) > 0 {
			args = args[1:]
		}

		remaining, err := options.Parse(args)

		if options.Called("help") {
			// The help text is data, not a format string: a '%' in it must
			// be printed verbatim.
			fmt.Fprint(os.Stderr, options.Help())
			os.Exit(1)
		}
		return options, remaining, err
	}
}
// IsDebugMode reports whether the debug mode is activated.
func IsDebugMode() bool {
return __debug__
}
// DebugOn activates the debug mode.
func DebugOn() {
__debug__ = true
}
// DebugOff deactivates the debug mode.
func DebugOff() {
__debug__ = false
}

358
pkg/obiseq/batchiterator.go Normal file
View File

@ -0,0 +1,358 @@
package obiseq
import (
"log"
"sync"
)
// BioSequenceBatch is a slice of sequences associated with an order
// number.
type BioSequenceBatch struct {
// sequences of the batch; nil for a nil batch (see IsNil)
slice BioSequenceSlice
// order number of the batch
order int
}
// NilBioSequenceBatch is the nil batch: it has a nil slice and the order
// number -1.
var NilBioSequenceBatch = BioSequenceBatch{nil, -1}
// MakeBioSequenceBatch builds a batch tagged with the given order number
// and holding the given sequences. The variadic slice is stored as is,
// without being copied.
func MakeBioSequenceBatch(order int, sequences ...BioSequence) BioSequenceBatch {
return BioSequenceBatch{
slice: sequences,
order: order,
}
}
// Order returns the order number of the batch.
func (batch BioSequenceBatch) Order() int {
return batch.order
}
// Slice returns the sequences of the batch (the internal slice, not a copy).
func (batch BioSequenceBatch) Slice() BioSequenceSlice {
return batch.slice
}
// Length returns the number of sequences in the batch.
func (batch BioSequenceBatch) Length() int {
return len(batch.slice)
}
// IsNil reports whether the batch has a nil sequence slice.
func (batch BioSequenceBatch) IsNil() bool {
return batch.slice == nil
}
// __ibiosequencebatch__ is the private state of an iterator over
// BioSequenceBatch values, based on a channel.
//
// channel carries the batches, current is the batch returned by Get,
// all_done is the wait group counting the producers still writing to the
// channel, buffer_size is the capacity requested for the channel, and
// p_finished points to the end-of-data flag. Iterators created by Split
// share the channel, the wait group and the flag pointed to by p_finished.
type __ibiosequencebatch__ struct {
channel chan BioSequenceBatch
current BioSequenceBatch
all_done *sync.WaitGroup
buffer_size int
finished bool
p_finished *bool
}
// IBioSequenceBatch is the public handle on a batch iterator. It only
// wraps a pointer, so copies of the handle designate the same iterator.
type IBioSequenceBatch struct {
pointer *__ibiosequencebatch__
}
// NilIBioSequenceBatch is the handle designating no iterator.
var NilIBioSequenceBatch = IBioSequenceBatch{pointer: nil}
// MakeIBioSequenceBatch creates a new, empty iterator over sequence batches.
//
// The first optional argument sets the capacity of the underlying channel
// (1 by default). The iterator starts with no current batch, a fresh wait
// group and its end-of-data flag unset.
func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
	capacity := 1
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	state := &__ibiosequencebatch__{
		channel:     make(chan BioSequenceBatch, capacity),
		current:     NilBioSequenceBatch,
		all_done:    new(sync.WaitGroup),
		buffer_size: capacity,
	}
	// The flag is reached through a pointer so that iterators obtained
	// with Split can share it.
	state.p_finished = &state.finished

	return IBioSequenceBatch{state}
}
// Add adds n to the wait group counting the producers of the iterator.
func (iterator IBioSequenceBatch) Add(n int) {
iterator.pointer.all_done.Add(n)
}
// Done signals that one producer has finished writing to the iterator.
func (iterator IBioSequenceBatch) Done() {
iterator.pointer.all_done.Done()
}
// Wait blocks until every registered producer has called Done.
func (iterator IBioSequenceBatch) Wait() {
iterator.pointer.all_done.Wait()
}
// Channel returns the channel carrying the batches of the iterator.
func (iterator IBioSequenceBatch) Channel() chan BioSequenceBatch {
return iterator.pointer.channel
}
// IsNil reports whether the handle designates no iterator.
func (iterator IBioSequenceBatch) IsNil() bool {
return iterator.pointer == nil
}
// BufferSize returns the channel capacity requested at creation time.
func (iterator IBioSequenceBatch) BufferSize() int {
return iterator.pointer.buffer_size
}
// Split returns a new iterator reading from the same channel as the
// receiver. The new iterator shares the wait group and the end-of-data
// flag of its source but owns its current batch, which lets several
// goroutines consume the same stream, each through its own handle.
func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
	source := iterator.pointer

	return IBioSequenceBatch{&__ibiosequencebatch__{
		channel:     source.channel,
		current:     NilBioSequenceBatch,
		all_done:    source.all_done,
		buffer_size: source.buffer_size,
		finished:    false,
		p_finished:  source.p_finished,
	}}
}
// Next moves the iterator to the following batch.
//
// It returns true when a batch has been read from the channel; that batch
// is then available through Get. It returns false once the channel is
// closed and drained: the current batch is reset to NilBioSequenceBatch and
// the shared end-of-data flag is raised, so later calls return immediately.
func (iterator IBioSequenceBatch) Next() bool {
	state := iterator.pointer

	if *state.p_finished {
		return false
	}

	batch, open := <-state.channel
	if !open {
		state.current = NilBioSequenceBatch
		*state.p_finished = true
		return false
	}

	state.current = batch
	return true
}
// Get returns the BioSequenceBatch currently pointed to by the iterator.
// Call Next to move to the next entry before calling Get again to
// retrieve the following batch.
func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
return iterator.pointer.current
}
// Finished returns true once the end of the data has been observed by
// Next, on this iterator or on any iterator sharing its end-of-data flag.
func (iterator IBioSequenceBatch) Finished() bool {
return *iterator.pointer.p_finished
}
// IBioSequence flattens the batch iterator into an iterator over
// individual sequences, emitted in the order they appear in each batch.
//
// The optional argument sets the buffer size of the new iterator; by
// default it is the buffer size of the source. The conversion runs in a
// background goroutine and the output channel is closed once the source
// is exhausted.
func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	flat := MakeIBioSequence(capacity)
	flat.Add(1)

	// Closes the output once the producer below has finished.
	go func() {
		flat.Wait()
		close(flat.pointer.channel)
	}()

	// Producer: unpacks every batch of the source.
	go func() {
		for iterator.Next() {
			for _, seq := range iterator.Get().slice {
				flat.pointer.channel <- seq
			}
		}
		flat.Done()
	}()

	return flat
}
// SortBatches returns an iterator delivering the batches of the source in
// increasing order number, starting at 0.
//
// Batches arriving ahead of their turn are kept in memory until all the
// preceding ones have been forwarded. Batches still waiting when the
// source ends (because an order number is missing) are not emitted.
// The optional argument sets the buffer size of the new iterator.
func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	sorted := MakeIBioSequenceBatch(capacity)
	sorted.Add(1)

	go func() {
		sorted.Wait()
		close(sorted.pointer.channel)
	}()

	go func() {
		expected := 0
		waiting := make(map[int]BioSequenceBatch)

		for iterator.Next() {
			batch := iterator.Get()

			if batch.order != expected {
				// Too early: park the batch until its turn comes.
				waiting[batch.order] = batch
				continue
			}

			sorted.pointer.channel <- batch
			expected++

			// Flush the parked batches that are now in sequence.
			for {
				parked, found := waiting[expected]
				if !found {
					break
				}
				sorted.pointer.channel <- parked
				delete(waiting, expected)
				expected++
			}
		}
		sorted.Done()
	}()

	return sorted
}
// Concat returns an iterator delivering the batches of the receiver
// followed by the batches of each iterator given as argument, read one
// after the other.
//
// Order numbers are shifted so that the batches of an iterator are
// numbered after the highest number used so far. An iterator producing no
// batch does not consume any order number, so the numbering of the result
// does not get a gap that would stall a later SortBatches.
// When no iterator is given, the receiver itself is returned.
func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch {
	if len(iterators) == 0 {
		return iterator
	}

	buffsize := iterator.BufferSize()
	new_iter := MakeIBioSequenceBatch(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		close(new_iter.Channel())
	}()

	go func() {
		previous_max := 0
		// -1 means "no order number used yet": with 0 an empty first
		// iterator would make the following batches start at order 1.
		max_order := -1

		for iterator.Next() {
			s := iterator.Get()
			if s.order > max_order {
				max_order = s.order
			}
			new_iter.Channel() <- MakeBioSequenceBatch(s.order+previous_max, s.slice...)
		}
		previous_max = max_order + 1

		for _, iter := range iterators {
			for iter.Next() {
				s := iter.Get()
				if (s.order + previous_max) > max_order {
					max_order = s.order + previous_max
				}
				new_iter.Channel() <- MakeBioSequenceBatch(s.order+previous_max, s.slice...)
			}
			previous_max = max_order + 1
		}
		new_iter.Done()
	}()

	return new_iter
}
// Rebatch redistributes the sequences of the iterator into a new
// IBioSequenceBatch whose batches all hold exactly size sequences, except
// possibly the last one which receives the remainder. The source is first
// passed through SortBatches so that the sequence order is preserved, and
// the produced batches are numbered from 0.
// The optional argument sets the buffer size of the new iterator.
func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	rebatched := MakeIBioSequenceBatch(capacity)
	rebatched.Add(1)

	go func() {
		rebatched.Wait()
		close(rebatched.pointer.channel)
	}()

	go func() {
		ordered := iterator.SortBatches()
		pending := make(BioSequenceSlice, 0, size)
		rank := 0

		for ordered.Next() {
			for _, seq := range ordered.Get().slice {
				pending = append(pending, seq)
				if len(pending) != size {
					continue
				}
				rebatched.Channel() <- MakeBioSequenceBatch(rank, pending...)
				rank++
				// The emitted batch keeps the previous slice: start a new one.
				pending = make(BioSequenceSlice, 0, size)
			}
		}

		// Remainder, smaller than size.
		if len(pending) > 0 {
			rebatched.Channel() <- MakeBioSequenceBatch(rank, pending...)
		}
		rebatched.Done()
	}()

	return rebatched
}
// Destroy consumes the whole iterator and calls Destroy on every sequence
// it delivers, logging the beginning and the end of the operation.
// Destroy is applied to a copy of each sequence handle: the handles stored
// in the batch slices are left untouched.
func (iterator IBioSequenceBatch) Destroy() {
	log.Println("Start recycling of Bioseq objects")

	for iterator.Next() {
		for _, seq := range iterator.Get().Slice() {
			seq.Destroy()
		}
	}

	log.Println("End of the recycling of Bioseq objects")
}
// PairWith associates the batches of the receiver (forward reads) with
// the batches of reverse, and returns an iterator over the paired batches.
//
// Both sources are rebatched to the same batch size so that batches of
// equal order can be paired. The first optional argument is that batch
// size (5000 by default), the second one the buffer size of the result
// (by default the buffer size of the receiver). The function panics when
// the reverse iterator ends before the forward one.
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
	batchsize, buffsize := 5000, iterator.BufferSize()
	switch {
	case len(sizes) > 1:
		batchsize, buffsize = sizes[0], sizes[1]
	case len(sizes) == 1:
		batchsize = sizes[0]
	}

	forward := iterator.Rebatch(batchsize)
	backward := reverse.Rebatch(batchsize)

	paired := MakeIPairedBioSequenceBatch(buffsize)
	paired.Add(1)

	go func() {
		paired.Wait()
		close(paired.pointer.channel)
		log.Println("End of association of paired reads")
	}()

	log.Println("Start association of paired reads")

	go func() {
		for forward.Next() {
			if !backward.Next() {
				log.Panicln("Etrange reverse pas prêt")
			}
			paired.Channel() <- MakePairedBioSequenceBatch(forward.Get(),
				backward.Get())
		}
		paired.Done()
	}()

	return paired
}

185
pkg/obiseq/biosequence.go Normal file
View File

@ -0,0 +1,185 @@
package obiseq
import (
"bytes"
"crypto/md5"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
)
// Quality is a vector of quality scores, one per sequence position.
type Quality []uint8

// __default_qualities__ is a shared buffer of default scores (40) from
// which the default quality vectors are sliced. It only grows.
var __default_qualities__ = make(Quality, 0, 500)

// __make_default_qualities__ returns a quality vector of the requested
// length in which every score is 40.
//
// The result is a view on the shared __default_qualities__ buffer, which is
// extended on demand. Its capacity is clipped to its length so that an
// append done by a caller reallocates instead of overwriting the shared
// defaults. Callers must still not modify the returned scores in place.
//
// NOTE(review): the shared buffer is grown without synchronization here;
// confirm that concurrent callers are serialized elsewhere.
func __make_default_qualities__(length int) Quality {
	for len(__default_qualities__) < length {
		__default_qualities__ = append(__default_qualities__, 40)
	}
	return __default_qualities__[0:length:length]
}
// Annotation is the set of key/value annotations attached to a sequence.
type Annotation map[string]interface{}
// __sequence__ is the private storage of a biological sequence: every
// textual part lives in its own byte buffer and the annotations in a map.
type __sequence__ struct {
id bytes.Buffer
definition bytes.Buffer
sequence bytes.Buffer
qualities bytes.Buffer
feature bytes.Buffer
annotations Annotation
}
// BioSequence is the public handle on a sequence. It only wraps a pointer:
// copies of a BioSequence designate the same underlying data, which is why
// the modifying methods can use value receivers.
type BioSequence struct {
sequence *__sequence__
}
// BioSequenceSlice is a slice of sequence handles.
type BioSequenceSlice []BioSequence
// NilBioSequence is the handle designating no sequence.
var NilBioSequence = BioSequence{sequence: nil}
// IsNil reports whether the handle designates no sequence.
func (s BioSequence) IsNil() bool {
return s.sequence == nil
}
// Reset empties the sequence: identifier, definition, sequence, qualities
// and features buffers are reset, and every annotation is removed from the
// annotation map. The map itself is kept.
func (s BioSequence) Reset() {
	data := s.sequence

	data.id.Reset()
	data.definition.Reset()
	data.sequence.Reset()
	data.qualities.Reset()
	data.feature.Reset()

	for key := range data.annotations {
		delete(data.annotations, key)
	}
}
// Copy returns a new sequence holding a copy of the identifier,
// definition, sequence, qualities and features of the receiver. The
// annotations are copied with goutils.CopyMap when the receiver has any.
func (s BioSequence) Copy() BioSequence {
	clone := MakeEmptyBioSequence()
	src, dst := s.sequence, clone.sequence

	dst.id.Write(src.id.Bytes())
	dst.definition.Write(src.definition.Bytes())
	dst.sequence.Write(src.sequence.Bytes())
	dst.qualities.Write(src.qualities.Bytes())
	dst.feature.Write(src.feature.Bytes())

	if len(src.annotations) > 0 {
		goutils.CopyMap(dst.annotations,
			src.annotations)
	}

	return clone
}
// Id returns the identifier of the sequence.
func (s BioSequence) Id() string {
return s.sequence.id.String()
}
// Definition returns the definition of the sequence.
func (s BioSequence) Definition() string {
return s.sequence.definition.String()
}
// Sequence returns the content of the sequence buffer. The slice is the
// one exposed by the internal buffer, not a copy.
func (s BioSequence) Sequence() []byte {
return s.sequence.sequence.Bytes()
}
// String returns the content of the sequence buffer as a string.
func (s BioSequence) String() string {
return s.sequence.sequence.String()
}
// Length returns the number of bytes of the sequence.
func (s BioSequence) Length() int {
return s.sequence.sequence.Len()
}
// HasQualities reports whether quality scores are stored for the sequence.
func (s BioSequence) HasQualities() bool {
return s.sequence.qualities.Len() > 0
}
// Qualities returns the quality scores of the sequence. When none are
// stored, a default vector of the same length as the sequence is returned
// (see __make_default_qualities__).
func (s BioSequence) Qualities() Quality {
	if !s.HasQualities() {
		return __make_default_qualities__(s.sequence.sequence.Len())
	}
	return s.sequence.qualities.Bytes()
}
// Features returns the content of the feature buffer of the sequence.
func (s BioSequence) Features() string {
return s.sequence.feature.String()
}
// Annotations returns the annotation map of the sequence (the map itself,
// not a copy).
func (s BioSequence) Annotations() Annotation {
return s.sequence.annotations
}
// MD5 returns the MD5 checksum of the sequence bytes.
func (s BioSequence) MD5() [16]byte {
return md5.Sum(s.sequence.sequence.Bytes())
}
// Count returns the value of the "count" annotation converted to an
// integer. It returns 1 when the sequence has no annotation map, when the
// annotation is absent or when its value cannot be converted.
func (s BioSequence) Count() int {
	annotations := s.sequence.annotations
	if annotations == nil {
		return 1
	}

	raw, found := annotations["count"]
	if !found {
		return 1
	}

	count, err := goutils.InterfaceToInt(raw)
	if err != nil {
		return 1
	}
	return count
}
// Taxid returns the value of the "taxid" annotation converted to an
// integer. It returns 1 when the sequence has no annotation map, when the
// annotation is absent or when its value cannot be converted.
func (s BioSequence) Taxid() int {
	annotations := s.sequence.annotations
	if annotations == nil {
		return 1
	}

	raw, found := annotations["taxid"]
	if !found {
		return 1
	}

	taxid, err := goutils.InterfaceToInt(raw)
	if err != nil {
		return 1
	}
	return taxid
}
// SetId replaces the identifier of the sequence.
func (s BioSequence) SetId(id string) {
s.sequence.id.Reset()
s.sequence.id.WriteString(id)
}
// SetDefinition replaces the definition of the sequence.
func (s BioSequence) SetDefinition(definition string) {
s.sequence.definition.Reset()
s.sequence.definition.WriteString(definition)
}
// SetFeatures replaces the content of the feature buffer.
func (s BioSequence) SetFeatures(feature string) {
s.sequence.feature.Reset()
s.sequence.feature.WriteString(feature)
}
// SetSequence replaces the sequence by a copy of the given bytes.
func (s BioSequence) SetSequence(sequence []byte) {
s.sequence.sequence.Reset()
s.sequence.sequence.Write(sequence)
}
// SetQualities replaces the quality scores by a copy of the given ones.
func (s BioSequence) SetQualities(qualities Quality) {
s.sequence.qualities.Reset()
s.sequence.qualities.Write(qualities)
}
// Write appends data to the sequence buffer. The result is the one of the
// underlying bytes.Buffer.Write call.
func (s BioSequence) Write(data []byte) (int, error) {
return s.sequence.sequence.Write(data)
}
// WriteString appends the string data to the sequence buffer.
func (s BioSequence) WriteString(data string) (int, error) {
return s.sequence.sequence.WriteString(data)
}
// WriteByte appends a single byte to the sequence buffer.
func (s BioSequence) WriteByte(data byte) error {
return s.sequence.sequence.WriteByte(data)
}
// WriteRune appends the encoding of the rune data to the sequence buffer.
func (s BioSequence) WriteRune(data rune) (int, error) {
return s.sequence.sequence.WriteRune(data)
}

326
pkg/obiseq/iterator.go Normal file
View File

@ -0,0 +1,326 @@
package obiseq
import (
"sync"
"time"
)
// __ibiosequence__ is the private state of an iterator over BioSequence
// values, based on a channel.
//
// channel carries the sequences, current is the sequence returned by Get,
// all_done is the wait group counting the producers still writing to the
// channel, buffer_size is the capacity requested for the channel, and
// p_finished points to the end-of-data flag. Iterators created by Split
// share the channel, the wait group and the flag pointed to by p_finished.
type __ibiosequence__ struct {
channel chan BioSequence
current BioSequence
all_done *sync.WaitGroup
buffer_size int
finished bool
p_finished *bool
}
// IBioSequence is the public handle on a sequence iterator. It only wraps
// a pointer, so copies of the handle designate the same iterator.
type IBioSequence struct {
pointer *__ibiosequence__
}
// NilIBioSequence is the handle designating no iterator.
var NilIBioSequence = IBioSequence{pointer: nil}
// IsNil reports whether the handle designates no iterator.
func (iterator IBioSequence) IsNil() bool {
return iterator.pointer == nil
}
// Add adds n to the wait group counting the producers of the iterator.
func (iterator IBioSequence) Add(n int) {
iterator.pointer.all_done.Add(n)
}
// Done signals that one producer has finished writing to the iterator.
func (iterator IBioSequence) Done() {
iterator.pointer.all_done.Done()
}
// Wait blocks until every registered producer has called Done.
func (iterator IBioSequence) Wait() {
iterator.pointer.all_done.Wait()
}
// Channel returns the channel carrying the sequences of the iterator.
func (iterator IBioSequence) Channel() chan BioSequence {
return iterator.pointer.channel
}
// PChannel returns the address of the channel field of the iterator.
func (iterator IBioSequence) PChannel() *chan BioSequence {
return &(iterator.pointer.channel)
}
// MakeIBioSequence creates a new, empty iterator over sequences.
//
// The first optional argument sets the capacity of the underlying channel
// (1 by default). The iterator starts with no current sequence, a fresh
// wait group and its end-of-data flag unset.
func MakeIBioSequence(sizes ...int) IBioSequence {
	capacity := 1
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	state := &__ibiosequence__{
		channel:     make(chan BioSequence, capacity),
		current:     NilBioSequence,
		all_done:    new(sync.WaitGroup),
		buffer_size: capacity,
	}
	// The flag is reached through a pointer so that iterators obtained
	// with Split can share it.
	state.p_finished = &state.finished

	return IBioSequence{state}
}
// Split returns a new iterator reading from the same channel as the
// receiver. The new iterator shares the wait group and the end-of-data
// flag of its source but owns its current sequence.
func (iterator IBioSequence) Split() IBioSequence {
	source := iterator.pointer

	return IBioSequence{&__ibiosequence__{
		channel:     source.channel,
		current:     NilBioSequence,
		all_done:    source.all_done,
		buffer_size: source.buffer_size,
		finished:    false,
		p_finished:  source.p_finished,
	}}
}
// Next moves the iterator to the following sequence.
//
// It returns true when a sequence has been read from the channel; that
// sequence is then available through Get. It returns false on a nil
// iterator, and once the channel is closed and drained: the current
// sequence is then reset to NilBioSequence and the shared end-of-data flag
// is raised.
func (iterator IBioSequence) Next() bool {
	// A nil iterator has no state to update: touching iterator.pointer
	// here would dereference a nil pointer.
	if iterator.IsNil() {
		return false
	}

	if *(iterator.pointer.p_finished) {
		iterator.pointer.current = NilBioSequence
		return false
	}

	next, ok := (<-iterator.pointer.channel)

	if ok {
		iterator.pointer.current = next
		return true
	}

	iterator.pointer.current = NilBioSequence
	*iterator.pointer.p_finished = true
	return false
}
// Get returns the BioSequence currently pointed to by the iterator.
// Call Next to move to the next entry before calling Get again to
// retrieve the following sequence.
func (iterator IBioSequence) Get() BioSequence {
return iterator.pointer.current
}
// Finished returns true once the end of the data has been observed by
// Next, on this iterator or on any iterator sharing its end-of-data flag.
func (iterator IBioSequence) Finished() bool {
return *iterator.pointer.p_finished
}
// BufferSize returns the channel capacity requested at creation time.
func (iterator IBioSequence) BufferSize() int {
return iterator.pointer.buffer_size
}
// IBioSequenceBatch converts an IBioSequence iterator into an iterator
// over batches of sequences. By default a batch holds 100 sequences and
// the new iterator uses the same buffer size as the source iterator.
// These defaults can be overridden with one or two optional parameters:
// the first one is the batch size, the second one the buffer size.
// A batch size lower than 1 falls back to the default of 100.
//
// Batches are numbered from 0 in emission order. Only non-empty batches
// are emitted: an empty source produces no batch at all, and no empty
// trailing batch follows a source whose length is a multiple of the
// batch size.
func (iterator IBioSequence) IBioSequenceBatch(sizes ...int) IBioSequenceBatch {
	batchsize := 100
	buffsize := iterator.BufferSize()

	if len(sizes) > 0 {
		batchsize = sizes[0]
	}
	if len(sizes) > 1 {
		buffsize = sizes[1]
	}
	// With a batch size below 1 the filling loop would never call Next and
	// the producer would spin forever.
	if batchsize < 1 {
		batchsize = 100
	}

	new_iter := MakeIBioSequenceBatch(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		// Let the consumers drain the buffered batches before closing.
		for len(new_iter.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(new_iter.pointer.channel)
	}()

	go func() {
		for j := 0; !iterator.Finished(); j++ {
			batch := BioSequenceBatch{
				slice: make(BioSequenceSlice, 0, batchsize),
				order: j}
			for i := 0; i < batchsize && iterator.Next(); i++ {
				seq := iterator.Get()
				batch.slice = append(batch.slice, seq)
			}
			// The end of the source is only detected while filling a
			// batch, so the last one may be empty: do not emit it.
			if len(batch.slice) > 0 {
				new_iter.pointer.channel <- batch
			}
		}
		new_iter.Done()
	}()

	return new_iter
}
// IBioSequence returns a new iterator delivering the same sequences as
// the receiver, in the same order. The optional argument sets the buffer
// size of the new iterator; by default it is the one of the receiver.
// The copy runs in a background goroutine and the output channel is
// closed once the source is exhausted.
func (iterator IBioSequence) IBioSequence(sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	relay := MakeIBioSequence(capacity)
	relay.Add(1)

	go func() {
		relay.Wait()
		close(relay.pointer.channel)
	}()

	go func() {
		for iterator.Next() {
			relay.pointer.channel <- iterator.Get()
		}
		relay.Done()
	}()

	return relay
}
// Skip returns an iterator delivering the sequences of the receiver
// except the n first ones. The optional argument sets the buffer size of
// the new iterator; by default it is the one of the receiver.
func (iterator IBioSequence) Skip(n int, sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	kept := MakeIBioSequence(capacity)
	kept.Add(1)

	go func() {
		kept.Wait()
		close(kept.pointer.channel)
	}()

	go func() {
		seen := 0
		for iterator.Next() {
			if seen >= n {
				kept.pointer.channel <- iterator.Get()
			}
			seen++
		}
		kept.Done()
	}()

	return kept
}
// Head returns an iterator delivering only the n first sequences of the
// receiver. The optional argument sets the buffer size of the new
// iterator; by default it is the one of the receiver.
//
// The source is read until its end even after n sequences have been
// forwarded, the extra sequences being discarded. The output is closed as
// soon as a sequence beyond the n first ones is met, or at the end of the
// source when it holds n sequences or fewer.
func (iterator IBioSequence) Head(n int, sizes ...int) IBioSequence {
	buffsize := iterator.BufferSize()

	if len(sizes) > 0 {
		buffsize = sizes[0]
	}

	new_iter := MakeIBioSequence(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		close(new_iter.pointer.channel)
	}()

	go func() {
		not_done := true
		for i := 0; iterator.Next(); i++ {
			if i < n {
				s := iterator.Get()
				new_iter.pointer.channel <- s
			} else if not_done {
				new_iter.Done()
				not_done = false
			}
		}
		// The source held n sequences or fewer: Done has not been called
		// yet and the output would otherwise never be closed.
		if not_done {
			new_iter.Done()
		}
	}()

	return new_iter
}
// Tail returns an iterator delivering only the n last sequences of the
// receiver, in their original order. When the source holds fewer than n
// sequences, all of them are delivered. When n is zero or negative the
// source is consumed and nothing is delivered.
//
// The optional argument sets the buffer size of the new iterator; by
// default it is the one of the receiver. Nothing is emitted before the
// source has been read entirely.
func (iterator IBioSequence) Tail(n int, sizes ...int) IBioSequence {
	buffsize := iterator.BufferSize()

	if len(sizes) > 0 {
		buffsize = sizes[0]
	}

	new_iter := MakeIBioSequence(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		close(new_iter.pointer.channel)
	}()

	go func() {
		if n <= 0 {
			// Nothing to keep: a ring of size n cannot be built (and i%n
			// would divide by zero). Still drain the source so that its
			// producers are not left blocked.
			for iterator.Next() {
			}
			new_iter.Done()
			return
		}

		// Ring buffer holding the n most recently read sequences.
		buffseq := make(BioSequenceSlice, n)

		var i int
		for i = 0; iterator.Next(); i++ {
			buffseq[i%n] = iterator.Get()
		}

		if i > n {
			// The ring has wrapped: the oldest kept sequence is at i%n.
			for j := 0; j < n; j++ {
				new_iter.Channel() <- buffseq[(i+j)%n]
			}
		} else {
			for j := 0; j < i; j++ {
				new_iter.Channel() <- buffseq[j]
			}
		}
		new_iter.Done()
	}()

	return new_iter
}
// Concat returns an iterator delivering the sequences of the receiver
// followed by the sequences of each iterator given as argument, read one
// after the other. When no iterator is given, the receiver itself is
// returned. The new iterator uses the buffer size of the receiver.
func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
	if len(iterators) == 0 {
		return iterator
	}

	chained := MakeIBioSequence(iterator.BufferSize())
	chained.Add(1)

	go func() {
		chained.Wait()
		close(chained.pointer.channel)
	}()

	go func() {
		for iterator.Next() {
			chained.pointer.channel <- iterator.Get()
		}
		for _, following := range iterators {
			for following.Next() {
				chained.pointer.channel <- following.Get()
			}
		}
		chained.Done()
	}()

	return chained
}

19
pkg/obiseq/join.go Normal file
View File

@ -0,0 +1,19 @@
package obiseq
import "git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
// Join returns a new sequence made of the receiver followed by seq2.
// The new sequence takes the identifier and the definition of the
// receiver. When copy_annot is true the annotations of the receiver are
// copied to the new sequence with goutils.CopyMap. The returned error is
// always nil.
func (sequence BioSequence) Join(seq2 BioSequence, copy_annot bool) (BioSequence, error) {
	joined := MakeBioSequence(sequence.Id(), sequence.Sequence(), sequence.Definition())
	joined.Write(seq2.Sequence())

	if copy_annot {
		goutils.CopyMap(joined.Annotations(), sequence.Annotations())
	}

	return joined, nil
}

View File

@ -0,0 +1,196 @@
package obiseq
import (
"log"
"sync"
)
// PairedBioSequenceBatch groups two slices of sequences, forward and
// reverse, with the order number of the batch.
type PairedBioSequenceBatch struct {
forward BioSequenceSlice
reverse BioSequenceSlice
order int
}
// NilPairedBioSequenceBatch is the empty paired batch: nil slices and
// order -1.
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1}
// MakePairedBioSequenceBatch builds a paired batch from a forward and a
// reverse batch sharing the same order number; the program is stopped
// with log.Fatalf when the two order numbers differ.
// Every sequence of the reverse batch is reverse-complemented in place
// before being stored. The slices of both batches are reused, not copied.
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
	if forward.order != reverse.order {
		log.Fatalf("Forward order : %d and reverse order : %d are not matching",
			forward.order, reverse.order)
	}

	// A BioSequence is a handle on shared data: working on the range copy
	// modifies the sequence stored in the slice.
	for _, read := range reverse.slice {
		read.ReverseComplement(true)
	}

	paired := PairedBioSequenceBatch{order: forward.order}
	paired.forward = forward.slice
	paired.reverse = reverse.slice
	return paired
}
// Order returns the order number of the batch.
func (batch PairedBioSequenceBatch) Order() int {
return batch.order
}
// Length returns the number of forward sequences in the batch.
func (batch PairedBioSequenceBatch) Length() int {
return len(batch.forward)
}
// Forward returns the forward sequences (the internal slice, not a copy).
func (batch PairedBioSequenceBatch) Forward() BioSequenceSlice {
return batch.forward
}
// Reverse returns the reverse sequences (the internal slice, not a copy).
func (batch PairedBioSequenceBatch) Reverse() BioSequenceSlice {
return batch.reverse
}
// IsNil reports whether the batch has a nil forward slice.
func (batch PairedBioSequenceBatch) IsNil() bool {
return batch.forward == nil
}
// __ipairedbiosequencebatch__ is the private state of an iterator over
// PairedBioSequenceBatch values, based on a channel.
//
// channel carries the batches, current is the batch returned by Get,
// all_done is the wait group counting the producers still writing to the
// channel, buffer_size is the capacity requested for the channel, and
// p_finished points to the end-of-data flag. Iterators created by Split
// share the channel, the wait group and the flag pointed to by p_finished.
type __ipairedbiosequencebatch__ struct {
channel chan PairedBioSequenceBatch
current PairedBioSequenceBatch
all_done *sync.WaitGroup
buffer_size int
finished bool
p_finished *bool
}
// IPairedBioSequenceBatch is the public handle on a paired batch
// iterator. It only wraps a pointer, so copies of the handle designate
// the same iterator.
type IPairedBioSequenceBatch struct {
pointer *__ipairedbiosequencebatch__
}
// NilIPairedBioSequenceBatch is the handle designating no iterator.
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil}
// MakeIPairedBioSequenceBatch creates a new, empty iterator over paired
// sequence batches.
//
// The first optional argument sets the capacity of the underlying channel
// (1 by default). The iterator starts with no current batch, a fresh wait
// group and its end-of-data flag unset.
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
	capacity := 1
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	state := &__ipairedbiosequencebatch__{
		channel:     make(chan PairedBioSequenceBatch, capacity),
		current:     NilPairedBioSequenceBatch,
		all_done:    new(sync.WaitGroup),
		buffer_size: capacity,
	}
	// The flag is reached through a pointer so that iterators obtained
	// with Split can share it.
	state.p_finished = &state.finished

	return IPairedBioSequenceBatch{state}
}
// Add adds n to the wait group counting the producers of the iterator.
func (iterator IPairedBioSequenceBatch) Add(n int) {
iterator.pointer.all_done.Add(n)
}
// Done signals that one producer has finished writing to the iterator.
func (iterator IPairedBioSequenceBatch) Done() {
iterator.pointer.all_done.Done()
}
// Wait blocks until every registered producer has called Done.
func (iterator IPairedBioSequenceBatch) Wait() {
iterator.pointer.all_done.Wait()
}
// Channel returns the channel carrying the batches of the iterator.
func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
return iterator.pointer.channel
}
// IsNil reports whether the handle designates no iterator.
func (iterator IPairedBioSequenceBatch) IsNil() bool {
return iterator.pointer == nil
}
// BufferSize returns the channel capacity requested at creation time.
func (iterator IPairedBioSequenceBatch) BufferSize() int {
return iterator.pointer.buffer_size
}
// Split returns a new iterator reading from the same channel as the
// receiver. The new iterator shares the wait group and the end-of-data
// flag of its source but owns its current batch.
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
	source := iterator.pointer

	return IPairedBioSequenceBatch{&__ipairedbiosequencebatch__{
		channel:     source.channel,
		current:     NilPairedBioSequenceBatch,
		all_done:    source.all_done,
		buffer_size: source.buffer_size,
		finished:    false,
		p_finished:  source.p_finished,
	}}
}
// Next moves the iterator to the following paired batch.
//
// It returns true when a batch has been read from the channel; that batch
// is then available through Get. It returns false once the channel is
// closed and drained: the current batch is reset to
// NilPairedBioSequenceBatch and the shared end-of-data flag is raised, so
// later calls return immediately.
func (iterator IPairedBioSequenceBatch) Next() bool {
	state := iterator.pointer

	if *state.p_finished {
		return false
	}

	batch, open := <-state.channel
	if !open {
		state.current = NilPairedBioSequenceBatch
		*state.p_finished = true
		return false
	}

	state.current = batch
	return true
}
// Get returns the PairedBioSequenceBatch currently pointed to by the
// iterator. Call Next to move to the next entry before calling Get again
// to retrieve the following batch.
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
return iterator.pointer.current
}
// Finished returns true once the end of the data has been observed by
// Next, on this iterator or on any iterator sharing its end-of-data flag.
func (iterator IPairedBioSequenceBatch) Finished() bool {
return *iterator.pointer.p_finished
}
// SortBatches returns an iterator delivering the paired batches of the
// source in increasing order number, starting at 0.
//
// Batches arriving ahead of their turn are kept in memory until all the
// preceding ones have been forwarded. Batches still waiting when the
// source ends (because an order number is missing) are not emitted.
// The optional argument sets the buffer size of the new iterator.
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	sorted := MakeIPairedBioSequenceBatch(capacity)
	sorted.Add(1)

	go func() {
		sorted.Wait()
		close(sorted.pointer.channel)
	}()

	go func() {
		expected := 0
		waiting := make(map[int]PairedBioSequenceBatch)

		for iterator.Next() {
			batch := iterator.Get()

			if batch.order != expected {
				// Too early: park the batch until its turn comes.
				waiting[batch.order] = batch
				continue
			}

			sorted.pointer.channel <- batch
			expected++

			// Flush the parked batches that are now in sequence.
			for {
				parked, found := waiting[expected]
				if !found {
					break
				}
				sorted.pointer.channel <- parked
				delete(waiting, expected)
				expected++
			}
		}
		sorted.Done()
	}()

	return sorted
}

34
pkg/obiseq/pool.go Normal file
View File

@ -0,0 +1,34 @@
package obiseq
import (
"sync"
)
// __bioseq__pool__ recycles the __sequence__ storage objects. A newly
// created object comes with an annotation map allocated with an initial
// size hint of 50 entries.
var __bioseq__pool__ = sync.Pool{
New: func() interface{} {
var bs __sequence__
bs.annotations = make(Annotation, 50)
return &bs
},
}
// MakeEmptyBioSequence returns an empty sequence. Its storage is taken
// from __bioseq__pool__ and reset, since a recycled object may still hold
// the data of its previous use.
func MakeEmptyBioSequence() BioSequence {
bs := BioSequence{__bioseq__pool__.Get().(*__sequence__)}
bs.Reset()
return bs
}
// MakeBioSequence returns a new sequence initialized with the given
// identifier, sequence bytes and definition.
func MakeBioSequence(id string,
sequence []byte,
definition string) BioSequence {
bs := MakeEmptyBioSequence()
bs.SetId(id)
bs.SetSequence(sequence)
bs.SetDefinition(definition)
return bs
}
// Destroy gives the storage of the sequence back to __bioseq__pool__ and
// turns the handle into a nil sequence. Other copies of the handle keep
// pointing to the recycled storage and must not be used anymore.
//
// Calling Destroy on a nil handle or on an already destroyed sequence is
// a no-op: storing a nil pointer in the pool would make a later
// MakeEmptyBioSequence fail on a nil dereference.
func (sequence *BioSequence) Destroy() {
	if sequence == nil || sequence.sequence == nil {
		return
	}

	__bioseq__pool__.Put(sequence.sequence)
	sequence.sequence = nil
}

26
pkg/obiseq/revcomp.go Normal file
View File

@ -0,0 +1,26 @@
package obiseq
// __revcmp_dna__ is the complement table used by ReverseComplement. It is
// indexed by the five low bits of a character (so 'A'/'a' give 1, 'B'/'b'
// give 2, ...) and holds the upper case complement. The line below lists
// the symbol intended for each index of the table:
// ".ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
// NOTE(review): the table has 31 entries while the index can reach 31, and
// the non-letter entries do not line up with the ASCII codes of '[' and
// ']' — confirm the intended handling of non-letter symbols.
var __revcmp_dna__ = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][")
// ReverseComplement reverse complements a DNA sequence.
//
// If the inplace parameter is true, the operation is done in place and the
// receiver is returned; otherwise it is applied to a copy of the receiver,
// which is left untouched. In both cases the suffix "_revcomp" is appended
// to the identifier of the returned sequence.
//
// Each byte is complemented through the __revcmp_dna__ table indexed by
// its five low bits, and bit 0x20 of the byte is carried over to the
// result, which preserves the case of letters. A byte whose index falls
// outside the table (e.g. '_' or '?') is kept unchanged.
//
// NOTE(review): only the sequence buffer is reversed here; the qualities
// buffer is not touched — confirm this is intended.
func (sequence BioSequence) ReverseComplement(inplace bool) BioSequence {
	if !inplace {
		sequence = sequence.Copy()
	}

	complement := func(c byte) byte {
		code := int(c & 31)
		// The table has fewer than 32 entries: without this guard a byte
		// with its five low bits set would index out of range.
		if code >= len(__revcmp_dna__) {
			return c
		}
		return __revcmp_dna__[code] | (c & 0x20)
	}

	s := sequence.sequence.sequence.Bytes()

	// Swap and complement the two ends, moving towards the middle. When
	// both indices meet, the middle byte is simply complemented.
	for i, j := len(s)-1, 0; i >= j; i, j = i-1, j+1 {
		s[j], s[i] = complement(s[i]), complement(s[j])
	}

	sequence.sequence.id.WriteString("_revcomp")

	return sequence
}

43
pkg/obiseq/subseq.go Normal file
View File

@ -0,0 +1,43 @@
package obiseq
import (
"errors"
"fmt"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
)
// Subsequence returns the part of the sequence starting at position
// 'from' included and ending at position 'to' excluded. Coordinates start
// at position 0.
//
// When circular is true, 'from' may be greater than or equal to 'to': the
// result is then the end of the sequence starting at 'from', followed by
// the beginning of the sequence up to 'to'. Otherwise such coordinates are
// an error, as are coordinates out of the sequence bounds.
//
// The new sequence receives an identifier built from the identifier of
// the receiver and the extracted range, the definition of the receiver,
// and a copy of its annotations.
func (sequence BioSequence) Subsequence(from, to int, circular bool) (BioSequence, error) {
	length := sequence.Length()

	switch {
	case from >= to && !circular:
		return NilBioSequence, errors.New("from greater than to")
	case from < 0 || from >= length:
		return NilBioSequence, errors.New("from out of bounds")
	case to <= 0 || to > length:
		return NilBioSequence, errors.New("to out of bounds")
	}

	var sub BioSequence

	if from >= to {
		// Circular case: tail of the sequence first, then its head.
		sub, _ = sequence.Subsequence(from, length, false)
		sub.Write(sequence.Sequence()[0:to])
	} else {
		sub = MakeEmptyBioSequence()
		sub.Write(sequence.Sequence()[from:to])
		fmt.Fprintf(&sub.sequence.id, "%s_sub[%d..%d]", sequence.Id(), from+1, to)
		sub.sequence.definition.Write(sequence.sequence.definition.Bytes())
	}

	if annotations := sequence.Annotations(); len(annotations) > 0 {
		goutils.CopyMap(sub.Annotations(), annotations)
	}

	return sub, nil
}

1
pkg/obiseq/types.go Normal file
View File

@ -0,0 +1 @@
package obiseq

134
pkg/obiseq/workers.go Normal file
View File

@ -0,0 +1,134 @@
package obiseq
import (
"log"
"time"
)
// SeqAnnotator is a function acting on a sequence without returning
// anything.
type SeqAnnotator func(BioSequence)
// SeqWorker is a function receiving a sequence and returning a sequence.
type SeqWorker func(BioSequence) BioSequence
// SeqSliceWorker is a function receiving a slice of sequences and
// returning a slice of sequences.
type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice
// AnnotatorToSeqWorker adapts a SeqAnnotator to the SeqWorker signature:
// the returned worker applies function to the sequence, then returns that
// same sequence.
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
	return func(seq BioSequence) BioSequence {
		function(seq)
		return seq
	}
}
// MakeIWorker returns an iterator delivering the result of worker applied
// to each sequence of the receiver, in the same order. The work is done
// by a single background goroutine. The optional argument sets the buffer
// size of the new iterator; by default it is the one of the receiver.
func (iterator IBioSequence) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	processed := MakeIBioSequence(capacity)
	processed.Add(1)

	go func() {
		processed.Wait()
		close(processed.pointer.channel)
	}()

	go func() {
		for iterator.Next() {
			processed.pointer.channel <- worker(iterator.Get())
		}
		processed.Done()
	}()

	return processed
}
// MakeIWorker returns an iterator delivering the batches of the receiver
// after worker has been applied to each of their sequences. Sequences are
// replaced in place in the slice of each batch, and batches keep their
// order number.
//
// Several goroutines consume the source concurrently, each through its
// own Split of the receiver, so batches may be delivered out of order.
// The first optional argument is the number of goroutines (4 by default),
// the second one the buffer size of the new iterator (by default the one
// of the receiver).
func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequenceBatch {
	workers, capacity := 4, iterator.BufferSize()
	if len(sizes) > 0 {
		workers = sizes[0]
	}
	if len(sizes) > 1 {
		capacity = sizes[1]
	}

	processed := MakeIBioSequenceBatch(capacity)
	processed.Add(workers)

	go func() {
		processed.Wait()
		// Let the consumers drain the buffered batches before closing.
		for len(processed.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(processed.pointer.channel)
		log.Println("End of the batch workers")
	}()

	run := func(input IBioSequenceBatch) {
		for input.Next() {
			batch := input.Get()
			for i := range batch.slice {
				batch.slice[i] = worker(batch.slice[i])
			}
			processed.pointer.channel <- batch
		}
		processed.Done()
	}

	log.Println("Start of the batch workers")
	for remaining := workers; remaining > 0; remaining-- {
		go run(iterator.Split())
	}

	return processed
}
// MakeISliceWorker returns an iterator delivering the batches of the
// receiver after their slice of sequences has been replaced by the result
// of worker applied to it. Batches keep their order number.
//
// Several goroutines consume the source concurrently, each through its
// own Split of the receiver, so batches may be delivered out of order.
// The first optional argument is the number of goroutines (4 by default),
// the second one the buffer size of the new iterator (by default the one
// of the receiver).
func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes ...int) IBioSequenceBatch {
	workers, capacity := 4, iterator.BufferSize()
	if len(sizes) > 0 {
		workers = sizes[0]
	}
	if len(sizes) > 1 {
		capacity = sizes[1]
	}

	processed := MakeIBioSequenceBatch(capacity)
	processed.Add(workers)

	go func() {
		processed.Wait()
		// Let the consumers drain the buffered batches before closing.
		for len(processed.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(processed.pointer.channel)
		log.Println("End of the batch slice workers")
	}()

	run := func(input IBioSequenceBatch) {
		for input.Next() {
			batch := input.Get()
			batch.slice = worker(batch.slice)
			processed.pointer.channel <- batch
		}
		processed.Done()
	}

	log.Println("Start of the batch slice workers")
	for remaining := workers; remaining > 0; remaining-- {
		go run(iterator.Split())
	}

	return processed
}

View File

@ -0,0 +1,56 @@
package obitax
import (
"regexp"
)
// IFilterOnName returns an iterator over the taxa of the taxonomy
// selected by name.
//
// In strict mode the name is looked up in the name index of the taxonomy:
// the taxa registered under exactly that name are returned, or an empty
// iterator when the name is unknown. Otherwise the whole taxonomy is
// scanned and name is used as a pattern (see ITaxonSet.IFilterOnName).
func (taxonomy *Taxonomy) IFilterOnName(name string, strict bool) *ITaxonSet {
	if !strict {
		return taxonomy.Iterator().IFilterOnName(name, strict)
	}

	if nodes, found := taxonomy.index[name]; found {
		return nodes.Iterator()
	}

	empty := make(TaxonSet)
	return (&empty).Iterator()
}
// IFilterOnName returns an iterator over the taxa of the receiver
// selected by name, each taxid being delivered at most once.
//
// In strict mode a taxon is selected when IsNameEqual(name) is true.
// Otherwise name is compiled as a regular expression and a taxon is
// selected when IsNameMatching is true for that pattern; the function
// panics when name is not a valid regular expression.
func (iterator *ITaxonSet) IFilterOnName(name string, strict bool) *ITaxonSet {
	filtered := NewITaxonSet()
	alreadySent := make(map[int]bool)

	var selects func(*TaxNode) bool
	if strict {
		selects = func(taxon *TaxNode) bool {
			return taxon.IsNameEqual(name)
		}
	} else {
		pattern := regexp.MustCompile(name)
		selects = func(taxon *TaxNode) bool {
			return taxon.IsNameMatching(pattern)
		}
	}

	go func() {
		for iterator.Next() {
			taxon := iterator.Get()
			if _, seen := alreadySent[taxon.taxid]; seen {
				continue
			}
			if selects(taxon) {
				alreadySent[taxon.taxid] = true
				filtered.source <- taxon
			}
		}
		close(filtered.source)
	}()

	return filtered
}

View File

@ -0,0 +1,29 @@
package obitax
// IFilterOnTaxRank returns an iterator over the taxa of the receiver
// whose rank is equal to rank. The filtering runs in a background
// goroutine which closes the new iterator when the source is exhausted.
func (iterator *ITaxonSet) IFilterOnTaxRank(rank string) *ITaxonSet {
	filtered := NewITaxonSet()

	go func() {
		for iterator.Next() {
			if taxon := iterator.Get(); taxon.rank == rank {
				filtered.source <- taxon
			}
		}
		close(filtered.source)
	}()

	return filtered
}
// IFilterOnTaxRank returns an iterator over the taxa of the set whose
// rank is equal to rank.
func (set *TaxonSet) IFilterOnTaxRank(rank string) *ITaxonSet {
return set.Iterator().IFilterOnTaxRank(rank)
}
// IFilterOnTaxRank returns an iterator over the taxa of the slice whose
// rank is equal to rank.
func (slice *TaxonSlice) IFilterOnTaxRank(rank string) *ITaxonSet {
return slice.Iterator().IFilterOnTaxRank(rank)
}
// IFilterOnTaxRank returns an iterator over the taxa of the taxonomy
// whose rank is equal to rank.
func (taxonomy *Taxonomy) IFilterOnTaxRank(rank string) *ITaxonSet {
return taxonomy.Iterator().IFilterOnTaxRank(rank)
}

View File

@ -0,0 +1,59 @@
package obitax
import "reflect"
// IFilterOnSubcladeOf returns an iterator over the taxa of the receiver
// for which IsSubCladeOf(taxon) is true. The filtering runs in a
// background goroutine which closes the new iterator when the source is
// exhausted.
func (iterator *ITaxonSet) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet {
	filtered := NewITaxonSet()

	go func() {
		for iterator.Next() {
			if candidate := iterator.Get(); candidate.IsSubCladeOf(taxon) {
				filtered.source <- candidate
			}
		}
		close(filtered.source)
	}()

	return filtered
}
// IFilterOnSubcladeOf returns an iterator over the taxa of the set
// belonging to the clade of taxon.
func (set *TaxonSet) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet {
return set.Iterator().IFilterOnSubcladeOf(taxon)
}
// IFilterOnSubcladeOf returns an iterator over the taxa of the slice
// belonging to the clade of taxon.
func (slice *TaxonSlice) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet {
return slice.Iterator().IFilterOnSubcladeOf(taxon)
}
// IFilterOnSubcladeOf returns an iterator over the taxa of the taxonomy
// belonging to the clade of taxon.
func (taxonomy *Taxonomy) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet {
return taxonomy.Iterator().IFilterOnSubcladeOf(taxon)
}
// IFilterBelongingSubclades returns an iterator over the taxa of the
// receiver belonging to at least one of the given clades.
//
// With an empty clade set the receiver itself is returned, unfiltered.
// With a single clade the work is delegated to IFilterOnSubcladeOf.
// Otherwise every taxon is tested with IsBelongingSubclades in a
// background goroutine.
func (iterator *ITaxonSet) IFilterBelongingSubclades(clades *TaxonSet) *ITaxonSet {
	switch len(*clades) {
	case 0:
		return iterator
	case 1:
		// Second simplest case: a single subclade is provided.
		only := reflect.ValueOf(*clades).MapKeys()[0]
		return iterator.IFilterOnSubcladeOf((*clades)[int(only.Int())])
	}

	filtered := NewITaxonSet()

	go func() {
		for iterator.Next() {
			if candidate := iterator.Get(); candidate.IsBelongingSubclades(clades) {
				filtered.source <- candidate
			}
		}
		close(filtered.source)
	}()

	return filtered
}

View File

@ -0,0 +1,21 @@
package obitax
// IsSubCladeOf reports whether taxon is parent itself or one of its
// descendants.
//
// The lineage of taxon is followed through the pparent links until the
// taxid of parent is met or a node being its own parent (the root) is
// reached. The function returns false when taxon or parent is nil, and
// when the lineage is interrupted by a missing pparent link instead of
// failing on a nil dereference.
func (taxon *TaxNode) IsSubCladeOf(parent *TaxNode) bool {
	if taxon == nil || parent == nil {
		return false
	}

	for taxon.taxid != parent.taxid && taxon.parent != taxon.taxid {
		if taxon.pparent == nil {
			// Parent links are not set: the lineage cannot be followed.
			return false
		}
		taxon = taxon.pparent
	}

	return taxon.taxid == parent.taxid
}
// IsBelongingSubclades reports whether taxon, or one of its ancestors, is
// a member of the clades set.
//
// The lineage of taxon is followed through the pparent links until a
// taxid present in clades is met or a node being its own parent (the
// root) is reached. The function returns false when taxon or clades is
// nil, and when the lineage is interrupted by a missing pparent link
// instead of failing on a nil dereference.
func (taxon *TaxNode) IsBelongingSubclades(clades *TaxonSet) bool {
	if taxon == nil || clades == nil {
		return false
	}

	_, ok := (*clades)[taxon.taxid]

	for !ok && taxon.parent != taxon.taxid {
		if taxon.pparent == nil {
			// Parent links are not set: the lineage cannot be followed.
			return false
		}
		taxon = taxon.pparent
		_, ok = (*clades)[taxon.taxid]
	}

	return ok
}

99
pkg/obitax/iterator.go Normal file
View File

@ -0,0 +1,99 @@
package obitax
// ITaxonSet is an iterator over taxa, based on a channel.
//
// source carries the taxa, current is the taxon returned by Get, and
// p_finished points to the end-of-data flag, which is the finished field
// of the iterator itself or, after a Split, the flag of the original
// iterator.
type ITaxonSet struct {
source chan *TaxNode
current *TaxNode
finished bool
p_finished *bool
}
// NewITaxonSet creates a new, empty taxon iterator based on an unbuffered
// channel, with no current taxon and its end-of-data flag unset.
func NewITaxonSet() *ITaxonSet {
	iterator := &ITaxonSet{
		source:   make(chan *TaxNode),
		current:  nil,
		finished: false,
	}
	iterator.p_finished = &iterator.finished

	return iterator
}
// Iterator returns an iterator over the taxa of the set. The taxa are
// sent by a background goroutine which closes the iterator channel once
// the whole set has been delivered.
func (set *TaxonSet) Iterator() *ITaxonSet {
	iterator := NewITaxonSet()

	feed := func() {
		for _, taxon := range *set {
			iterator.source <- taxon
		}
		close(iterator.source)
	}
	go feed()

	return iterator
}
// Iterator returns an iterator over the taxa of the slice, delivered in
// slice order. The taxa are sent by a background goroutine which closes
// the iterator channel once the whole slice has been delivered.
func (set *TaxonSlice) Iterator() *ITaxonSet {
	iterator := NewITaxonSet()

	feed := func() {
		for _, taxon := range *set {
			iterator.source <- taxon
		}
		close(iterator.source)
	}
	go feed()

	return iterator
}
// iterator returns an iterator over the nodes of the taxonomy.
// NOTE(review): other methods of this package call taxonomy.Iterator()
// with an upper case initial — confirm that an exported variant is
// defined elsewhere, or that this method was meant to be exported.
func (taxonmy *Taxonomy) iterator() *ITaxonSet {
return taxonmy.nodes.Iterator()
}
// Next moves the iterator to the following taxon.
//
// It returns true when a taxon has been read from the channel; that taxon
// is then available through Get. It returns false once the channel is
// closed and drained: the current taxon is reset to nil and the shared
// end-of-data flag is raised, so later calls return immediately.
func (iterator *ITaxonSet) Next() bool {
	if *iterator.p_finished {
		return false
	}

	taxon, open := <-iterator.source
	if !open {
		iterator.current = nil
		*iterator.p_finished = true
		return false
	}

	iterator.current = taxon
	return true
}
// Get returns the *TaxNode currently pointed to by the iterator.
// Call Next to move to the next entry before calling Get again to
// retrieve the following taxon.
func (iterator *ITaxonSet) Get() *TaxNode {
return iterator.current
}
// Finished returns true once the end of the data has been observed by
// Next, on this iterator or on any iterator sharing its end-of-data flag.
func (iterator *ITaxonSet) Finished() bool {
return *iterator.p_finished
}
// Split returns a new iterator reading from the same channel as the
// receiver and sharing its end-of-data flag, but owning its current taxon.
func (iterator *ITaxonSet) Split() *ITaxonSet {
	return &ITaxonSet{
		source:     iterator.source,
		current:    nil,
		finished:   false,
		p_finished: iterator.p_finished,
	}
}
// TaxonSet consumes the iterator and returns the set of the taxa it
// delivered, indexed by taxid.
func (iterator *ITaxonSet) TaxonSet() *TaxonSet {
	collected := make(TaxonSet)

	for iterator.Next() {
		node := iterator.Get()
		collected[node.taxid] = node
	}

	return &collected
}
// TaxonSlice consumes the iterator and returns the taxa it delivered as a
// slice, in delivery order.
func (iterator *ITaxonSet) TaxonSlice() *TaxonSlice {
	collected := make(TaxonSlice, 0)

	for iterator.Next() {
		collected = append(collected, iterator.Get())
	}

	return &collected
}

36
pkg/obitax/path.go Normal file
View File

@ -0,0 +1,36 @@
package obitax
import (
"errors"
"fmt"
)
// Path returns the taxon followed by each of its ancestors, ending with
// the node that is its own parent.
//
// An error is returned when a nil parent pointer is met before reaching
// such a node.
func (taxon *TaxNode) Path() (*TaxonSlice, error) {
	lineage := append(make(TaxonSlice, 0, 30), taxon)

	for current := taxon; current != current.pparent; {
		current = current.pparent
		if current == nil {
			return nil, errors.New(fmt.Sprint("Taxonomy must be reindexed"))
		}
		lineage = append(lineage, current)
	}

	return &lineage, nil
}
// Path returns a TaxonSlice listing the requested taxon followed by all
// its ancestors in the taxonomy up to the root.
//
// The error of the taxid lookup, or of the path construction, is returned
// unchanged.
func (taxonomy *Taxonomy) Path(taxid int) (*TaxonSlice, error) {
	node, err := taxonomy.Taxon(taxid)
	if err == nil {
		return node.Path()
	}
	return nil, err
}

16
pkg/obitax/ranklist.go Normal file
View File

@ -0,0 +1,16 @@
package obitax
// RankList returns the distinct ranks used by the nodes of the taxonomy.
// The order of the returned ranks follows map iteration and is therefore
// not stable from one call to the next.
func (taxonomy *Taxonomy) RankList() []string {
	seen := make(map[string]bool)
	for _, node := range *taxonomy.nodes {
		seen[node.rank] = true
	}

	ranks := make([]string, 0, 30)
	for rank := range seen {
		ranks = append(ranks, rank)
	}

	return ranks
}

66
pkg/obitax/taxon.go Normal file
View File

@ -0,0 +1,66 @@
package obitax
import (
"regexp"
)
// TaxNode describes one taxon of a taxonomy.
type TaxNode struct {
	taxid          int                 // identifier of the taxon
	parent         int                 // identifier of the parent taxon
	pparent        *TaxNode            // parent node, nil until it has been set
	rank           string              // taxonomic rank of the taxon
	scientificname *string             // scientific name, nil when unknown
	alternatenames *map[string]*string // other names, keyed by name; nil when none
}

// NewTaxNode creates a node for the taxon taxid having the given parent
// taxid and rank. The parent pointer and the names are left unset.
func NewTaxNode(taxid int, parent int, rank string) *TaxNode {
	return &TaxNode{
		taxid:  taxid,
		parent: parent,
		rank:   rank,
	}
}

// ScientificName returns the scientific name of the taxon, or the empty
// string when no scientific name has been set.
func (node *TaxNode) ScientificName() string {
	n := node.scientificname
	if n == nil {
		return ""
	}

	return *n
}

// Rank returns the taxonomic rank of the taxon.
func (node *TaxNode) Rank() string {
	return node.rank
}

// Taxid returns the identifier of the taxon.
func (node *TaxNode) Taxid() int {
	return node.taxid
}

// Parent returns the parent node of the taxon, nil if it has not been set.
func (node *TaxNode) Parent() *TaxNode {
	return node.pparent
}

// IsNameEqual reports whether name is exactly the scientific name or one
// of the alternate names of the taxon.
//
// A taxon without scientific name never matches through its scientific
// name (it used to panic on a nil pointer dereference).
func (node *TaxNode) IsNameEqual(name string) bool {
	if node.scientificname != nil && *node.scientificname == name {
		return true
	}
	if node.alternatenames != nil {
		_, ok := (*node.alternatenames)[name]
		return ok
	}
	return false
}

// IsNameMatching reports whether pattern matches the scientific name or
// one of the alternate names of the taxon.
//
// A taxon without scientific name is only tested on its alternate names
// (it used to panic on a nil pointer dereference).
func (node *TaxNode) IsNameMatching(pattern *regexp.Regexp) bool {
	if node.scientificname != nil && pattern.MatchString(*node.scientificname) {
		return true
	}
	if node.alternatenames != nil {
		for n := range *node.alternatenames {
			if pattern.MatchString(n) {
				return true
			}
		}
	}
	return false
}

135
pkg/obitax/taxonomy.go Normal file
View File

@ -0,0 +1,135 @@
package obitax
import (
"errors"
"fmt"
"log"
)
// TaxName associates a name with its name class.
// Both fields are pointers to strings owned by the caller of MakeTaxName.
type TaxName struct {
name *string
nameclass *string
}
// Taxonomy stores a set of taxa together with its lookup tables.
type Taxonomy struct {
// nodes maps each taxid to its node (filled by AddNewTaxa).
nodes *TaxonSet
// alias maps an old taxid to the node replacing it (filled by AddNewAlias).
alias map[int]*TaxNode
// index maps a name to the set of taxa carrying it (filled by AddNewName).
index map[string]*TaxonSet
}
// NewTaxonomy returns an empty taxonomy: no node, no alias and an empty
// name index.
func NewTaxonomy() *Taxonomy {
	nodes := make(TaxonSet)

	return &Taxonomy{
		nodes: &nodes,
		alias: make(map[int]*TaxNode),
		index: make(map[string]*TaxonSet),
	}
}
// TaxonSet returns the set of nodes of the taxonomy. The returned pointer
// refers to the internal set, not to a copy.
func (taxonomy *Taxonomy) TaxonSet() *TaxonSet {
return taxonomy.nodes
}
// Alias returns a pointer to the internal table mapping deprecated taxids
// to the node replacing them.
func (taxonomy *Taxonomy) Alias() *map[int]*TaxNode {
return &(taxonomy.alias)
}
// Index returns a pointer to the internal table mapping each name to the
// set of taxa carrying it.
func (taxonomy *Taxonomy) Index() *map[string]*TaxonSet {
return &(taxonomy.index)
}
// Length returns the number of nodes of the taxonomy.
func (taxonomy *Taxonomy) Length() int {
return len(*taxonomy.nodes)
}
// Iterator returns an iterator over the nodes of the taxonomy.
func (taxonomy *Taxonomy) Iterator() *ITaxonSet {
return taxonomy.nodes.Iterator()
}
// AddNewTaxa creates the node of taxon taxid, with the given parent taxid
// and rank, stores it in the taxonomy and returns it.
//
// When replace is false, adding a taxid already present is an error.
// When replace is true, an existing node is overwritten.
// The init parameter is currently not used.
func (taxonomy *Taxonomy) AddNewTaxa(taxid, parent int, rank string, replace bool, init bool) (*TaxNode, error) {
	nodes := *taxonomy.nodes

	if _, exists := nodes[taxid]; exists && !replace {
		return nil, errors.New(fmt.Sprintf("Trying to add taxoon %d already present in the taxonomy", taxid))
	}

	node := NewTaxNode(taxid, parent, rank)
	nodes[taxid] = node

	return node, nil
}
// Taxon returns the node of taxon taxid.
//
// When taxid is not a node of the taxonomy but is registered as an alias,
// a deprecation message is logged and the node replacing it is returned.
// An error is returned when taxid is neither a node nor an alias.
func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
	if node, found := (*taxonomy.nodes)[taxid]; found {
		return node, nil
	}

	target, aliased := taxonomy.alias[taxid]
	if !aliased {
		return nil, errors.New(fmt.Sprintf("Taxid %d is not part of the taxonomy", taxid))
	}

	log.Printf("Taxid %d is deprecated and must be replaced by %d", taxid, target.taxid)
	return target, nil
}
// AddNewName attaches a name of class nameclass to the taxon taxid and
// registers the taxon in the name index.
//
// A name of class "scientific name" becomes the scientific name of the
// node; any other name is stored in the alternate names of the node,
// keyed by the name, with its class as value.
//
// An error is returned when name or nameclass is nil, or when taxid is
// unknown to the taxonomy.
func (taxonomy *Taxonomy) AddNewName(taxid int, name, nameclass *string) error {
	if name == nil || nameclass == nil {
		return errors.New("name and name class must not be nil")
	}

	node, node_err := taxonomy.Taxon(taxid)
	if node_err != nil {
		return node_err
	}

	if *nameclass == "scientific name" {
		node.scientificname = name
	} else {
		if node.alternatenames == nil {
			names := make(map[string]*string)
			node.alternatenames = &names
		}
		// The name is recorded whether or not the map was just created:
		// the first alternate name of a node used to be silently dropped.
		(*node.alternatenames)[*name] = nameclass
	}

	i, ok := taxonomy.index[*name]
	if !ok {
		tnm := make(TaxonSet)
		i = &tnm
		taxonomy.index[*name] = i
	}
	(*i)[taxid] = node

	return nil
}
// ReindexParent sets the parent pointer of every node from its parent
// taxid.
//
// It stops and returns an error at the first node whose parent taxid is
// not a node of the taxonomy; the parent pointer of that node is left nil.
func (taxonomy *Taxonomy) ReindexParent() error {
	nodes := *taxonomy.nodes

	for _, taxon := range nodes {
		parent, ok := nodes[taxon.parent]
		taxon.pparent = parent
		if !ok {
			// The arguments used to be swapped, reporting the taxon as
			// the missing parent.
			return fmt.Errorf("Parent %d of taxon %d is not defined in taxonomy",
				taxon.parent,
				taxon.taxid)
		}
	}

	return nil
}
// MakeTaxName returns a new TaxName referencing the given name and name
// class. The pointed strings are not copied.
func MakeTaxName(name, nameclass *string) *TaxName {
	return &TaxName{
		name:      name,
		nameclass: nameclass,
	}
}
// AddNewAlias registers oldtaxid as an alias of the taxon newtaxid.
// The error of the lookup of newtaxid is returned unchanged, in which case
// no alias is recorded.
func (taxonomy *Taxonomy) AddNewAlias(newtaxid, oldtaxid int) error {
	target, err := taxonomy.Taxon(newtaxid)
	if err == nil {
		taxonomy.alias[oldtaxid] = target
	}
	return err
}

15
pkg/obitax/taxonset.go Normal file
View File

@ -0,0 +1,15 @@
package obitax
// TaxonSet is a set of taxa indexed by taxid.
type TaxonSet map[int]*TaxNode
// Get returns the taxon stored under taxid i, or nil when the set has no
// such entry.
func (set *TaxonSet) Get(i int) *TaxNode {
return (*set)[i]
}
// Length returns the number of taxa in the set.
func (set *TaxonSet) Length() int {
return len(*set)
}
// Inserts stores taxon in the set under its taxid, replacing any entry
// already stored under that taxid.
func (set *TaxonSet) Inserts(taxon *TaxNode) {
(*set)[taxon.taxid] = taxon
}

11
pkg/obitax/taxonslice.go Normal file
View File

@ -0,0 +1,11 @@
package obitax
// TaxonSlice is an ordered list of taxa.
type TaxonSlice []*TaxNode
// Get returns the taxon at position i. As any slice indexing, it panics
// when i is out of range.
func (set *TaxonSlice) Get(i int) *TaxNode {
return (*set)[i]
}
// Length returns the number of taxa in the slice.
func (set *TaxonSlice) Length() int {
return len(*set)
}

View File

@ -0,0 +1,138 @@
package obiconvert
import (
"github.com/DavidGamba/go-getoptions"
)
// Storage of the obiconvert command line options, initialised with the
// default value of each option.
var (
	// Input options.
	__skipped_entries__       = 0
	__read_only_entries__     = -1
	__input_fastjson_format__ = false
	__input_fastobi_format__  = false
	__input_ecopcr_format__   = false
	__input_embl_format__     = false
	__input_solexa_quality__  = false

	// Output options.
	__output_in_fasta__        = false
	__output_in_fastq__        = false
	__output_fastjson_format__ = false
	__output_fastobi_format__  = false
	__output_solexa_quality__  = false
)
// InputOptionSet registers on options the command line options controlling
// how sequence files are read: --skip, --only, --input-json-header,
// --input-OBI-header, --ecopcr, --embl and --solexa.
// The parsed values are stored in package-level variables.
func InputOptionSet(options *getoptions.GetOpt) {
options.IntVar(&__skipped_entries__, "skip", 0,
options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
options.IntVar(&__read_only_entries__, "only", -1,
options.Description("Only the N next sequence records of the file are analyzed. The following sequences in the file are neither analyzed, neither reported to the output file. This option can be used conjointly with the skip option."))
options.BoolVar(&__input_fastjson_format__, "input-json-header", false,
options.Description("FASTA/FASTQ title line annotations follow json format."))
options.BoolVar(&__input_fastobi_format__, "input-OBI-header", false,
options.Description("FASTA/FASTQ title line annotations follow OBI format."))
options.BoolVar(&__input_ecopcr_format__, "ecopcr", false,
options.Description("Read data following the ecoPCR output format."))
options.BoolVar(&__input_embl_format__, "embl", false,
options.Description("Read data following the EMBL flatfile format."))
options.BoolVar(&__input_solexa_quality__, "solexa", false,
options.Description("Decodes quality string according to the Solexa specification."))
}
// OutputOptionSet registers on options the command line options controlling
// how sequences are written: --fasta-output, --fastq-output,
// --output-json-header, --output-OBI-header and --solexa-output.
// The parsed values are stored in package-level variables.
func OutputOptionSet(options *getoptions.GetOpt) {
	// The descriptions of the two format options used to be copies of the
	// ecoPCR/EMBL input option descriptions.
	options.BoolVar(&__output_in_fasta__, "fasta-output", false,
		options.Description("Write sequences in fasta format."))

	options.BoolVar(&__output_in_fastq__, "fastq-output", false,
		options.Description("Write sequences in fastq format."))

	options.BoolVar(&__output_fastjson_format__, "output-json-header", false,
		options.Description("output FASTA/FASTQ title line annotations follow json format."))

	options.BoolVar(&__output_fastobi_format__, "output-OBI-header", false,
		options.Description("output FASTA/FASTQ title line annotations follow OBI format."))

	options.BoolVar(&__output_solexa_quality__, "solexa-output", false,
		options.Description("Encodes quality string according to the Solexa specification."))
}
// OptionSet registers on options both the input and the output options of
// obiconvert, input options first.
func OptionSet(options *getoptions.GetOpt) {
InputOptionSet(options)
OutputOptionSet(options)
}
// InputFormat returns the input format selected on the command line:
// "ecopcr" when --ecopcr is set, otherwise "embl" when --embl is set,
// otherwise "guessed".
func InputFormat() string {
	if __input_ecopcr_format__ {
		return "ecopcr"
	}
	if __input_embl_format__ {
		return "embl"
	}
	return "guessed"
}
// OutputFormat returns the output format selected on the command line:
// "fastq" when --fastq-output is set, otherwise "fasta" when
// --fasta-output is set, otherwise "guessed".
func OutputFormat() string {
	if __output_in_fastq__ {
		return "fastq"
	}
	if __output_in_fasta__ {
		return "fasta"
	}
	return "guessed"
}
// InputFastHeaderFormat returns the format of the FASTA/FASTQ title line
// annotations expected on input: "json" when --input-json-header is set,
// otherwise "obi" when --input-OBI-header is set, otherwise "guessed".
func InputFastHeaderFormat() string {
	if __input_fastjson_format__ {
		return "json"
	}
	if __input_fastobi_format__ {
		return "obi"
	}
	return "guessed"
}
func OutputFastHeaderFormat() string {
switch {
case __output_fastjson_format__:
return "json"
case __output_fastobi_format__:
return "obi"
default:
return "json"
}
}
// SequencesToSkip returns the count of sequences to skip at the beginning
// of the processing, as set by the --skip option (0 by default).
func SequencesToSkip() int {
return __skipped_entries__
}
// AnalyzeOnly returns the count of sequence records to analyze, as set by
// the --only option (-1 by default).
func AnalyzeOnly() int {
return __read_only_entries__
}
// InputQualityShift returns the offset used to decode the quality strings
// read on input: 64 when --solexa is set, 33 otherwise.
func InputQualityShift() int {
	shift := 33
	if __input_solexa_quality__ {
		shift = 64
	}
	return shift
}
// OutputQualityShift returns the offset used to encode the quality strings
// written on output: 64 when --solexa-output is set, 33 otherwise.
func OutputQualityShift() int {
	shift := 33
	if __output_solexa_quality__ {
		shift = 64
	}
	return shift
}

View File

@ -0,0 +1,149 @@
package obiconvert
import (
"log"
"os"
"path/filepath"
"strings"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __expand_list_of_files__ returns the files designated by filenames.
//
// Each entry of filenames is either a file or a directory. Directories are
// explored recursively and symbolic links are followed. Files found while
// exploring a directory are kept only if their name ends with one of the
// known sequence file suffixes (fasta, fastq, dat, ecopcr, optionally
// followed by .gz). Files named directly in filenames are filtered in the
// same way only when check_ext is true.
//
// The first error met while exploring the file system is returned.
//
// Compared to the previous implementation:
//   - an error reported by filepath.Walk (for instance a missing file) is
//     returned instead of causing a nil pointer dereference;
//   - the files of a sub-directory are listed once, not twice;
//   - meeting a directory no longer enables the suffix filter for the
//     following entries of filenames.
//
// NOTE(review): a cycle of symbolic links between directories leads to an
// unbounded recursion, as it did before.
func __expand_list_of_files__(check_ext bool, filenames ...string) ([]string, error) {
	suffixes := []string{
		"fasta", "fasta.gz",
		"fastq", "fastq.gz",
		"dat", "dat.gz",
		"ecopcr", "ecopcr.gz",
	}
	is_sequence_file := func(path string) bool {
		for _, suffix := range suffixes {
			if strings.HasSuffix(path, suffix) {
				return true
			}
		}
		return false
	}

	list_of_files := make([]string, 0, 100)

	for _, fn := range filenames {
		// The filter is decided per entry of filenames.
		filter := check_ext

		err := filepath.Walk(fn,
			func(path string, info os.FileInfo, err error) error {
				if err != nil {
					// info may be nil here, it must not be used.
					return err
				}

				via_symlink := false
				for info.Mode()&os.ModeSymlink == os.ModeSymlink {
					via_symlink = true
					path, err = filepath.EvalSymlinks(path)
					if err != nil {
						return err
					}
					info, err = os.Stat(path)
					if err != nil {
						return err
					}
				}

				if info.IsDir() {
					if !via_symlink {
						// filepath.Walk descends into real directories by
						// itself: only the filter has to be enabled.
						filter = true
						return nil
					}

					// filepath.Walk does not follow symbolic links, the
					// target directory is explored explicitly.
					subdir, err := __expand_list_of_files__(true, path)
					if err != nil {
						return err
					}
					list_of_files = append(list_of_files, subdir...)
					return nil
				}

				if !filter || is_sequence_file(path) {
					log.Printf("Appending %s file\n", path)
					list_of_files = append(list_of_files, path)
				}

				return nil
			})

		if err != nil {
			return nil, err
		}
	}

	return list_of_files, nil
}
// ReadBioSequencesBatch returns an iterator over the batches of sequences
// read from filenames, or from the standard input when no file name is
// given.
//
// The title line parser, the quality shift and the file format are
// selected from the command line options (see InputFastHeaderFormat,
// InputQualityShift and InputFormat). Directories listed in filenames are
// expanded to the sequence files they contain. When several files are
// read, their iterators are concatenated.
//
// An error is returned when the list of files cannot be established, when
// it is empty (it used to panic on list_of_files[0]), or when a file
// cannot be opened by the reader.
//
// NOTE(review): the --skip and --only options are not applied, the
// corresponding code is commented out below.
func ReadBioSequencesBatch(filenames ...string) (obiseq.IBioSequenceBatch, error) {
	var iterator obiseq.IBioSequenceBatch
	var reader func(string, ...obiformats.WithOption) (obiseq.IBioSequenceBatch, error)

	opts := make([]obiformats.WithOption, 0, 10)

	switch InputFastHeaderFormat() {
	case "json":
		opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseFastSeqJsonHeader))
	case "obi":
		opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseFastSeqOBIHeader))
	default:
		opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
	}

	opts = append(opts, obiformats.OptionsQualityShift(InputQualityShift()))

	if len(filenames) == 0 {
		switch InputFormat() {
		case "ecopcr":
			iterator = obiformats.ReadEcoPCRBatch(os.Stdin, opts...)
		case "embl":
			iterator = obiformats.ReadEMBLBatch(os.Stdin, opts...)
		default:
			iterator = obiformats.ReadFastSeqBatchFromStdin(opts...)
		}
	} else {
		list_of_files, err := __expand_list_of_files__(false, filenames...)
		if err != nil {
			return obiseq.NilIBioSequenceBatch, err
		}

		// A directory without any sequence file expands to nothing.
		if len(list_of_files) == 0 {
			return obiseq.NilIBioSequenceBatch, &os.PathError{
				Op:   "read",
				Path: strings.Join(filenames, ", "),
				Err:  os.ErrNotExist,
			}
		}

		switch InputFormat() {
		case "ecopcr":
			reader = obiformats.ReadEcoPCRBatchFromFile
		case "embl":
			reader = obiformats.ReadEMBLBatchFromFile
		default:
			reader = obiformats.ReadSequencesBatchFromFile
		}

		iterator, err = reader(list_of_files[0], opts...)
		if err != nil {
			return obiseq.NilIBioSequenceBatch, err
		}

		list_of_files = list_of_files[1:]
		others := make([]obiseq.IBioSequenceBatch, 0, len(list_of_files))

		for _, fn := range list_of_files {
			r, err := reader(fn, opts...)
			if err != nil {
				return obiseq.NilIBioSequenceBatch, err
			}
			others = append(others, r)
		}

		if len(others) > 0 {
			iterator = iterator.Concat(others...)
		}
	}

	// if SequencesToSkip() > 0 {
	// 	iterator = iterator.Skip(SequencesToSkip())
	// }

	// if AnalyzeOnly() > 0 {
	// 	iterator = iterator.Head(AnalyzeOnly())
	// }

	return iterator, nil
}
// ReadBioSequences returns an iterator over the sequences read from
// filenames, or from the standard input when no file name is given.
// Batches are put back in order before being flattened.
//
// When the batch iterator cannot be built, the error is returned together
// with the zero value of obiseq.IBioSequence; no method is called on the
// nil batch iterator anymore.
func ReadBioSequences(filenames ...string) (obiseq.IBioSequence, error) {
	ib, err := ReadBioSequencesBatch(filenames...)
	if err != nil {
		var none obiseq.IBioSequence
		return none, err
	}

	return ib.SortBatches().IBioSequence(), nil
}

View File

@ -0,0 +1,56 @@
package obiconvert
import (
"log"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// WriteBioSequences writes the sequences provided by iterator to the first
// file of filenames, or to the standard output when no file name is given.
//
// The title line format, the quality shift and the file format are
// selected from the command line options (see OutputFastHeaderFormat,
// OutputQualityShift and OutputFormat).
//
// A write error terminates the program through log.Fatalf, so the error
// return value is in practice always nil.
func WriteBioSequences(iterator obiseq.IBioSequence, filenames ...string) error {
	opts := make([]obiformats.WithOption, 0, 10)

	// JSON is the header format for every value but "obi".
	if OutputFastHeaderFormat() == "obi" {
		log.Println("On output use OBI headers")
		opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
	} else {
		log.Println("On output use JSON headers")
		opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
	}

	opts = append(opts, obiformats.OptionsQualityShift(OutputQualityShift()))

	to_stdout := len(filenames) == 0
	format := OutputFormat()

	var err error
	switch {
	case to_stdout && format == "fastq":
		err = obiformats.WriteFastqToStdout(iterator, opts...)
	case to_stdout && format == "fasta":
		err = obiformats.WriteFastaToStdout(iterator, opts...)
	case to_stdout:
		err = obiformats.WriteSequencesToStdout(iterator, opts...)
	case format == "fastq":
		err = obiformats.WriteFastqToFile(iterator, filenames[0], opts...)
	case format == "fasta":
		err = obiformats.WriteFastaToFile(iterator, filenames[0], opts...)
	default:
		err = obiformats.WriteSequencesToFile(iterator, filenames[0], opts...)
	}

	if err != nil {
		log.Fatalf("Write file error: %v", err)
		return err
	}

	return nil
}

View File

@ -0,0 +1,48 @@
// obicount function utility package.
//
// The obitools/obicount package contains every
// function specifically required by the obicount utility.
package obicount
import (
"github.com/DavidGamba/go-getoptions"
)
// Storage of the obicount command line flags, all false until set by the
// option parser.
var (
	__read_count__    bool // --reads
	__variant_count__ bool // --variants
	__symbol_count__  bool // --symbols
)
// OptionSet registers on options the obicount command line options:
// --variants|-v, --reads|-r and --symbols|-s.
// The parsed values are stored in package-level variables.
func OptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__variant_count__, "variants", false,
options.Alias("v"),
options.Description("Prints variant counts."))
options.BoolVar(&__read_count__, "reads", false,
options.Alias("r"),
options.Description("Prints read counts."))
options.BoolVar(&__symbol_count__, "symbols", false,
options.Alias("s"),
options.Description("Prints symbol counts."))
}
// IsPrintingReadCount returns true if the number of reads described in the
// file has to be printed: either --reads is set, or none of the three
// count options is set.
func IsPrintingReadCount() bool {
	if __read_count__ {
		return true
	}
	// Without any selection, every count is printed.
	return !__variant_count__ && !__symbol_count__
}
// IsPrintingVariantCount returns true if the number of sequence variants
// described in the file has to be printed: either --variants is set, or
// none of the three count options is set.
func IsPrintingVariantCount() bool {
	if __variant_count__ {
		return true
	}
	// Without any selection, every count is printed.
	return !__read_count__ && !__symbol_count__
}
// IsPrintingSymbolCount returns true if the number of symbols (sum of the
// sequence lengths) described in the file has to be printed: either
// --symbols is set, or none of the three count options is set.
func IsPrintingSymbolCount() bool {
	if __symbol_count__ {
		return true
	}
	// Without any selection, every count is printed.
	return !__read_count__ && !__variant_count__
}

View File

@ -0,0 +1,87 @@
package obifind
import (
"bytes"
"fmt"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitax"
)
// IFilterRankRestriction returns the filter implementing the --rank
// option. Without rank restriction the returned function gives back its
// argument unchanged; otherwise it filters the iterator on the requested
// rank.
func IFilterRankRestriction() func(*obitax.ITaxonSet) *obitax.ITaxonSet {
	if __restrict_rank__ == "" {
		return func(s *obitax.ITaxonSet) *obitax.ITaxonSet {
			return s
		}
	}

	return func(s *obitax.ITaxonSet) *obitax.ITaxonSet {
		return s.IFilterOnTaxRank(__restrict_rank__)
	}
}
// ITaxonNameMatcher loads the selected taxonomy and returns a function
// giving, for a name, an iterator over the taxa of that taxonomy selected
// by IFilterOnName. The --fixed option is passed to the filter.
//
// The error of the taxonomy loading is returned unchanged.
func ITaxonNameMatcher() (func(string) *obitax.ITaxonSet, error) {
	taxonomy, err := LoadSelectedTaxonomy()
	if err != nil {
		return nil, err
	}

	matcher := func(name string) *obitax.ITaxonSet {
		return taxonomy.IFilterOnName(name, __fixed_pattern__)
	}

	return matcher, nil
}
// ITaxonRestrictions returns a function applying to an iterator the rank
// restriction first, then the restriction to the clades selected on the
// command line.
//
// The error met while building the set of clades is returned unchanged.
func ITaxonRestrictions() (func(*obitax.ITaxonSet) *obitax.ITaxonSet, error) {
	clades, err := TaxonomicalRestrictions()
	if err != nil {
		return nil, err
	}

	by_rank := IFilterRankRestriction()

	restrict := func(iterator *obitax.ITaxonSet) *obitax.ITaxonSet {
		ranked := by_rank(iterator)
		return ranked.IFilterBelongingSubclades(clades)
	}

	return restrict, nil
}
// TaxonAsString formats taxon as a line of five columns separated by
// " | ": the pattern, the taxid, the taxid of the parent, the rank and a
// text.
//
// The text is the scientific name of the taxon or, when the --with-path
// option is set, the scientific names of the taxa of its path joined by
// ':', starting from the last element of the path. When the path cannot be
// computed the error is printed and the scientific name is used; the nil
// path used to be dereferenced in that case.
//
// NOTE(review): taxon.Parent() is used without a nil check, confirm that
// callers only pass taxa whose parent pointer has been set.
func TaxonAsString(taxon *obitax.TaxNode, pattern string) string {
	text := taxon.ScientificName()

	if __with_path__ {
		path, err := taxon.Path()
		if err != nil {
			fmt.Printf("%+v", err)
		} else if path.Length() > 0 {
			var bf bytes.Buffer

			bf.WriteString(path.Get(path.Length() - 1).ScientificName())
			for i := path.Length() - 2; i >= 0; i-- {
				fmt.Fprintf(&bf, ":%s", path.Get(i).ScientificName())
			}

			text = bf.String()
		}
	}

	return fmt.Sprintf("%-20s | %10d | %10d | %-20s | %s",
		pattern,
		taxon.Taxid(),
		taxon.Parent().Taxid(),
		taxon.Rank(),
		text)
}
// TaxonWriter prints on the standard output one line per taxon provided by
// itaxa, formatted by TaxonAsString with the given pattern.
func TaxonWriter(itaxa *obitax.ITaxonSet, pattern string) {
	for more := itaxa.Next(); more; more = itaxa.Next() {
		line := TaxonAsString(itaxa.Get(), pattern)
		fmt.Println(line)
	}
}

View File

@ -0,0 +1,114 @@
package obifind
import (
"errors"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats/ncbitaxdump"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitax"
"github.com/DavidGamba/go-getoptions"
)
// Storage of the obifind command line options, initialised with the
// default value of each option.
var (
	// Taxonomy selection.
	__taxdump__                 = ""
	__alternative_name__        = false
	__rank_list__               = false
	__selected_taxonomy__       = (*obitax.Taxonomy)(nil) // loaded on first use
	__taxonomical_restriction__ = make([]int, 0)

	// Search and display.
	__fixed_pattern__ = false
	__with_path__     = false
	__taxid_path__    = -1
	__taxid_sons__    = -1
	__restrict_rank__ = ""
)
// LoadTaxonomyOptionSet registers on options the options used to select
// and restrict a taxonomy: --taxdump|-t, --rank-list|-l and
// --subclade-of|-s.
//
// When required is true the --taxdump option is declared as required.
// The --alternative-names|-a option is registered only when alternatiive
// is true.
func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) {
// Both branches register the same option, they differ only by the
// options.Required() modifier.
if required {
options.StringVar(&__taxdump__, "taxdump", "",
options.Alias("t"),
options.Required(),
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
} else {
options.StringVar(&__taxdump__, "taxdump", "",
options.Alias("t"),
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
}
if alternatiive {
options.BoolVar(&__alternative_name__, "alternative-names", false,
options.Alias("a"),
options.Description("Enable the search on all alternative names and not only scientific names."))
}
options.BoolVar(&__rank_list__, "rank-list", false,
options.Alias("l"),
options.Description("List every taxonomic rank available iin the taxonomy."))
options.IntSliceVar(&__taxonomical_restriction__, "subclade-of", 1, 1,
options.Alias("s"),
options.Description("Restrict output to some subclades."))
}
// SelectedNCBITaxDump returns the value of the --taxdump option, the empty
// string when the option was not provided.
func SelectedNCBITaxDump() string {
return __taxdump__
}
// AreAlternativeNamesSelected returns true when the --alternative-names
// option is set.
func AreAlternativeNamesSelected() bool {
return __alternative_name__
}
// TaxonomicalRestrictions returns the set of taxa designated by the
// --subclade-of option, looked up in the selected taxonomy.
//
// The error of the taxonomy loading, or of the lookup of one of the
// taxids, is returned unchanged.
func TaxonomicalRestrictions() (*obitax.TaxonSet, error) {
	taxonomy, err := LoadSelectedTaxonomy()
	if err != nil {
		return nil, err
	}

	clades := make(obitax.TaxonSet)
	for i := range __taxonomical_restriction__ {
		node, err := taxonomy.Taxon(__taxonomical_restriction__[i])
		if err != nil {
			return nil, err
		}
		clades.Inserts(node)
	}

	return &clades, nil
}
// LoadSelectedTaxonomy returns the taxonomy stored in the directory given
// by the --taxdump option.
//
// The taxonomy is loaded on the first call and kept for the following
// ones. An error is returned when no taxdump directory was selected or
// when the loading fails.
func LoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
	taxdump := SelectedNCBITaxDump()
	if taxdump == "" {
		return nil, errors.New("No NCBII taxdump selected using option -t|--taxdump")
	}

	if __selected_taxonomy__ != nil {
		return __selected_taxonomy__, nil
	}

	var err error
	__selected_taxonomy__, err = ncbitaxdump.LoadNCBITaxDump(taxdump,
		!AreAlternativeNamesSelected())
	if err != nil {
		return nil, err
	}

	return __selected_taxonomy__, nil
}
// OptionSet registers on options the obifind command line options: the
// taxonomy options (with --taxdump required and --alternative-names
// available), then --fixed|-F, --with-path|-P, --parents|-p and --rank|-r.
func OptionSet(options *getoptions.GetOpt) {
LoadTaxonomyOptionSet(options, true, true)
options.BoolVar(&__fixed_pattern__, "fixed", false,
options.Alias("F"),
options.Description("Match taxon names using a fixed pattern, not a regular expression"))
options.BoolVar(&__with_path__, "with-path", false,
options.Alias("P"),
options.Description("Adds a column containing the full path for each displayed taxon."))
options.IntVar(&__taxid_path__, "parents", -1,
options.Alias("p"),
options.Description("Displays every parental tree's information for the provided taxid."))
options.StringVar(&__restrict_rank__, "rank", "",
options.Alias("r"),
options.Description("Restrict to the given taxonomic rank."))
}
// RequestsPathForTaxid returns the taxid given to the --parents option,
// -1 when the option was not provided.
func RequestsPathForTaxid() int {
return __taxid_path__
}
// RequestsSonsForTaxid returns the value of __taxid_sons__, initialised
// to -1.
// NOTE(review): no option registered in this file sets that variable.
func RequestsSonsForTaxid() int {
return __taxid_sons__
}

View File

@ -0,0 +1,74 @@
package obipairing
import (
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Storage of the obipairing command line options, initialised with the
// default value of each option.
var (
	// Input files.
	__forward_files__ = make([]string, 0, 10)
	__reverse_files__ = make([]string, 0, 10)

	// Alignment parameters.
	__delta__        = 5
	__min_overlap__  = 20
	__gap_penality__ = 2

	// Output.
	__without_stats__ = false
)
// PairingOptionSet registers on options the options specific to the
// pairing of reads: --forward-reads|-F, --reverse-reads|-R, --delta|-D,
// --min-overlap|-O, --gap-penality|-G and --without-stat|-S.
// The parsed values are stored in package-level variables.
func PairingOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&__forward_files__, "forward-reads",
1, 1000,
options.Alias("F"),
options.Description("The file names containing the forward reads"))
options.StringSliceVar(&__reverse_files__, "reverse-reads",
1, 1000,
options.Alias("R"),
options.Description("The file names containing the reverse reads"))
options.IntVar(&__delta__, "delta", 5,
options.Alias("D"),
options.Description("Length added to the fast detected overlap for the precise alignement (default 5)."))
options.IntVar(&__min_overlap__, "min-overlap", 20,
options.Alias("O"),
options.Description("Minimum ovelap between both the reads to consider the aligment (default 20)."))
options.IntVar(&__gap_penality__, "gap-penality", 2,
options.Alias("G"),
options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2)."))
options.BoolVar(&__without_stats__, "without-stat", false,
options.Alias("S"),
options.Description("Remove alignment statistics from the produced consensus sequences."))
}
// OptionSet registers on options the obiconvert options followed by the
// pairing options.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
PairingOptionSet(options)
}
// IBatchPairedSequence opens the files given by the --forward-reads and
// --reverse-reads options and returns an iterator pairing the batches of
// forward reads with the batches of reverse reads.
//
// The error met while opening either set of files is returned unchanged.
func IBatchPairedSequence() (obiseq.IPairedBioSequenceBatch, error) {
	forward, ferr := obiconvert.ReadBioSequencesBatch(__forward_files__...)
	if ferr != nil {
		return obiseq.NilIPairedBioSequenceBatch, ferr
	}

	reverse, rerr := obiconvert.ReadBioSequencesBatch(__reverse_files__...)
	if rerr != nil {
		return obiseq.NilIPairedBioSequenceBatch, rerr
	}

	return forward.PairWith(reverse), nil
}
// Delta returns the value of the --delta option (5 by default).
func Delta() int {
return __delta__
}
// MinOverlap returns the value of the --min-overlap option (20 by default).
func MinOverlap() int {
return __min_overlap__
}
// GapPenality returns the value of the --gap-penality option (2 by default).
func GapPenality() int {
return __gap_penality__
}
// WithStats returns true unless the --without-stat option is set.
func WithStats() bool {
return !__without_stats__
}

View File

@ -0,0 +1,176 @@
package obipairing
import (
"log"
"math"
"os"
"time"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
"github.com/schollz/progressbar/v3"
)
// __abs__ returns the absolute value of x.
func __abs__(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
// JoinPairedSequence returns a sequence made of seqA, ten '.' symbols and
// seqB. Its qualities are the qualities of seqA, ten zeros and the
// qualities of seqB. The identifier and the definition are those of seqA.
func JoinPairedSequence(seqA, seqB obiseq.BioSequence) obiseq.BioSequence {
	la := seqA.Length()
	capacity := la + seqB.Length() + 10

	js := make([]byte, la, capacity)
	copy(js, seqA.Sequence())

	jq := make([]byte, la, capacity)
	copy(jq, seqA.Qualities())

	// Ten symbol long spacer between both reads.
	for i := 0; i < 10; i++ {
		js = append(js, '.')
		jq = append(jq, 0)
	}

	js = append(js, seqB.Sequence()...)
	jq = append(jq, seqB.Qualities()...)

	joined := obiseq.MakeBioSequence(seqA.Id(), js, seqA.Definition())
	joined.SetQualities(jq)

	return joined
}
// AssemblePESequences builds a single sequence from the paired reads seqA
// and seqB.
//
// The reads are aligned with obialign.PEAlign using the gap and delta
// parameters, and a consensus is built from the alignment path. When the
// aligned part of the consensus is at least overlap_min long the consensus
// is returned; otherwise both reads are joined by JoinPairedSequence.
//
// When with_stats is true the returned sequence is annotated: "mode" is
// "alignment" or "join", and in alignment mode "score", "ali_dir",
// "seq_a_single" and/or "seq_b_single", "ali_length", "seq_ab_match" and
// "score_norm" are set.
//
// The three arenas are passed to the obialign functions.
// NOTE(review): presumably they are reusable work buffers — confirm in
// the obialign package.
func AssemblePESequences(seqA, seqB obiseq.BioSequence,
gap, delta, overlap_min int, with_stats bool,
arena_align obialign.PEAlignArena,
arena_cons obialign.BuildAlignArena,
arena_qual obialign.BuildAlignArena) obiseq.BioSequence {
score, path := obialign.PEAlign(seqA, seqB, gap, delta, arena_align)
cons, match := obialign.BuildQualityConsensus(seqA, seqB, path,
arena_cons, arena_qual)
// left and right are the first path step and, when the path ends with
// a 0, the step before that last one.
// NOTE(review): they look like the lengths of the unaligned ends of the
// alignment, their sign telling which read overhangs — confirm against
// obialign.PEAlign.
left := path[0]
right := 0
if path[len(path)-1] == 0 {
right = path[len(path)-2]
}
lcons := cons.Length()
// Length of the consensus once both ends are removed.
ali_length := lcons - __abs__(left) - __abs__(right)
if ali_length >= overlap_min {
if with_stats {
annot := cons.Annotations()
annot["mode"] = "alignment"
annot["score"] = score
if left < 0 {
annot["seq_a_single"] = -left
annot["ali_dir"] = "left"
} else {
annot["seq_b_single"] = left
annot["ali_dir"] = "right"
}
// The right end is written under the same keys as the left end and
// overwrites the value set above when both ends have the same sign.
if right < 0 {
right = -right
annot["seq_a_single"] = right
} else {
annot["seq_b_single"] = right
}
// Fraction of matches over the aligned length, rounded to three
// decimals.
score_norm := float64(0)
if ali_length > 0 {
score_norm = math.Round(float64(match)/float64(ali_length)*1000) / 1000
}
annot["ali_length"] = ali_length
annot["seq_ab_match"] = match
annot["score_norm"] = score_norm
}
} else {
// Overlap too short: the consensus is replaced by the joined reads.
cons = JoinPairedSequence(seqA, seqB)
if with_stats {
annot := cons.Annotations()
annot["mode"] = "join"
}
}
return cons
}
// IAssemblePESequencesBatch assembles every pair of reads provided by
// iterator and returns an iterator over the batches of assembled
// sequences. Each output batch keeps the order number of its input batch.
//
// gap, delta, overlap_min and with_stats are passed to AssemblePESequences
// for every pair; they used to be ignored in favour of the hard-coded
// values 2, 5, 20 and true.
//
// The optional sizes are, in order, the number of worker goroutines
// (7 by default) and the buffer size of the returned iterator (the buffer
// size of the input iterator by default).
//
// A progress bar is written on the standard error.
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
	gap, delta, overlap_min int, with_stats bool, sizes ...int) obiseq.IBioSequenceBatch {

	nworkers := 7
	buffsize := iterator.BufferSize()

	if len(sizes) > 0 {
		nworkers = sizes[0]
	}

	if len(sizes) > 1 {
		buffsize = sizes[1]
	}

	new_iter := obiseq.MakeIBioSequenceBatch(buffsize)

	new_iter.Add(nworkers)

	// Closes the output channel once every worker is done and the channel
	// has been drained.
	go func() {
		new_iter.Wait()
		for len(new_iter.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(new_iter.Channel())
		log.Printf("End of the sequence Pairing")
	}()

	bar := progressbar.NewOptions(
		-1,
		progressbar.OptionSetWriter(os.Stderr),
		progressbar.OptionSetWidth(15),
		progressbar.OptionShowCount(),
		progressbar.OptionShowIts(),
		progressbar.OptionSetDescription("[Sequence Pairing]"))

	f := func(iterator obiseq.IPairedBioSequenceBatch, wid int) {
		// Each worker owns its arenas.
		arena := obialign.MakePEAlignArena(150, 150)
		barena1 := obialign.MakeBuildAlignArena(150, 150)
		barena2 := obialign.MakeBuildAlignArena(150, 150)

		for iterator.Next() {
			batch := iterator.Get()
			cons := make(obiseq.BioSequenceSlice, len(batch.Forward()))
			processed := 0
			for i, A := range batch.Forward() {
				B := batch.Reverse()[i]
				cons[i] = AssemblePESequences(A, B, gap, delta, overlap_min, with_stats,
					arena, barena1, barena2)
				// The bar is updated once 59 more pairs are processed.
				// It used to be credited with 59 pairs at the first pair
				// of each batch, which led to a negative final increment
				// for small batches.
				if (i+1)%59 == 0 {
					bar.Add(59)
					processed += 59
				}
			}
			bar.Add(batch.Length() - processed)
			new_iter.Channel() <- obiseq.MakeBioSequenceBatch(
				batch.Order(),
				cons...,
			)
		}
		new_iter.Done()
	}

	log.Printf("Start of the sequence Pairing")
	for i := 0; i < nworkers; i++ {
		go f(iterator.Split(), i)
	}

	return new_iter
}

View File

@ -0,0 +1,86 @@
package obipcr
import (
"log"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiapat"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Storage of the obipcr command line options, initialised with the default
// value of each option.
var (
	__circular__         = false
	__forward_primer__   string
	__reverse_primer__   string
	__allowed_mismatch__ = 0
	__minimum_length__   = 0
	__maximum_length__   = -1
)
// PCROptionSet registers on options the options specific to the electronic
// PCR: --circular|-c, --forward and --reverse (both required),
// --allowed-mismatches|-e, --min-length|-l and --max-length|-L.
// The parsed values are stored in package-level variables.
func PCROptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__circular__, "circular", false,
options.Alias("c"),
options.Description("Considers that sequences are [c]ircular."))
options.StringVar(&__forward_primer__, "forward", "",
options.Required("You must provide a forward primer"),
options.Description("The forward primer used for the electronic PCR."))
options.StringVar(&__reverse_primer__, "reverse", "",
options.Required("You must provide a reverse primer"),
options.Description("The reverse primer used for the electronic PCR."))
options.IntVar(&__allowed_mismatch__, "allowed-mismatches", 0,
options.Alias("e"),
options.Description("Maximum number of mismatches allowed for each primer."))
options.IntVar(&__minimum_length__, "min-length", 0,
options.Alias("l"),
options.Description("Minimum length of the barcode (primers excluded)."))
options.IntVar(&__maximum_length__, "max-length", -1,
options.Alias("L"),
options.Description("Maximum length of the barcode (primers excluded)."))
}
// OptionSet registers on options the obiconvert options followed by the
// electronic PCR options.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
PCROptionSet(options)
}
func ForwardPrimer() string {
pattern, err := obiapat.MakeApatPattern(__forward_primer__, __allowed_mismatch__)
if err != nil {
log.Fatalf("%+v", err)
}
pattern.Free()
return __forward_primer__
}
func ReversePrimer() string {
pattern, err := obiapat.MakeApatPattern(__reverse_primer__, __allowed_mismatch__)
if err != nil {
log.Fatalf("%+v", err)
}
pattern.Free()
return __reverse_primer__
}
// AllowedMismatch returns the value of the --allowed-mismatches option
// (0 by default).
func AllowedMismatch() int {
return __allowed_mismatch__
}
// Circular returns true when the --circular option is set.
func Circular() bool {
return __circular__
}
// MinLength returns the value of the --min-length option (0 by default).
func MinLength() int {
return __minimum_length__
}
// MaxLength returns the value of the --max-length option (-1 by default).
func MaxLength() int {
return __maximum_length__
}

View File

@ -0,0 +1,32 @@
package obipcr
import (
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiapat"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// PCR runs the electronic PCR configured on the command line over the
// batches provided by iterator and returns an iterator over the resulting
// sequences.
//
// The same number of mismatches is allowed on both primers. The minimum
// and maximum length options are passed on only when strictly positive,
// and the circular option only when set. The returned error is always
// nil.
func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequence, error) {
	forward := ForwardPrimer()
	reverse := ReversePrimer()

	mismatches := AllowedMismatch()
	opts := make([]obiapat.WithOption, 0, 10)
	opts = append(opts,
		obiapat.OptionForwardError(mismatches),
		obiapat.OptionReverseError(mismatches))

	if lmin := MinLength(); lmin > 0 {
		opts = append(opts, obiapat.OptionMinLength(lmin))
	}

	if lmax := MaxLength(); lmax > 0 {
		opts = append(opts, obiapat.OptionMaxLength(lmax))
	}

	if circular := Circular(); circular {
		opts = append(opts, obiapat.OptionCircular(circular))
	}

	worker := obiapat.PCRSliceWorker(forward, reverse, opts...)
	amplicons := iterator.MakeISliceWorker(worker).IBioSequence()

	return amplicons, nil
}

62
test/main.go Normal file
View File

@ -0,0 +1,62 @@
package main
import (
"fmt"
"log"
"os"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// main is a scratch program exercising the paired-end alignment functions
// on two hard-coded sequences. An execution trace is written to the file
// cpu.trace of the current directory.
func main() {
	ftrace, err := os.Create("cpu.trace")
	if err != nil {
		log.Fatal(err)
	}
	// Deferred calls run in reverse order: the trace is stopped before the
	// file is closed.
	defer ftrace.Close()

	if err := trace.Start(ftrace); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	// option_parser := obioptions.GenerateOptionParser(
	// 	obiconvert.InputOptionSet,
	// )

	//_, args, _ := option_parser(os.Args)

	// fs, _ := obiconvert.ReadBioSequences(args...)
	// buffer := make([]byte, 0)
	// fs.Next()
	// s := fs.Get()
	// index := obikmer.Index4mer(s, nil, nil)
	// for fs.Next() {
	// 	s := fs.Get()
	// 	if s.IsNil() {
	// 		log.Panicln("Read sequence is nil")
	// 	}
	// 	maxshift, maxcount := obikmer.FastShiftFourMer(index, s, buffer)
	// 	fmt.Printf("Shift : %d Score : %d\n", maxshift, maxcount)
	// }

	A := []byte("ccgcctccttagaacaggctcctctagaaaaccatagtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg")
	B := []byte("cgccaccaccgagatctacactctttccctacacgacgctcttccgatctccgcctccttagaacaggctcctctagaaaagcatagtggggtatctaaaggaggcgg")
	sA := obiseq.MakeBioSequence("A", A, "")
	sB := obiseq.MakeBioSequence("B", B, "")

	fmt.Println(string(sA.Sequence()))
	fmt.Println(sA.Qualities())
	fmt.Println(string(sB.Sequence()))
	fmt.Println(sB.Qualities())

	score, path := obialign.PELeftAlign(sA, sB, 2, obialign.NilPEAlignArena)
	fmt.Printf("Score : %d Path : %v\n", score, path)
	score, path = obialign.PERightAlign(sA, sB, 2, obialign.NilPEAlignArena)
	fmt.Printf("Score : %d Path : %v\n", score, path)

	fmt.Println(string(sA.Sequence()))
	sA.ReverseComplement(true)
	fmt.Println(string(sA.Sequence()))
	fmt.Println(string(sA.Id()))
}

58
test/test_test.go Normal file
View File

@ -0,0 +1,58 @@
package main_test
import (
"fmt"
"testing"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiannot"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// TestParseOBIFasta reads a sequence file, extracts the headers with the
// OBI header parser and prints the number of sequences and the sum of
// their counts. It makes no assertion.
//
// NOTE(review): the file is designated by an absolute path on a
// developer's machine, and the fastseq identifier used below is not
// imported by this file.
func TestParseOBIFasta(t *testing.T) {
f := "/Users/coissac/travail/Adeline/Soumission_data/Zonation/euka03/euka03.ecotag.fasta.gz"
var nseq, nread int
nseq = 0
nread = 0
fs := obiformats.ReaderFromIlluminaFile(f)
fmt.Println(f)
// Each value received from the channel is a collection of sequences.
for i := range obiannot.ExtractHeaderChannel(fs, fastseq.ParseOBIHeader) {
for _, s := range i {
nseq++
nread += s.Count()
}
}
fmt.Println(nseq, nread)
}
// ExtractHeaderChannel is an unimplemented placeholder: it always panics.
// The functions of this file call obiannot.ExtractHeaderChannel, not this
// one.
func ExtractHeaderChannel(fs fastseq.IFastSeq, sequence func(sequence obiseq.Sequence)) {
panic("unimplemented")
}
// BenchmarkParseOBIFasta reads a sequence file, extracts the headers with
// the OBI header parser and prints the number of sequences and the sum of
// their counts.
//
// NOTE(review): the body runs once and does not loop over the iteration
// count of the testing.B argument, so the reported timing is not a per
// iteration measure. The file is designated by an absolute path on a
// developer's machine, and the fastseq identifier is not imported by this
// file.
func BenchmarkParseOBIFasta(t *testing.B) {
f := "/Users/coissac/travail/Adeline/Soumission_data/Zonation/euka03/euka03.ecotag.fasta.gz"
var nseq, nread int
nseq = 0
nread = 0
fs := fastseq.ReaderFromIlluminaFile(f)
fmt.Println(f)
// Each value received from the channel is a collection of sequences.
for i := range obiannot.ExtractHeaderChannel(fs, fastseq.ParseOBIHeader) {
for _, s := range i {
nseq++
nread += s.Count()
}
}
fmt.Println(nseq, nread)
}