
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
98 lines
1.3 KiB
Awk
98 lines
1.3 KiB
Awk
#
|
|
# get feature info from genbank
|
|
#
|
|
|
|
# @include libgbk.awk
|
|
|
|
function GC(s, _local_, i, len) {
|
|
s = toupper(s)
|
|
len = length(s)
|
|
gsub("G|C", "", s)
|
|
return ((len - length(s)) * 100 / len)
|
|
}
|
|
|
|
#
|
|
# rules
|
|
#
|
|
|
|
BEGIN {
|
|
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
|
|
}
|
|
|
|
/^LOCUS/ {
|
|
locus = $2
|
|
next
|
|
}
|
|
|
|
/^ ORGANISM/ {
|
|
orga = substr($0, 13)
|
|
split(orga, a, ";")
|
|
orga = a[1]
|
|
gsub(" ", "_", orga)
|
|
next
|
|
}
|
|
|
|
/^ source/ {
|
|
GetLoc($2, loc);
|
|
len = loc[2];
|
|
next
|
|
}
|
|
|
|
/^ CDS/ {
|
|
meanCds = meanCds * nbCds + LenLocation($2)
|
|
nbCds++
|
|
meanCds /= nbCds
|
|
n = Nexons($2)
|
|
if (n > 3) n = 3
|
|
nbCdx[n]++
|
|
next
|
|
}
|
|
|
|
/^ tRNA/ {
|
|
nbTrna++
|
|
next
|
|
}
|
|
|
|
/^ rRNA/ {
|
|
nbRrna++
|
|
next
|
|
}
|
|
|
|
/^ mRNA/ {
|
|
next
|
|
}
|
|
|
|
/^ .*RNA/ {
|
|
nbOrna++
|
|
next
|
|
}
|
|
|
|
/^ORIGIN/ {
|
|
inseq = 1
|
|
seq = ""
|
|
next
|
|
}
|
|
|
|
inseq && /^ +[1-9][0-9]*/ {
|
|
s = substr($0, 11)
|
|
gsub(" ", "", s)
|
|
seq = seq "" s
|
|
next
|
|
}
|
|
|
|
/^\/\// {
|
|
oklen = (len == length(seq) ? "ok" : "wrong")
|
|
gc = GC(seq)
|
|
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
|
|
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
|
|
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
|
|
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
|
|
delete nbCdx
|
|
orga = locus = "?"
|
|
}
|
|
|
|
|
|
|
|
|
|
|