Files
annotate/detectors/cds/lib/bestclust.awk
2025-05-22 05:36:04 +02:00

159 lines
3.1 KiB
Awk

#
# select best cluster(s)
#
# -v MAX_SPAN ALLOW_STOP EXCLUDE
#
#
#
BEGIN {
PROCINFO["sorted_in"] = "@ind_num_asc"
if (MAX_SPAN == "") MAX_SPAN = 10000
if (ALLOW_STOP == "") ALLOW_STOP = 0
if (EXCLUDE == "") EXCLUDE = 0
}
/^# --- START OF GFF DUMP ---/ {
State = "gff"
NbEntry++
next
}
/^# --- END OF GFF DUMP ---/ {
State = 0
next
}
/^C4 Alignment:/ {
for (i = 1 ; i <= 8 ; i++)
getline
State = "align"
Align = ""
next
}
/^#/ { next }
(State == 0) { next }
(State == "gff") && ($3 == "gene") {
span = $5 - $4 + 1
valid = (span < MAX_SPAN) \
&& (ALLOW_STOP || (Align !~ "\\*")) \
&& ((EXCLUDE == 0) || (! (Organism ~ EXCLUDE)))
Entry[NbEntry]["valid"] = valid
Entry[NbEntry]["from"] = $4+0
Entry[NbEntry]["to"] = $5+0
Entry[NbEntry]["score"] = $6+0
Entry[NbEntry]["strand"] = $7
Entry[NbEntry]["stop"] = (Align !~ "\\*")
if (valid) {
for (i = $4+0 ; i <= $5+0; i++)
Cover[i] = 1
}
}
(State == "gff") && ($3 == "exon") {
Entry[NbEntry]["nbexon"]++
}
(State == "gff") {
n = ++Entry[NbEntry]["nbline"]
$1=$2=""
Entry[NbEntry][n] = $0
}
(State == "align") && /^vulgar/ {
State = 0
split($2, a, "@")
Organism = a[1]
next
}
(State == "align") {
getline; getline
Align = Align "" $0
getline; getline
next
}
END {
ClustScoreMax=0
# make clusters
pi = -1
for (i in Cover) {
if (i+0 > pi+1)
Clust[++NbClust]["from"] = i
pi = Clust[NbClust]["to"] = i
}
# get highest scoring clusters
for (i = 1 ; i <= NbEntry ; i++) {
valid = Entry[i]["valid"]
if (! valid) continue
clusno = 0
for (j = 1; j <= NbClust; j++) {
if ((Entry[i]["from"]+0 >= Clust[j]["from"]+0) && (Entry[i]["to"]+0 <= Clust[j]["to"]+0)) {
clusno = j
}
}
valid = (clusno != 0)
if (! valid) continue
score = Entry[i]["score"]
pfrom = Entry[i]["from"]
if (score > Clust[clusno]["score"]+0 ||
(score == Clust[clusno]["score"]+0 && pfrom < Clust[clusno]["from"]+0)) {
Clust[clusno]["score"] = score
Clust[clusno]["strand"] = Entry[i]["strand"]
Clust[clusno]["entry"] = i
if (score > ClustScoreMax)
ClustScoreMax=score
}
}
# Consider only cluster with at least 95% of the best score
NbClustOk=0
for (i = 1 ; i <= NbClust ; i++) {
if (Clust[i]["score"]+0 >= ClustScoreMax * 0.95)
NbClustOk++
else
Clust[i]["score"]=0
}
# print cluster info
print "c nclust", NbClustOk+0
for (i = 1 ; i <= NbClust ; i++) {
if (Clust[i]["score"]+0 == 0) continue
print "c cluster", i, "from", Clust[i]["from"], "to", Clust[i]["to"],\
"strand", Clust[i]["strand"], "score", Clust[i]["score"]
}
# print best clusters
for (i = 1 ; i <= NbClust ; i++) {
if (Clust[i]["score"]+0 == 0) continue
j = Clust[i]["entry"]
s = Clust[i]["strand"]
n = Entry[j]["nbline"]
ne = Entry[j]["nbexon"]
print "c begin_entry", j, "cluster", i, "strand", s, "nbexon", ne
for (k = 1 ; k <= n ; k++) {
entry = Entry[j][k]
gsub("^ +", "", entry)
print "e", entry
}
print "c end_entry", j, "cluster", i
}
}