CDS detector added

Former-commit-id: 93fac7a70052d06c2a12bf8af59820c653edd31b
Former-commit-id: 0869fdad0f550941a0f78f1e4c57f4fcdb3f6076
This commit is contained in:
alain viari
2015-11-08 14:28:05 +01:00
parent 50d5458bc1
commit 8dab2d56b2
648 changed files with 290109 additions and 543 deletions

View File

@ -0,0 +1,131 @@
#
# select best cluster(s)
#
# -v MAX_SPAN ALLOW_STOP EXCLUDE
#
BEGIN {
PROCINFO["sorted_in"] = "@ind_num_asc"
if (MAX_SPAN == "") MAX_SPAN = 10000
if (ALLOW_STOP == "") ALLOW_STOP = 0
if (EXCLUDE == "") EXCLUDE = 0
}
/^\# --- START OF GFF DUMP ---/ {
State = "gff"
NbEntry++
next
}
/^\# --- END OF GFF DUMP ---/ {
State = 0
next
}
/^C4 Alignment:/ {
for (i = 1 ; i <= 8 ; i++)
getline
State = "align"
Align = ""
next
}
/^\#/ { next }
(State == 0) { next }
(State == "gff") && ($3 == "gene") {
span = $5 - $4 + 1
valid = (span < MAX_SPAN) \
&& (ALLOW_STOP || (Align !~ "\\*")) \
&& ((EXCLUDE == 0) || (! (Organism ~ EXCLUDE)))
Entry[NbEntry]["valid"] = valid
Entry[NbEntry]["from"] = $4+0
Entry[NbEntry]["to"] = $5+0
Entry[NbEntry]["score"] = $6+0
Entry[NbEntry]["strand"] = $7
if (valid) {
for (i = $4+0 ; i <= $5+0; i++)
Cover[i] = 1
}
}
(State == "gff") && ($3 == "exon") {
Entry[NbEntry]["nbexon"]++
}
(State == "gff") {
n = ++Entry[NbEntry]["nbline"]
$1=$2=""
Entry[NbEntry][n] = $0
}
(State == "align") && /^vulgar/ {
State = 0
split($2, a, "@")
Organism = a[1]
next
}
(State == "align") {
getline; getline
Align = Align "" $0
getline; getline
next
}
END {
# make clusters
pi = -1
for (i in Cover) {
if (i+0 > pi+1)
Clust[++NbClust]["from"] = i
pi = Clust[NbClust]["to"] = i
}
# get highest scoring clusters
for (i = 1 ; i <= NbEntry ; i++) {
valid = Entry[i]["valid"]
if (! valid) continue
clusno = 0
for (j = 1; j <= NbClust; j++) {
if ((Entry[i]["from"] >= Clust[j]["from"]) && (Entry[i]["to"] <= Clust[j]["to"]))
clusno = j
}
valid = (clusno != 0)
if (! valid) continue
score = Entry[i]["score"]
if (score > Clust[clusno]["score"]+0) {
Clust[clusno]["score"] = score
Clust[clusno]["strand"] = Entry[i]["strand"]
Clust[clusno]["entry"] = i
}
}
# print cluster info
print "c nclust", NbClust+0
for (i = 1 ; i <= NbClust ; i++) {
print "c cluster", i, "from", Clust[i]["from"], "to", Clust[i]["to"],\
"strand", Clust[i]["strand"], "score", Clust[i]["score"]
}
# print best clusters
for (i = 1 ; i <= NbClust ; i++) {
if (Clust[i]["score"] == 0) continue
j = Clust[i]["entry"]
s = Clust[i]["strand"]
n = Entry[j]["nbline"]
ne = Entry[j]["nbexon"]
print "c begin_entry", j, "cluster", i, "strand", s, "nbexon", ne
for (k = 1 ; k <= n ; k++) {
entry = Entry[j][k]
gsub("^ +", "", entry)
print "e", entry
}
print "c end_entry", j, "cluster", i
}
}

View File

@ -0,0 +1,18 @@
#
{
line = $0
if (length(line) > 80) {
print substr(line, 1, 80)
rest = substr(line, 81)
while (length(rest) > 59) {
print "FT " substr(rest, 1, 59)
rest = substr(rest, 60)
}
if (length(rest) > 0)
print "FT " rest
}
else {
print line
}
}

View File

@ -0,0 +1,135 @@
#
# extend start/stop
#
# -v FASTA START_MODEL STOP_MODEL START_WALK STOP_WALK
#
function UpStart(pos, strand, _local_, seq, i, imax, smax, s) {
seq = Seq
if (strand == "-") {
pos = LenSeq - pos + 1
seq = RevSeq
}
imax = 0
smax = 0
for (i = pos ; i >= Max(1, pos-START_WALK) ; i -= 3) {
s = substr(seq, i, 3)
if (s in StopModel) break
if ((s in StartModel) && (StartModel[s] > smax)) {
imax = i
smax = StartModel[s]
}
}
if (strand == "-") {
imax = (imax > 0) ? LenSeq - imax + 1 : imax
}
return imax
}
#
function DownStop(pos, strand, _local_, seq, i, imax, s) {
seq = Seq
if (strand == "-") {
pos = LenSeq - pos + 1
seq = RevSeq
}
imax = 0
for (i = pos ; i < Min(LenSeq, pos+STOP_WALK) ; i += 3) {
s = substr(seq, i, 3)
if (s in StopModel) {
imax = i
break
}
}
if (strand == "-") {
imax = (imax > 0) ? LenSeq - imax + 1 : imax
}
return imax
}
#
# rules
#
BEGIN {
if (START_MODEL == "") START_MODEL="Models/start.default.frq"
if (STOP_MODEL == "") STOP_MODEL="Models/stop.default.frq"
if (START_WALK == "") START_WALK=120
if (STOP_WALK == "") STOP_WALK=-1
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
Seq = ReadFasta(FASTA)
LenSeq = length(Seq)
RevSeq = RevComplement(Seq)
if (START_WALK < 0) START_WALK = LenSeq
if (STOP_WALK < 0) STOP_WALK = LenSeq
if (! TestPath(START_MODEL)) Error("model file: '" START_MODEL "' not found", 2)
if (! TestPath(STOP_MODEL)) Error("model file: '" STOP_MODEL "' not found", 2)
ReadModel(START_MODEL, StartModel)
ReadModel(STOP_MODEL, StopModel)
}
#
/^c begin_entry/ {
Strand = $7
nbexon = $9+0
StartExon = 1 # always first (even on - strand)
StopExon = nbexon # always last (even on - strand)
NbExon = 0
}
/^c/ {
print $0
next
}
/^e exon/ {
NbExon++
if (NbExon == StartExon) {
pos = (Strand == "+" ? $3+0 : $4+0)
start = UpStart(pos, Strand)
mod_start = (start == 0 ? (Strand == "+" ? "<" : ">") : "=")
start = (start == 0 ? pos : start)
$(Strand == "+" ? 3 : 4) = start
} else {
mod_start = "="
}
if (NbExon == StopExon) {
pos = (Strand == "+" ? $4+0 : $3+0)
pos += (Strand == "+" ? 1 : -1)
stop = DownStop(pos, Strand)
mod_stop = (stop == 0 ? (Strand == "+" ? ">" : "<") : "=")
last = (stop == 0 ? pos : stop)
last += (Strand == "+" ? -1 : 1)
stop = last + (stop == 0 ? 0 : (Strand == "+") ? 3 : -3)
$(Strand == "+" ? 4 : 3) = stop
} else {
mod_stop = "="
}
modif = (Strand == "+" ? mod_start "" mod_stop : mod_stop "" mod_start)
print $0, "; modifier \"" modif "\""
next
}
/^e (intron|splice|similarity)/ {
print $0
next
}

View File

@ -0,0 +1,27 @@
#
#
# filter BlastX results for speedup
#
BEGIN {
if (IDMIN == "") IDMIN = 80
if (NBMIN == "") NBMIN = 50
if (NBMAX == "") NBMAX = 200
}
/^#/ { next }
{
if ((($3+0 >= IDMIN) || (HitNum <= NBMIN)) && (HitIndex[$2] == 0)) {
HitIndex[$2] = ++HitNum
IndexHit[HitNum] = $2
}
}
END {
n = (HitNum > NBMAX) ? NBMAX : HitNum
for (i = 1 ; i <= n ; i++)
print IndexHit[i]
}

View File

@ -0,0 +1,62 @@
#!/bin/csh -f
#
# filter a DB thru BlastX
# usually to speedup further DB search
#
# output on stdout
#
# usage: go_filterbx.sh dna_fasta prot_fasta [idmin nbmin nbmax]
#
setenv ORG_HOME `dirname $0`/../../..
source $ORG_HOME/scripts/csh_init.sh
NeedArg 2
set GenoFile = $Argv[1]; Shift
set ProtFile = $Argv[1]; Shift
NeedFile $GenoFile
NeedFile $ProtFile
set IDMIN = 70
set NBMIN = 50
set NBMAX = 200
if ($#Argv >= 1) set IDMIN = $Argv[1]
if ($#Argv >= 2) set NBMIN = $Argv[2]
if ($#Argv >= 3) set NBMAX = $Argv[3]
#
# format ProtFile
#
if (! -e $ProtFile.phr) then
Notify " formatting $ProtFile"
(makeblastdb -dbtype prot -in $ProtFile) >& /dev/null
CheckAbort 10 "makeblastdb failure"
endif
#
# blastx
#
Notify " blasting $ProtFile"
blastx -query $GenoFile -db $ProtFile -outfmt 7 |\
$AwkCmd -v IDMIN=$IDMIN \
-v NBMIN=$NBMIN \
-v NBMAX=$NBMAX \
-f $LIB_DIR/filterbx.awk > T_$$
#
# extract subdb
#
$AwkCmd -v FILE=T_$$ -f $LIB_DIR/subdb.awk $ProtFile
#
# end
#
(\rm -f ?_$$) >> /dev/null
Exit 0

197
detectors/cds/lib/go_pass1.sh Executable file
View File

@ -0,0 +1,197 @@
#!/bin/csh -f
#
# Annotate CDS - Pass1
#
#========================================================================================
#
# Annotate CDS of chlorodb/core proteins using exonerate
#
# pass1.sh <FASTAFILE> <FAMILY> [<OUTDIR>]
#
# - <FASTAFILE> : The fasta file containing the genome to annotate
# - <FAMILY> : Name of the protein family (defined in chlorodb/core)
#
# Results are in file : `basename <FASTAFILE>:r`.<FAMILY>.res
#
#========================================================================================
#
# usage: go_pass1.sh fasta family [outdir]
#
setenv ORG_HOME `dirname $0`/../../..
source $ORG_HOME/scripts/csh_init.sh
set PARAMS_DIR = $LIB_DIR/../params
set MODELS_DIR = $LIB_DIR/../models
alias Override 'if (-e \!:2) set \!:1 = \!:2'
NeedArg 2
set GenoFile = $Argv[1]
set GenoName = `basename $GenoFile:r`
set ProtName = $Argv[2]
set ProtDir = $CDS_DATA_DIR/chlorodb/core
set ProtFile = $ProtDir/$ProtName.fst
NeedFile $GenoFile
NeedFile $ProtFile
set OutDir = .
if ($#Argv >= 3) set OutDir = $3
if (! -d $OutDir) mkdir $OutDir
#
# general parameters
#
source $PARAMS_DIR/default
#
# family specific parameters
#
if (-e $PARAMS_DIR/$ProtName) then
source $PARAMS_DIR/$ProtName
endif
#
# start/stop/splice models
#
if ($?STARTMODEL == 0) then
set STARTMODEL = $MODELS_DIR/start.default.frq
Override STARTMODEL $MODELS_DIR/start.$ProtName.frq
endif
if ($?STOPMODEL == 0) then
set STOPMODEL = $MODELS_DIR/stop.default.frq
Override STOPMODEL $MODELS_DIR/stop.$ProtName.frq
endif
if ($?SPLICE3MODEL == 0) then
set SPLICE3MODEL = $MODELS_DIR/splice3.default.frq
Override SPLICE3MODEL $MODELS_DIR/splice3.$ProtName.frq
endif
if ($?SPLICE5MODEL == 0) then
set SPLICE5MODEL = $MODELS_DIR/splice5.default.frq
Override SPLICE5MODEL $MODELS_DIR/splice5.$ProtName.frq
endif
#
# out files prefix
#
set base = $OutDir/$GenoName.$ProtName
#
# skip exonerate calculations if already done
#
if (-e $base.exo.raw) then
Notify " file $base.exo.raw found <exonerate skipped>"
goto parse
endif
#
# speedup exonerate
#
if ($PASS1_SPEEDUP != 0) then
$LIB_DIR/go_filterbx.sh $GenoFile $ProtFile \
$PASS1_BLASTX_FILTER_IDMIN \
$PASS1_BLASTX_FILTER_NBMIN \
$PASS1_BLASTX_FILTER_NBMAX > D_$$
set n = `egrep "^>" D_$$ | wc -l`
if ($n > 0) then
Notify " $n sequences kept"
set DbFile = D_$$
else
Notify " no sequence match"
if ($PASS1_SLOWDOWN != 0) then
Notify " reverting to original $ProtName"
set DbFile = $ProtFile
else
echo "" > $base.exo.raw
goto parse
endif
endif
else
set DbFile = $ProtFile
endif
#
# run exonerate
#
Notify " running exonerate of $GenoName on $ProtName"
exonerate --model protein2genome \
--percent $PASS1_PERCENT \
--showalignment TRUE \
--showvulgar TRUE \
--showtargetgff TRUE \
--geneticcode $PASS1_GENETIC_CODE \
--minintron $PASS1_MIN_INTRON \
--maxintron $PASS1_MAX_INTRON \
--bestn $PASS1_BESTN \
--frameshift $PASS1_FRAMESHIFT \
--proteinsubmat $PASS1_SUBMAT \
--splice3 $SPLICE3MODEL \
--splice5 $SPLICE5MODEL \
$DbFile $GenoFile > $base.exo.raw
CheckAbort 20 "exonerate failure"
#
# extract best clusters
#
parse:
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
-v ALLOW_STOP=$PASS1_ALLOW_STOP \
-v EXCLUDE=$GenoName \
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
#
# get annotations
#
egrep "^$ProtName " $CDS_DATA_DIR/chlorodb/core/Annot.lst |\
awk '{print "c annot", $0}' > T_$$
#
# extend start/stop
#
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
-v FASTA=$GenoFile \
-v START_MODEL=$STARTMODEL \
-v STOP_MODEL=$STOPMODEL \
-v START_WALK=$PASS1_START_WALK \
-v STOP_WALK=$PASS1_STOP_WALK \
$base.exo.best >> T_$$
#
# translate
#
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
-f $LIB_DIR/translate.awk T_$$ > $base.iff
#
# convert to embl
#
$AwkCmd -f $LIB_DIR/toEmbl.awk $base.iff |\
$AwkCmd -f $LIB_DIR/cutline.awk > $base.res
#
# end
#
Notify " output file: $base.res"
(\rm -f ?_$$) >> /dev/null
Exit 0

View File

@ -0,0 +1,179 @@
#
# utilities library
#
END {
if (_ForceExit_) exit(_ForceExit_)
}
function Notify(key, msg) {
print "# " key " " msg >> "/dev/stderr"
}
function Info(msg) {
Notify("info", msg)
return 1
}
function Warning(msg) {
Notify("warning", msg)
return 0
}
function Exit(status) {
exit(_ForceExit_ = status)
}
function Error(msg, status) {
Notify("error", msg)
Exit(status ? status : 1)
}
function Assert(condition, msg, status) {
if (! condition) {
msg = FILENAME ":" FNR ": " msg
return status ? Error(msg, status) : Warning(msg)
}
}
#
function Min(a, b) {
return (a < b ? a : b)
}
function Max(a, b) {
return (a > b ? a : b)
}
#
function Trim(s, regexp) {
if (regexp == 0) regexp = "[ \t]+"
gsub("^" regexp "|" regexp "$", "", s)
return s
}
function ShieldPath(path) {
gsub(" ", "\\ ", path)
return path
}
function TestPath(path, test, _local_, stat) {
if (test == 0) test = "-f"
if (Trim(path) == "")
return 0 # because of a bug in 'test'
stat = system("test " test " " ShieldPath(path))
return stat ? 0 : 1
}
#
function Reverse(s, _local_, i, n, rs) {
rs = "";
n = length(s);
for (i = n ; i >= 1 ; i--)
rs = rs "" substr(s, i, 1)
return rs;
}
function RevComplement(seq, _local_, n, i, c, rs) {
n = length(seq)
rs = ""
for (i = n ; i >= 1 ; i--) {
c = substr(seq, i, 1)
rs = rs "" (_DnaC[c] ? _DnaC[c] : "X")
}
return rs
}
#
function _AssertCode(name, _local_, n, i1, i2, i3, b1, b2, b3) {
if (_InitCod[name] != 0) return 1
for (i1 = 1 ; i1 <= 4 ; i1++) {
b1 = substr(_NucOrder, i1, 1)
for (i2 = 1 ; i2 <= 4 ; i2++) {
b2 = substr(_NucOrder, i2, 1)
for (i3 = 1 ; i3 <= 4 ; i3++) {
b3 = substr(_NucOrder, i3, 1)
_GenCod[name][b1 "" b2 "" b3] = substr(_Cod2Aa[name], ++n, 1)
}
}
}
_InitCod[name] = 1
return 1
}
function Translate(seq, codname, _local_, n, i, c, p) {
if (codname == 0) codname = "universal"
_AssertCode(codname)
seq = toupper(seq)
n = length(seq)
p = ""
for (i = 1 ; i <= n ; i += 3) {
c = substr(seq, i, 3)
p = p "" ((c in _GenCod[codname]) ? _GenCod[codname][c] : "X")
}
return p
}
#
function SubSeq(seq, from, to, revstrand) {
seq = substr(seq, from, to-from+1)
if (revstrand) seq = RevComplement(seq)
return seq
}
#
function ReadFasta(file, _local_, seq, context) {
context = $0
seq = ""
getline < file
while(getline < file) seq = seq "" $0
$0 = context
return seq
}
#
function ReadModel(file, a, _local_, line, context) {
context = $0
delete a
while(getline < file)
if (! ($0 ~ "^#")) a[$1] = $2
$0 = context
}
#
# constants
#
BEGIN {
# complementary of _IupacDna
_DnaC["A"] = "T"; _DnaC["C"] = "G"; _DnaC["G"] = "C"; _DnaC["T"] = "A"
_DnaC["R"] = "Y"; _DnaC["Y"] = "R"; _DnaC["M"] = "K"; _DnaC["K"] = "M"
_DnaC["W"] = "W"; _DnaC["S"] = "S"; _DnaC["B"] = "V"; _DnaC["V"] = "B"
_DnaC["D"] = "H"; _DnaC["H"] = "D"; _DnaC["N"] = "N"; _DnaC["X"] = "X"
_DnaC["a"] = "t"; _DnaC["c"] = "g"; _DnaC["g"] = "c"; _DnaC["t"] = "a"
_DnaC["r"] = "y"; _DnaC["y"] = "r"; _DnaC["m"] = "k"; _DnaC["k"] = "m"
_DnaC["w"] = "w"; _DnaC["s"] = "s"; _DnaC["b"] = "v"; _DnaC["v"] = "b"
_DnaC["d"] = "h"; _DnaC["h"] = "d"; _DnaC["n"] = "n"; _DnaC["x"] = "x"
# genetic codes
_NucOrder = "ACGT"
#1 AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT
#2 AAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
#3 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
_Cod2Aa["universal"] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"
_Cod2Aa["mito-yeast"] = "KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"
_Cod2Aa["mito-vertebrates"] = "KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"
_Cod2Aa["mito-insects"] = "KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"
_Cod2Aa["mito-echinoderms"] = "NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"
_Cod2Aa["mycoplasma"] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"
_Cod2Aa["ciliata"] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF"
_Cod2Aa["euplotes"] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF"
_Cod2Aa["candida"] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRXLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"
}

View File

@ -0,0 +1,20 @@
#
# select subDB from fasta DB
#
# -v FILE
BEGIN {
if (FILE == "") FILE = "dbsel.txt"
while (getline < FILE)
INC[$1] = $1
close(FILE)
}
/^>/ {
name = substr($1, 2)
ok = name in INC
}
ok {
print $0
}

View File

@ -0,0 +1,141 @@
#
# iff -> embl
#
function FromLoc(i, _local_, s) {
s = substr(Exon[i]["modif"], 1, 1) "" Exon[i]["from"]
gsub("=", "", s)
return s
}
function ToLoc(i, _local_, s) {
s = substr(Exon[i]["modif"], 2, 1) "" Exon[i]["to"]
gsub("=", "", s)
return s
}
function GeneLocation(_local_, d, s) {
d = Exon[1]["strand"]
if (d == "+")
s = FromLoc(1) ".." ToLoc(Nexon)
else
s = FromLoc(Nexon) ".." ToLoc(1)
if (d == "-") s = "complement(" s ")"
return s
}
function CdsLocation(_local_, d, i, s) {
d = Exon[1]["strand"]
if (d == "+") {
for (i = 1 ; i <= Nexon ; i++)
s = s "," FromLoc(i) ".." ToLoc(i)
}
else {
for (i = Nexon ; i >= 1 ; i--)
s = s "," FromLoc(i) ".." ToLoc(i)
}
s = substr(s, 2)
if (Nexon > 1) s = "join(" s ")"
if (d == "-") s = "complement(" s ")"
return s
}
function ExonLocation(i, _local_, s) {
s = FromLoc(i) ".." ToLoc(i)
if (Exon[i]["strand"] == "-") s = "complement(" s ")"
return s
}
function Pad(s, len) {
while (length(s) < len)
s = s " "
return s
}
function Feature(feat, loc, _local_, s) {
s = Pad("FT " feat, 21)
print s "" loc
}
function SQualifier(qual, val, _local_, s) {
s = Pad("FT", 21)
print s "/" qual "=" val
}
function QQualifier(qual, val) {
SQualifier(qual, "\"" val "\"")
}
#
# rules
#
/^c annot/ {
GeneName = $4
Product = $NF
gsub("_", " ", Product)
next
}
/^c nclust/ {
Ngene = $3
next
}
/^c begin_entry/ {
Nexon = 0
delete Exon
next
}
/^e exon/ {
Nexon++
Exon[Nexon]["from"] = $3
Exon[Nexon]["to"] = $4
Exon[Nexon]["strand"] = $6
Exon[Nexon]["indels"] = $9 "+" $12
modif = $15; gsub("\"", "", modif)
Exon[Nexon]["modif"] = modif
next
}
/^e similarity/ {
split($12, a, "@")
Simil = a[1] ":" a[2]
next
}
/^e translate/ {
Translat = $3
next
}
/^c end_entry/ {
gname = (Ngene == 1 ? GeneName : GeneName "_" ++Igene)
locus = ""
Feature("gene", GeneLocation())
QQualifier("gene", gname)
QQualifier("locus_tag", locus)
Feature("CDS", CdsLocation())
SQualifier("codon_start", 1)
SQualifier("transl_table", 11)
QQualifier("gene", gname)
QQualifier("locus_tag", locus)
SQualifier("product", Product)
QQualifier("inference", "similar to DNA sequence:" Simil)
QQualifier("translation", Translat)
if (Nexon > 1) {
for (i = 1 ; i <= Nexon ; i++) {
Feature("exon", ExonLocation(i))
QQualifier("gene", gname)
QQualifier("locus_tag", locus)
SQualifier("number", Exon[1]["strand"] == "+" ? i : Nexon-i+1)
}
}
}

View File

@ -0,0 +1,39 @@
#
# translate CDSs from iff file
#
BEGIN {
Seq = ReadFasta(FASTA)
}
/^c end_entry/ {
if (RevStrand) Cds = RevComplement(Cds)
Prot = Translate(substr(Cds, 1, length(Cds)-3))
if (Modif == "=")
Prot = "M" substr(Prot, 2)
print "e translate " Prot
}
{
print $0
}
/^c begin_entry/ {
Cds = ""
Iexon = 0
next
}
/^e exon/ {
RevStrand = ($6 == "-")
if (++Iexon == 1) { # first is exon with start (even on - strand)
Modif = $15
gsub("\"", "", Modif)
Modif = (RevStrand ? substr(Modif, 2, 1) : substr(Modif, 1, 1))
}
if (RevStrand)
Cds = SubSeq(Seq, $3, $4) "" Cds
else
Cds = Cds "" SubSeq(Seq, $3, $4)
next
}