Make changes to better detect frameshifted pseudogenes and annotate them

correctly

Former-commit-id: d827908d63149941538e686b48f60a132173cb80
Former-commit-id: 2841c75b415c6c8fa35a6a90e23cf82c3c84408b
This commit is contained in:
2016-10-06 10:06:37 -03:00
parent 860cd217d4
commit 16b5e2927d
2 changed files with 69 additions and 23 deletions

View File

@ -150,20 +150,25 @@ endif
#
Notify " running exonerate of $GenoName on $ProtName"
exonerate --model protein2genome \
--percent $PASS1_PERCENT \
--showalignment TRUE \
--showvulgar TRUE \
--showtargetgff TRUE \
--geneticcode $PASS1_GENETIC_CODE \
--minintron $PASS1_MIN_INTRON \
--maxintron $PASS1_MAX_INTRON \
--bestn $PASS1_BESTN \
--frameshift $PASS1_FRAMESHIFT \
--proteinsubmat $PASS1_SUBMAT \
--splice3 $SPLICE3MODEL \
--splice5 $SPLICE5MODEL \
$DbFile $GenoFile > $base.exo.raw
exonerate \
--model protein2genome \
--percent $PASS1_PERCENT \
--showalignment TRUE \
--showvulgar TRUE \
--showtargetgff TRUE \
--geneticcode $PASS1_GENETIC_CODE \
--minintron $PASS1_MIN_INTRON \
--maxintron $PASS1_MAX_INTRON \
--bestn $PASS1_BESTN \
--frameshift $PASS1_FRAMESHIFT \
--proteinsubmat $PASS1_SUBMAT \
--splice3 $SPLICE3MODEL \
--splice5 $SPLICE5MODEL \
--refine region \
--refineboundary 5000 \
--singlepass FALSE \
--dpmemory 128 \
$DbFile $GenoFile > $base.exo.raw
CheckAbort 20 "exonerate failure"
#

View File

@ -94,10 +94,10 @@ function Unk(s) {
}
/^c begin_entry/ {
Nexon = 0
Nexon = 0
FrameShift=0
delete Exon
next
delete Exon
next
}
/^e exon/ {
@ -105,12 +105,39 @@ function Unk(s) {
Exon[Nexon]["from"] = $3
Exon[Nexon]["to"] = $4
Exon[Nexon]["strand"] = $6
Exon[Nexon]["indels"] = $9 "+" $12
modif = $15; gsub("\"", "", modif)
Exon[Nexon]["modif"] = modif
if ( $0 ~ /frameshifts +[0-9]+/)
FrameShift=1
# Pull the insertion and deletion counts out of the current exon line and
# store them as "<insertions>+<deletions>". A count falls back to 0 when its
# keyword is not found on the line (match() then leaves arr empty).
match($0, / insertions +([0-9]+) +/, arr)
if (arr[1])
  insertions=arr[1]
else
  insertions=0
# Search for the "deletions" keyword here; the previous pattern repeated the
# insertions regex, so both counts were always identical.
# NOTE(review): assumes the keyword is spelled "deletions" and is followed by
# a space, mirroring the insertions field -- confirm against the input format.
match($0, / deletions +([0-9]+) +/, arr)
if (arr[1])
  deletions=arr[1]
else
  deletions=0
# Use the variable itself: "$deletions" is a field reference and would
# concatenate input field number <deletions> (the whole line when it is 0).
Exon[Nexon]["indels"] = insertions "+" deletions
match($0, / modifier +"([^"]*)"/, arr)
if (arr[1])
modif=substr(arr[1],1,1)
else
modif=""
if (modif == "=")
modif=""
Exon[Nexon]["modif"] = modif
match($0, / frameshifts +([-0-9]+)/, arr)
if (arr[1]) {
FrameShift=FrameShift+1
Exon[Nexon]["frameshift"] = arr[1]
}
else
Exon[Nexon]["frameshift"] = 0
next
}
@ -144,8 +171,13 @@ function Unk(s) {
SQualifier("transl_table", 11)
QQualifier("gene", gname)
QQualifier("locus_tag", locus)
if (FrameShift)
if (FrameShift) {
QQualifier("pseudogene","unknown")
if (FrameShift > 1)
QQualifier("note","nonfunctional due to frameshifts in "FrameShift" exons")
else
QQualifier("note","nonfunctional due to a frameshift")
}
QQualifier("product", Product)
QQualifier("inference", "similar to DNA sequence:" Simil)
QQualifier("inference", "org.annot -- detect pass:" PassType ":" PassInfo)
@ -157,6 +189,15 @@ function Unk(s) {
Feature("exon", ExonLocation(i))
QQualifier("gene", gname)
QQualifier("locus_tag", locus)
# Annotate an exon carrying a frameshift: mark it as a pseudogene and add a
# note giving the shift size. A positive stored value is reported as an
# insertion, a negative one as a deletion (printed as its absolute value).
# The negation must be parenthesized: in awk, `"text" -x " bp"` parses as the
# subtraction ("text" - x) followed by concatenation, which discards the text.
if (Exon[i]["frameshift"]) {
  QQualifier("pseudogene","unknown")
  if (Exon[i]["frameshift"] > 0)
    QQualifier("note","frameshifted by insertion of " Exon[i]["frameshift"] " bp")
  else
    QQualifier("note","frameshifted by deletion of " (-Exon[i]["frameshift"]) " bp")
}
SQualifier("number", Exon[1]["strand"] == "+" ? i : Nexon-i+1)
}
}