From 16b5e2927d68d1e31abc5eb6a3a788cdb917c0e0 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 6 Oct 2016 10:06:37 -0300 Subject: [PATCH] Make changes to better detect frameshifted pseudogenes and annotate them correctly Former-commit-id: d827908d63149941538e686b48f60a132173cb80 Former-commit-id: 2841c75b415c6c8fa35a6a90e23cf82c3c84408b --- detectors/cds/bin/do_exonerate.csh | 33 ++++++++++------- detectors/cds/lib/toEmbl.awk | 59 +++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 23 deletions(-) diff --git a/detectors/cds/bin/do_exonerate.csh b/detectors/cds/bin/do_exonerate.csh index edbcb81..7a8ca9c 100755 --- a/detectors/cds/bin/do_exonerate.csh +++ b/detectors/cds/bin/do_exonerate.csh @@ -150,20 +150,25 @@ endif # Notify " running exonerate of $GenoName on $ProtName" -exonerate --model protein2genome \ - --percent $PASS1_PERCENT \ - --showalignment TRUE \ - --showvulgar TRUE \ - --showtargetgff TRUE \ - --geneticcode $PASS1_GENETIC_CODE \ - --minintron $PASS1_MIN_INTRON \ - --maxintron $PASS1_MAX_INTRON \ - --bestn $PASS1_BESTN \ - --frameshift $PASS1_FRAMESHIFT \ - --proteinsubmat $PASS1_SUBMAT \ - --splice3 $SPLICE3MODEL \ - --splice5 $SPLICE5MODEL \ - $DbFile $GenoFile > $base.exo.raw +exonerate \ + --model protein2genome \ + --percent $PASS1_PERCENT \ + --showalignment TRUE \ + --showvulgar TRUE \ + --showtargetgff TRUE \ + --geneticcode $PASS1_GENETIC_CODE \ + --minintron $PASS1_MIN_INTRON \ + --maxintron $PASS1_MAX_INTRON \ + --bestn $PASS1_BESTN \ + --frameshift $PASS1_FRAMESHIFT \ + --proteinsubmat $PASS1_SUBMAT \ + --splice3 $SPLICE3MODEL \ + --splice5 $SPLICE5MODEL \ + --refine region \ + --refineboundary 5000 \ + --singlepass FALSE \ + --dpmemory 128 \ + $DbFile $GenoFile > $base.exo.raw CheckAbort 20 "exonerate failure" # diff --git a/detectors/cds/lib/toEmbl.awk b/detectors/cds/lib/toEmbl.awk index 9f3cc2c..624f19c 100644 --- a/detectors/cds/lib/toEmbl.awk +++ b/detectors/cds/lib/toEmbl.awk @@ -94,10 +94,10 @@ function Unk(s) 
{ } /^c begin_entry/ { - Nexon = 0 + Nexon = 0 FrameShift=0 - delete Exon - next + delete Exon + next } /^e exon/ { @@ -105,12 +105,39 @@ function Unk(s) { Exon[Nexon]["from"] = $3 Exon[Nexon]["to"] = $4 Exon[Nexon]["strand"] = $6 - Exon[Nexon]["indels"] = $9 "+" $12 - modif = $15; gsub("\"", "", modif) - Exon[Nexon]["modif"] = modif - if ( $0 ~ /frameshifts +[0-9]+/) - FrameShift=1 + match($0, / insertions +([0-9]+) +/, arr) + if (arr[1]) + insertions=arr[1] + else + insertions=0 + + match($0, / deletions +([0-9]+) +/, arr) + if (arr[1]) + deletions=arr[1] + else + deletions=0 + + Exon[Nexon]["indels"] = insertions "+" deletions + + match($0, / modifier +"([^"]*)"/, arr) + if (arr[1]) + modif=substr(arr[1],1,1) + else + modif="" + if (modif == "=") + modif="" + + Exon[Nexon]["modif"] = modif + + match($0, / frameshifts +([-0-9]+)/, arr) + if (arr[1]) { + FrameShift=FrameShift+1 + Exon[Nexon]["frameshift"] = arr[1] + } + else + Exon[Nexon]["frameshift"] = 0 + next } @@ -144,8 +171,13 @@ function Unk(s) { SQualifier("transl_table", 11) QQualifier("gene", gname) QQualifier("locus_tag", locus) - if (FrameShift) + if (FrameShift) { QQualifier("pseudogene","unknown") + if (FrameShift > 1) + QQualifier("note","nonfunctional due to frameshifts in "FrameShift" exons") + else + QQualifier("note","nonfunctional due to a frameshift") + } QQualifier("product", Product) QQualifier("inference", "similar to DNA sequence:" Simil) QQualifier("inference", "org.annot -- detect pass:" PassType ":" PassInfo) @@ -157,6 +189,15 @@ function Unk(s) { Feature("exon", ExonLocation(i)) QQualifier("gene", gname) QQualifier("locus_tag", locus) + + if (Exon[i]["frameshift"]) { + QQualifier("pseudogene","unknown") + if (Exon[i]["frameshift"] > 0) + QQualifier("note","frameshifted by insertion of " Exon[i]["frameshift"] " bp") + else + QQualifier("note","frameshifted by deletion of " (-Exon[i]["frameshift"]) " bp") + } + SQualifier("number", Exon[1]["strand"] == "+" ? i : Nexon-i+1) } }