cds/tools/chlorodb added
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
This commit is contained in:
19
detectors/cds/tools/lib/cds2fasta.awk
Normal file
19
detectors/cds/tools/lib/cds2fasta.awk
Normal file
@ -0,0 +1,19 @@
|
||||
#
|
||||
# get fasta sequence from cds list
|
||||
#
|
||||
# [-v FIELD=13] CDS sequence
|
||||
# [-v FIELD=14] Prot Sequence
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
if (CHARPERLINE == "") CHARPERLINE = 50
|
||||
if (FIELD == "") FIELD = 14
|
||||
}
|
||||
|
||||
/^#/ { next }
|
||||
|
||||
{
|
||||
name = $1 "@" $2 "@" $3 "@" $5 "@" $6 "@" $7 "@" $8 "@" int($9/3)
|
||||
comment = $NF
|
||||
PrintFasta($FIELD, name " " comment)
|
||||
}
|
97
detectors/cds/tools/lib/embl.cds_long.awk
Normal file
97
detectors/cds/tools/lib/embl.cds_long.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get cds features from embl (long version)
|
||||
#
|
||||
# -v FASTA
|
||||
|
||||
# @include lib.embl.awk
|
||||
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (MAXSPAN == "") MAXSPAN = 10000
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^ID / {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^FT CDS/ {
|
||||
revstrand = match($3, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
nexon = Nexons(s)
|
||||
SpanLocation(s, sloc)
|
||||
spanlen = sloc[2] - sloc[1] + 1
|
||||
len = LenLocation(s)
|
||||
ok = ok && (len < MAXSPAN)
|
||||
cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
|
||||
cstart = substr(cdsseq, 1,3)
|
||||
cstop = substr(cdsseq, length(cdsseq)-2)
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
product = "none"
|
||||
translation = "X"
|
||||
incds = 1
|
||||
next
|
||||
}
|
||||
|
||||
(incds && /^FT [^ ]/) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/product=/ {
|
||||
split($0, a, "=")
|
||||
product = a[2]
|
||||
gsub("\"", "", product)
|
||||
gsub(" ", "_", product)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/translation=/ {
|
||||
split($0, a, "=")
|
||||
translation = a[2]
|
||||
gsub("\"", "", translation)
|
||||
gsub(" ", "", translation)
|
||||
next
|
||||
}
|
||||
|
||||
END {
|
||||
if (incds) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
}
|
||||
}
|
97
detectors/cds/tools/lib/embl.info.awk
Normal file
97
detectors/cds/tools/lib/embl.info.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get feature info from embl
|
||||
#
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
function GC(s, _local_, i, len) {
|
||||
s = toupper(s)
|
||||
len = length(s)
|
||||
gsub("G|C", "", s)
|
||||
return ((len - length(s)) * 100 / (len ? len : 1))
|
||||
}
|
||||
|
||||
#
|
||||
# rules
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
|
||||
}
|
||||
|
||||
/^ID/ {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
next
|
||||
}
|
||||
|
||||
/^OS/ {
|
||||
orga = substr($0, 6)
|
||||
gsub(" ", "_", orga)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT source/ {
|
||||
GetLoc($3, loc);
|
||||
len = loc[2];
|
||||
next
|
||||
}
|
||||
|
||||
/^FT CDS/ {
|
||||
meanCds = meanCds * nbCds + LenLocation($3)
|
||||
nbCds++
|
||||
meanCds /= nbCds
|
||||
n = Nexons($3)
|
||||
if (n > 3) n = 3
|
||||
nbCdx[n]++
|
||||
next
|
||||
}
|
||||
|
||||
/^FT tRNA/ {
|
||||
nbTrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^FT rRNA/ {
|
||||
nbRrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^FT mRNA/ {
|
||||
next
|
||||
}
|
||||
|
||||
/^FT .*RNA/ {
|
||||
nbOrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^SQ / {
|
||||
inseq = 1
|
||||
seq = ""
|
||||
next
|
||||
}
|
||||
|
||||
inseq && /^ / {
|
||||
s = $0
|
||||
gsub("[0-9]+", "", s)
|
||||
gsub(" ", "", s)
|
||||
seq = seq "" s
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
oklen = (len == length(seq) ? "ok" : "wrong")
|
||||
gc = GC(seq)
|
||||
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
|
||||
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
|
||||
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
|
||||
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
|
||||
delete nbCdx
|
||||
orga = locus = "?"
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
97
detectors/cds/tools/lib/embl.intron.awk
Normal file
97
detectors/cds/tools/lib/embl.intron.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get intron features from embl
|
||||
#
|
||||
|
||||
# @include libembl.awk
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^ID / {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT CDS/ {
|
||||
revstrand = match($3, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
if (! ok) next
|
||||
|
||||
na = ParseLocation(s, locs)
|
||||
if (na < 2) next
|
||||
|
||||
delete SINfo
|
||||
Ninfo = 0
|
||||
|
||||
val = locs[1][1]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][1] < val) ok = 0
|
||||
val = locs[i][1]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
val = locs[1][2]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][2] < val) ok = 0
|
||||
val = locs[i][2]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
from = locs[1][2] + 1
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
to = locs[i][1] - 1
|
||||
inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
|
||||
SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
|
||||
(revstrand ? na-i+1 : i-1) " " na-1 " "\
|
||||
substr(inseq, 1,4) "."\
|
||||
substr(inseq, 5,6) "-"\
|
||||
substr(inseq, length(inseq)-9, 6) "."\
|
||||
substr(inseq, length(inseq)-3, 4) " "\
|
||||
"ok"
|
||||
from = locs[i][2] + 1
|
||||
}
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/translation=/ {
|
||||
for (i = 1 ; i <= Ninfo ; i++)
|
||||
print locus, locustag, GeneFamily(gene), gene, SINfo[i]
|
||||
Ninfo = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
locus = "?"
|
||||
}
|
36
detectors/cds/tools/lib/embl.tofasta.awk
Normal file
36
detectors/cds/tools/lib/embl.tofasta.awk
Normal file
@ -0,0 +1,36 @@
|
||||
#
|
||||
# get fasta sequence from embl
|
||||
#
|
||||
|
||||
/^ID / {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
next
|
||||
}
|
||||
|
||||
/^SQ / {
|
||||
inseq = 1
|
||||
nln = 0
|
||||
delete seq
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
inseq = 0
|
||||
print ">" locus
|
||||
for (i = 1 ; i <= nln ; i++)
|
||||
print seq[i]
|
||||
next
|
||||
}
|
||||
|
||||
inseq {
|
||||
s = $0
|
||||
gsub(" ", "", s)
|
||||
gsub("[0-9]+", "", s)
|
||||
seq[++nln] = s
|
||||
next
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
99
detectors/cds/tools/lib/gbk.cds_long.awk
Normal file
99
detectors/cds/tools/lib/gbk.cds_long.awk
Normal file
@ -0,0 +1,99 @@
|
||||
#
|
||||
# get cds features from genbank (long version)
|
||||
#
|
||||
# -v FASTA
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (MAXSPAN == "") MAXSPAN = 10000
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^ CDS/ {
|
||||
revstrand = match($2, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
nexon = Nexons(s)
|
||||
SpanLocation(s, sloc)
|
||||
spanlen = sloc[2] - sloc[1] + 1
|
||||
len = LenLocation(s)
|
||||
ok = ok && (len < MAXSPAN)
|
||||
cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
|
||||
cstart = substr(cdsseq, 1,3)
|
||||
cstop = substr(cdsseq, length(cdsseq)-2)
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
product = "none"
|
||||
translation = "X"
|
||||
incds = 1
|
||||
next
|
||||
}
|
||||
|
||||
(incds && /^ [^ ]/) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/product=/ {
|
||||
split($0, a, "=")
|
||||
product = a[2]
|
||||
gsub("\"", "", product)
|
||||
gsub(" ", "_", product)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/translation=/ {
|
||||
split($0, a, "=")
|
||||
translation = a[2]
|
||||
gsub("\"", "", translation)
|
||||
gsub(" ", "", translation)
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
locus = "?"
|
||||
}
|
||||
|
||||
END {
|
||||
if (incds) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
}
|
||||
}
|
97
detectors/cds/tools/lib/gbk.info.awk
Normal file
97
detectors/cds/tools/lib/gbk.info.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get feature info from genbank
|
||||
#
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
function GC(s, _local_, i, len) {
|
||||
s = toupper(s)
|
||||
len = length(s)
|
||||
gsub("G|C", "", s)
|
||||
return ((len - length(s)) * 100 / len)
|
||||
}
|
||||
|
||||
#
|
||||
# rules
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
|
||||
}
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^ ORGANISM/ {
|
||||
orga = substr($0, 13)
|
||||
split(orga, a, ";")
|
||||
orga = a[1]
|
||||
gsub(" ", "_", orga)
|
||||
next
|
||||
}
|
||||
|
||||
/^ source/ {
|
||||
GetLoc($2, loc);
|
||||
len = loc[2];
|
||||
next
|
||||
}
|
||||
|
||||
/^ CDS/ {
|
||||
meanCds = meanCds * nbCds + LenLocation($2)
|
||||
nbCds++
|
||||
meanCds /= nbCds
|
||||
n = Nexons($2)
|
||||
if (n > 3) n = 3
|
||||
nbCdx[n]++
|
||||
next
|
||||
}
|
||||
|
||||
/^ tRNA/ {
|
||||
nbTrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^ rRNA/ {
|
||||
nbRrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^ mRNA/ {
|
||||
next
|
||||
}
|
||||
|
||||
/^ .*RNA/ {
|
||||
nbOrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^ORIGIN/ {
|
||||
inseq = 1
|
||||
seq = ""
|
||||
next
|
||||
}
|
||||
|
||||
inseq && /^ +[1-9][0-9]*/ {
|
||||
s = substr($0, 11)
|
||||
gsub(" ", "", s)
|
||||
seq = seq "" s
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
oklen = (len == length(seq) ? "ok" : "wrong")
|
||||
gc = GC(seq)
|
||||
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
|
||||
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
|
||||
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
|
||||
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
|
||||
delete nbCdx
|
||||
orga = locus = "?"
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
96
detectors/cds/tools/lib/gbk.intron.awk
Normal file
96
detectors/cds/tools/lib/gbk.intron.awk
Normal file
@ -0,0 +1,96 @@
|
||||
#
|
||||
# get intron features from genbank
|
||||
#
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^ CDS/ {
|
||||
revstrand = match($2, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
if (! ok) next
|
||||
|
||||
na = ParseLocation(s, locs)
|
||||
if (na < 2) next
|
||||
|
||||
delete SINfo
|
||||
Ninfo = 0
|
||||
|
||||
val = locs[1][1]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][1] < val) ok = 0
|
||||
val = locs[i][1]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
val = locs[1][2]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][2] < val) ok = 0
|
||||
val = locs[i][2]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
from = locs[1][2] + 1
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
to = locs[i][1] - 1
|
||||
inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
|
||||
SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
|
||||
(revstrand ? na-i+1 : i-1) " " na-1 " "\
|
||||
substr(inseq, 1,4) "."\
|
||||
substr(inseq, 5,6) "-"\
|
||||
substr(inseq, length(inseq)-9, 6) "."\
|
||||
substr(inseq, length(inseq)-3, 4) " "\
|
||||
"ok"
|
||||
from = locs[i][2] + 1
|
||||
}
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/translation=/ {
|
||||
for (i = 1 ; i <= Ninfo ; i++)
|
||||
print locus, locustag, GeneFamily(gene), gene, SINfo[i]
|
||||
Ninfo = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
locus = "?"
|
||||
}
|
32
detectors/cds/tools/lib/gbk.tofasta.awk
Normal file
32
detectors/cds/tools/lib/gbk.tofasta.awk
Normal file
@ -0,0 +1,32 @@
|
||||
#
|
||||
# get fasta sequence from genbank
|
||||
#
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^ORIGIN/ {
|
||||
inseq = 1
|
||||
nln = 0
|
||||
delete seq
|
||||
}
|
||||
|
||||
inseq && /^ +[1-9][0-9]*/ {
|
||||
s = substr($0, 11)
|
||||
gsub(" ", "", s)
|
||||
seq[++nln] = s
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
print ">" locus
|
||||
for (i = 1 ; i <= nln ; i++)
|
||||
print seq[i]
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
31
detectors/cds/tools/lib/install.rpackages.r
Executable file
31
detectors/cds/tools/lib/install.rpackages.r
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# check and install required packages
|
||||
#
|
||||
|
||||
out <- function(...) {
|
||||
cat(paste0('+ ', ..., '\n'), file=stderr())
|
||||
}
|
||||
|
||||
installed <- function(package) {
|
||||
package %in% rownames(installed.packages())
|
||||
}
|
||||
|
||||
check <- function(package, repos="http://cran.univ-lyon1.fr") {
|
||||
if (installed(package)) {
|
||||
out("R package ", package, " installed")
|
||||
} else {
|
||||
out("Installing R package ", package, " from ", repos)
|
||||
install.packages(package, repos=repos)
|
||||
}
|
||||
invisible(installed(package))
|
||||
}
|
||||
|
||||
check("grid")
|
||||
check("gridExtra")
|
||||
check("vcd")
|
||||
check("plotrix")
|
||||
check("igraph")
|
||||
|
||||
quit(save='no', status=0)
|
||||
|
224
detectors/cds/tools/lib/make.models.r
Executable file
224
detectors/cds/tools/lib/make.models.r
Executable file
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# compute start, stop, splice-junctions models for core DB
|
||||
#
|
||||
# source("make.models.r")
|
||||
#
|
||||
|
||||
LIB_DIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIB_DIR == "") LIB_DIR = "."
|
||||
|
||||
source(paste0(LIB_DIR, "/util.base.r"))
|
||||
source(paste0(LIB_DIR, "/util.cons.r"))
|
||||
source(paste0(LIB_DIR, "/util.modelio.r"))
|
||||
|
||||
# -------------------------------
|
||||
# parameters
|
||||
# -------------------------------
|
||||
|
||||
# core cutoffs
|
||||
|
||||
source("db.models.params.txt")
|
||||
|
||||
# -------------------------------
|
||||
# genome infos
|
||||
# -------------------------------
|
||||
|
||||
notify("loading info table")
|
||||
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)
|
||||
|
||||
# -------------------------------
|
||||
# CDS
|
||||
# -------------------------------
|
||||
|
||||
notify("loading cds table")
|
||||
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)
|
||||
|
||||
cds$start <- as.factor(cds$start)
|
||||
cds$stop <- as.factor(cds$stop)
|
||||
|
||||
cds <- cds[cds$status=="Ok",]
|
||||
cds <- cds[cds$genefam!="none",]
|
||||
|
||||
cds$categ <- "dust"
|
||||
|
||||
x <- sort(table(cds$genefam), dec=T)
|
||||
ok <- names(x[x >= CORE_NCDS_CUTOFF])
|
||||
|
||||
cds$categ[cds$genefam %in% ok] <- "core"
|
||||
|
||||
x <- x[! names(x) %in% ok]
|
||||
ok <- names(x[x >= SHEL_NCDS_CUTOFF])
|
||||
|
||||
cds$categ[cds$genefam %in% ok] <- "shell"
|
||||
|
||||
#
|
||||
|
||||
cds.ori <- cds
|
||||
|
||||
cds.lst <- split(cds.ori, cds.ori$categ)
|
||||
|
||||
#
|
||||
# write out families
|
||||
#
|
||||
|
||||
# patterns & names
|
||||
|
||||
invisible(lapply(cds.lst, function(cds) {
|
||||
|
||||
x <- sort(table(cds$genefam), decreasing=T)
|
||||
tab <- paste0("^", names(x), "$")
|
||||
names(tab) <- names(x)
|
||||
|
||||
y <- sapply(split(cds$gene, cds$genefam), function(g) {
|
||||
head(names(sort(table(g), decreasing=T)), 1)
|
||||
})
|
||||
|
||||
tab <- cbind(tab, y[names(x)])
|
||||
|
||||
categ <- unique(cds$categ)
|
||||
f <- paste0("db.", categ, ".pat.txt")
|
||||
notify("writing patterns for", categ, ":", f)
|
||||
write.table(tab, file=f, quote=F, col.names=F, row.names=T)
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# Start models (core only)
|
||||
# -------------------------------
|
||||
|
||||
if (! "core" %in% names(cds.lst)) {
|
||||
notify("*** no gene found in core")
|
||||
notify("*** please change parameters")
|
||||
quit(save='no', status=1)
|
||||
}
|
||||
|
||||
cds <- cds.lst[["core"]]
|
||||
|
||||
#
|
||||
# start by genes
|
||||
#
|
||||
|
||||
tab <- split(cds$start, cds$genefam)
|
||||
|
||||
fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(tab)
|
||||
|
||||
start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
|
||||
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))
|
||||
|
||||
tab <- cds[cds$genefam %in% start.dft,]
|
||||
tab <- table(tab$start)
|
||||
|
||||
# default model
|
||||
|
||||
x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
|
||||
write.model.start(x, "default")
|
||||
|
||||
# gene specific models
|
||||
|
||||
invisible(sapply(start.spc, function(g) {
|
||||
x <- cds[cds$genefam == g,]
|
||||
tx <- table(x$start)
|
||||
tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
|
||||
write.model.start(tx, g)
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# Stop models (core only)
|
||||
# -------------------------------
|
||||
|
||||
# write default stop model
|
||||
|
||||
tab <- table(cds$stop)
|
||||
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
|
||||
write.model.stop(x, "default")
|
||||
|
||||
# -------------------------------
|
||||
# splice junctions
|
||||
# -------------------------------
|
||||
|
||||
notify("loading intron table")
|
||||
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)
|
||||
|
||||
# remove invalid sequences
|
||||
|
||||
intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)
|
||||
|
||||
lseq <- nchar(gsub("[^acgt]", "", intron$seq))
|
||||
|
||||
intron <- intron[lseq == 20,]
|
||||
|
||||
# remove genes out of core
|
||||
|
||||
intron <- intron[intron$genefam %in% cds$genefam,]
|
||||
|
||||
# acceptors / donors
|
||||
|
||||
intron$acc <- substr(intron$seq, 5, 6)
|
||||
intron$don <- substr(intron$seq, 15, 16)
|
||||
|
||||
# consensus
|
||||
|
||||
cons.px <- cons.build(intron$acceptor.donor)
|
||||
cons.px <- cons.px[,! is.nan(colSums(cons.px))]
|
||||
|
||||
seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
||||
|
||||
conf.px <- cons.confusion(cons.px, seq.px)
|
||||
|
||||
sfam <- split(conf.px$l2scor, intron$genefam)
|
||||
sfam <- sfam[order(sapply(sfam, median))]
|
||||
|
||||
# extract splice exceptions
|
||||
|
||||
name.bad <- names(which(sapply(sfam, median) < 0))
|
||||
name.spc <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
|
||||
name.ok <- setdiff(unique(intron$genefam), name.bad)
|
||||
name.bad <- setdiff(name.bad, name.spc)
|
||||
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))
|
||||
|
||||
cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))
|
||||
|
||||
# write junction models
|
||||
|
||||
invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
|
||||
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))
|
||||
|
||||
# use uniform model for bad guys
|
||||
|
||||
invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
|
||||
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
|
||||
|
||||
invisible(write.unif.splice('', "none"))
|
||||
|
||||
# -------------------------------
|
||||
# keep data for plotting
|
||||
# -------------------------------
|
||||
|
||||
DB <- list()
|
||||
|
||||
params <- list()
|
||||
|
||||
params$CORE_NCDS_CUTOFF <- CORE_NCDS_CUTOFF
|
||||
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
|
||||
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
|
||||
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
|
||||
params$CORE_STOP_CUTOFF <- CORE_STOP_CUTOFF
|
||||
params$CORE_SPLICE_CUTOFF <- CORE_SPLICE_CUTOFF
|
||||
|
||||
params$SHEL_NCDS_CUTOFF <- SHEL_NCDS_CUTOFF
|
||||
|
||||
DB$params <- params
|
||||
DB$chromo <- chromo
|
||||
DB$cds.lst <- cds.lst
|
||||
DB$intron <- intron
|
||||
DB$cons <- cons
|
||||
|
||||
notify("saving db.data.Rdata")
|
||||
save(DB, file="db.data.Rdata")
|
||||
|
||||
# -------------------------------
|
||||
# end
|
||||
# -------------------------------
|
||||
|
||||
quit(save='no')
|
424
detectors/cds/tools/lib/plot.models.r
Executable file
424
detectors/cds/tools/lib/plot.models.r
Executable file
@ -0,0 +1,424 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# plots models previously computed by make.models.r
|
||||
#
|
||||
# source("plot.models.r")
|
||||
#
|
||||
|
||||
require(vcd)
|
||||
require(plotrix)
|
||||
|
||||
LIBDIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIBDIR == "") LIBDIR = "."
|
||||
|
||||
source(paste0(LIBDIR, "/util.base.r"))
|
||||
source(paste0(LIBDIR, "/util.plot.r"))
|
||||
source(paste0(LIBDIR, "/util.cons.r"))
|
||||
source(paste0(LIBDIR, "/util.grid.r"))
|
||||
|
||||
# -------------------------------
|
||||
# setup
|
||||
# -------------------------------
|
||||
|
||||
OUT.DEV <- TRUE
|
||||
OUT.TYPE <- "pdf"
|
||||
OUT.FILE <- "models"
|
||||
|
||||
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
|
||||
|
||||
# -------------------------------
|
||||
# Load data
|
||||
# -------------------------------
|
||||
|
||||
notify("loading DB data")
|
||||
load("db.data.Rdata")
|
||||
|
||||
params <- DB$params
|
||||
chromo <- DB$chromo
|
||||
cds.lst <- DB$cds.lst
|
||||
intron <- DB$intron
|
||||
cons <- DB$cons
|
||||
|
||||
# -------------------------------
|
||||
# Genomes infos
|
||||
# -------------------------------
|
||||
|
||||
grd.titlepage("Species")
|
||||
grd.textpage(lineno=1, "# org: ", nrow(chromo))
|
||||
|
||||
#
|
||||
# general stats
|
||||
#
|
||||
|
||||
grd.hist(chromo, "len", main="Histogram of chromosome length")
|
||||
grd.hist(chromo, "gc", pos.quant=c(0.75, 0.6), main="Histogram of chromosome GC")
|
||||
grd.hist(chromo, "nbCds")
|
||||
grd.fplot(chromo, "len", "nbCds")
|
||||
|
||||
#
|
||||
# nb cds no introns
|
||||
#
|
||||
|
||||
chromo$nbCds_Mono <- chromo$nbCds_int0
|
||||
chromo$nbCds_Poly <- chromo$nbCds_int1 + chromo$nbCds_intsup1
|
||||
chromo$percentPoly <- round(chromo$nbCds_Poly * 100 / (chromo$nbCds_Poly + chromo$nbCds_Mono))
|
||||
|
||||
grd.hist(chromo, "nbCds_Mono", main="Histogram of monoexonic Cds")
|
||||
grd.hist(chromo, "nbCds_Poly", main="Histogram of polyexonic Cds")
|
||||
grd.hist(chromo, "percentPoly", pos.sum=c(0.23,0.6), main="Histogram of % polyexonic")
|
||||
|
||||
grd.fplot(chromo, "nbCds", "nbCds_Mono", TRUE, ablin=list(a=0, b=1, col=3))
|
||||
grd.fplot(chromo, "nbCds", "nbCds_Poly")
|
||||
|
||||
#
|
||||
# cds size
|
||||
#
|
||||
|
||||
grd.hist(chromo, "meanCdsSize", pos.quant=NULL, main="Histogram of Cds size")
|
||||
|
||||
# -------------------------------
|
||||
# CDS
|
||||
# -------------------------------
|
||||
|
||||
cds.all <- do.call(rbind, cds.lst)
|
||||
|
||||
grd.titlepage("CDS")
|
||||
|
||||
grd.textpage(lineno=1, "# core cds core cutoff: ", params$CORE_NCDS_CUTOFF)
|
||||
grd.textpage(lineno=2, "# core cds shell cutoff: ", params$SHEL_NCDS_CUTOFF)
|
||||
|
||||
grd.textpage(lineno=4, "# total cds: ", nrow(cds.all))
|
||||
grd.textpage(lineno=5, "# core cds: ", nrow(cds.lst[["core"]]))
|
||||
grd.textpage(lineno=6, "# shell cds: ", nrow(cds.lst[["shell"]]))
|
||||
grd.textpage(lineno=7, "# dust cds: ", nrow(cds.lst[["dust"]]))
|
||||
|
||||
grd.textpage(lineno=9, "# total org: ", length(unique(cds.all$X.locus)))
|
||||
grd.textpage(lineno=10, "# core org: ", length(unique(cds.lst[["core"]]$X.locus)))
|
||||
grd.textpage(lineno=11, "# shell org: ", length(unique(cds.lst[["shell"]]$X.locus)))
|
||||
grd.textpage(lineno=12, "# dust org: ", length(unique(cds.lst[["dust"]]$X.locus)))
|
||||
|
||||
|
||||
grd.textpage(lineno=14, "# total families: ", length(unique(cds.all$genefam)))
|
||||
grd.textpage(lineno=15, "# core families: ", length(unique(cds.lst[["core"]]$genefam)))
|
||||
grd.textpage(lineno=16, "# shell families: ", length(unique(cds.lst[["shell"]]$genefam)))
|
||||
grd.textpage(lineno=17, "# dust families: ", length(unique(cds.lst[["dust"]]$genefam)))
|
||||
|
||||
uplot.setup(mfrow=c(2,2), xpd=NA)
|
||||
|
||||
x <- sapply(cds.lst, nrow)
|
||||
uplot.pie(x, main="CDS", text.r=1.1, col=c(3,2,4))
|
||||
|
||||
x <- sapply(cds.lst, function(cds) length(unique(cds$X.locus)))
|
||||
uplot.pie(x, main="ORG", text.r=1.1, col=c(3,2,4))
|
||||
|
||||
x <- sapply(cds.lst, function(cds) length(unique(cds$genefam)))
|
||||
uplot.pie(x, main="FAM", text.r=1.1, col=c(3,2,4))
|
||||
|
||||
uplot.setup(xpd=F)
|
||||
|
||||
#
|
||||
# plot genes cutoff
|
||||
#
|
||||
|
||||
cds.all <- do.call(rbind, cds.lst)
|
||||
cds.byfam <- split(cds.all, cds.all$genefam)
|
||||
|
||||
tab <- sort(sapply(cds.byfam, nrow), decreasing=T)
|
||||
cols <- rep("red", length(tab))
|
||||
cols[tab >= params$SHEL_NCDS_CUTOFF] <- "blue"
|
||||
cols[tab >= params$CORE_NCDS_CUTOFF] <- "green"
|
||||
barplot(tab, col=cols, border=NA, main="# genes")
|
||||
|
||||
cols <- cols[tab >= 50]
|
||||
tab <- tab[tab >= 50]
|
||||
barplot(tab, col=cols, border=NA, las=2, cex.names=0.5, main="# genes in core")
|
||||
abline(h=params$CORE_NCDS_CUTOFF, col=1)
|
||||
text(50, 200, "CORE_NCDS_CUTOFF", pos=3)
|
||||
|
||||
#
|
||||
# cds length for core
|
||||
#
|
||||
|
||||
invisible(sapply(c("core", "shell", "dust"), function(what) {
|
||||
|
||||
cds <- cds.lst[[what]]
|
||||
|
||||
x <- split(cds$length, cds$genefam)
|
||||
x <- x[order(sapply(x, mean))]
|
||||
|
||||
uplot.setup(mfrow=c(2,1))
|
||||
|
||||
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
|
||||
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution"))
|
||||
|
||||
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1), ylim=c(0,2000),
|
||||
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution zoom"))
|
||||
|
||||
uplot.setup()
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# starts & stops
|
||||
# -------------------------------
|
||||
|
||||
cds <- cds.lst[["core"]]
|
||||
|
||||
grd.titlepage("Starts and Stops")
|
||||
|
||||
tab <- sort(table(cds$start), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="start frequencies (%)")
|
||||
text(0.7, 50, round(tab[1], 2))
|
||||
text(1.9, 2.3, round(tab[2], 2))
|
||||
text(3.1, 2.3, round(tab[3], 2))
|
||||
|
||||
#
|
||||
# start by org and gc
|
||||
#
|
||||
|
||||
x <- split(cds$start, cds$X.locus)
|
||||
|
||||
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(x)
|
||||
chromo$fatg <- round(fatg[chromo$X.locus], 2)
|
||||
|
||||
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
|
||||
names(fgtg) <- names(x)
|
||||
chromo$fgtg <- round(fgtg[chromo$X.locus], 2)
|
||||
|
||||
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
|
||||
names(facg) <- names(x)
|
||||
chromo$facg <- round(facg[chromo$X.locus], 2)
|
||||
|
||||
grd.hist(chromo, "fatg", pos.quant=c(0.5, 0.6), main="Histogram of atg freq. by org")
|
||||
grd.hist(chromo, "fgtg", main="Histogram of gtg freq. by org")
|
||||
grd.hist(chromo, "facg", pos.sum=c(0.3, 0.6), main="Histogram of acg freq. by org")
|
||||
|
||||
grd.fplot(chromo, "gc", "fatg", main="atg freq. by org GC", pos=c(0.2, 0.3))
|
||||
grd.fplot(chromo, "gc", "fgtg", main="gtg freq. by org GC")
|
||||
grd.fplot(chromo, "gc", "facg", main="acg freq. by org GC")
|
||||
|
||||
ter <- cbind(fatg, fgtg, facg)
|
||||
colnames(ter) <- c("ATG", "GTG", "ACG")
|
||||
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
|
||||
cols <- rainbow(10)[igc]
|
||||
ternaryplot(ter, col=cols, cex=0.2, main="Start by org", labels="outside")
|
||||
|
||||
#
|
||||
# start by common genes
|
||||
#
|
||||
|
||||
x <- split(cds$start, cds$genefam)
|
||||
|
||||
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(x)
|
||||
|
||||
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
|
||||
names(fgtg) <- names(x)
|
||||
|
||||
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
|
||||
names(facg) <- names(x)
|
||||
|
||||
barplot(sort(fatg)[1:10], las=2, main="atg freq. by gene")
|
||||
barplot(sort(fgtg, dec=T)[1:10], las=2, main="gtg freq. by gene")
|
||||
barplot(sort(facg, dec=T)[1:10], las=2, main="acg freq. by gene")
|
||||
|
||||
ter <- cbind(fatg, fgtg, facg)
|
||||
colnames(ter) <- c("ATG", "GTG", "ACG")
|
||||
ternaryplot(ter, col=1, cex=0.5, id=rownames(ter), main="Starts by genes", labels="outside")
|
||||
|
||||
# -------------------------------
|
||||
# stops
|
||||
# -------------------------------
|
||||
|
||||
#
|
||||
# stop by org and gc
|
||||
#
|
||||
|
||||
tab <- sort(table(cds$stop), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="stop frequencies (%)")
|
||||
text(0.7, 80, round(tab[1], 2))
|
||||
text(1.9, 30, round(tab[2], 2))
|
||||
text(3.1, 25, round(tab[3], 2))
|
||||
|
||||
x <- split(cds$stop, cds$X.locus)
|
||||
|
||||
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
|
||||
names(ftaa) <- names(x)
|
||||
chromo$ftaa <- round(ftaa[chromo$X.locus], 2)
|
||||
|
||||
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
|
||||
names(ftag) <- names(x)
|
||||
chromo$ftag <- round(ftag[chromo$X.locus], 2)
|
||||
|
||||
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
|
||||
names(ftga) <- names(x)
|
||||
chromo$ftga <- round(ftga[chromo$X.locus], 2)
|
||||
|
||||
grd.hist(chromo, "ftaa", pos.quant=c(0.7, 0.6), main="Histogram of taa freq. by org")
|
||||
grd.hist(chromo, "ftag", pos.quant=c(0.8, 0.6), main="Histogram of tag freq. by org")
|
||||
grd.hist(chromo, "ftga", pos.quant=c(0.8, 0.6), main="Histogram of tga freq. by org")
|
||||
|
||||
grd.fplot(chromo, "gc", "ftaa", main="taa freq. by org GC", pos=c(0.2, 0.3))
|
||||
grd.fplot(chromo, "gc", "ftag", main="tag freq. by org GC")
|
||||
grd.fplot(chromo, "gc", "ftga", main="tga freq. by org GC")
|
||||
|
||||
ter <- cbind(ftaa, ftag, ftga)
|
||||
colnames(ter) <- c("TAA", "TAG", "TGA")
|
||||
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
|
||||
cols <- rainbow(10)[igc]
|
||||
ternaryplot(ter, col=cols, cex=0.2, main="Stops by org", labels="outside")
|
||||
|
||||
#
|
||||
# stop by common genes
|
||||
#
|
||||
|
||||
x <- split(cds$stop, cds$genefam)
|
||||
|
||||
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
|
||||
names(ftaa) <- names(x)
|
||||
|
||||
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
|
||||
names(ftag) <- names(x)
|
||||
|
||||
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
|
||||
names(ftga) <- names(x)
|
||||
|
||||
barplot(sort(ftaa), las=2, cex.names=0.5, ylim=c(0,100), main="taa freq. by gene")
|
||||
barplot(sort(ftag), las=2, cex.names=0.5, ylim=c(0,100), main="tag freq. by gene")
|
||||
barplot(sort(ftga), las=2, cex.names=0.5, ylim=c(0,100), main="tga freq. by gene")
|
||||
|
||||
ter <- cbind(ftaa, ftag, ftga)
|
||||
colnames(ter) <- c("TAA", "TAG", "TGA")
|
||||
ternaryplot(ter, col=1, cex=0.3, id=rownames(ter), main="Stops by genes", labels="outside")
|
||||
|
||||
# -------------------------------
|
||||
# splice junctions
|
||||
# -------------------------------
|
||||
|
||||
grd.titlepage("Splice Junctions")
|
||||
|
||||
grd.textpage(lineno=1, "# intron in core: ", nrow(intron))
|
||||
|
||||
#
|
||||
# intron size
|
||||
#
|
||||
|
||||
intron$size <- intron$to - intron$from + 1
|
||||
|
||||
grd.hist(intron, "size", pos.quant=NULL, main="Histogram of intron size", br=1000, xlim=c(0,2000))
|
||||
|
||||
#
|
||||
# nb intron / gene
|
||||
#
|
||||
|
||||
x <- split(intron, intron$genefam)
|
||||
x <- x[order(sapply(x, function(x) mean(x$intron_nb)), decreasing=T)]
|
||||
|
||||
nmax <- max(intron$intron_nb)
|
||||
lintron <- lapply(x, function(x) x$intron_nb)
|
||||
mintron <- sapply(lintron, function(x) table(factor(x, levels=1:nmax)))
|
||||
|
||||
lintron0 <- table(cds[cds$nexon == 1,"genefam"])[names(lintron)]
|
||||
mintron <- rbind("0"=lintron0, mintron)
|
||||
mintron <- t(t(mintron)/colSums(mintron))
|
||||
|
||||
mintron[mintron==0] <- NA
|
||||
|
||||
nn <- nrow(mintron)
|
||||
xx <- mintron[nn:1,]
|
||||
ll <- lapply(1:nn, function(i) xx[i,])
|
||||
mintron <- mintron[,do.call(order, c(ll, decreasing=T))]
|
||||
|
||||
battleship.plot(mintron, maxxspan=0.3, maxyspan=0.3,
|
||||
cex.labels=0.7,
|
||||
main="% intron per polyexonic gene")
|
||||
|
||||
#
|
||||
# acceptors / donors
|
||||
#
|
||||
|
||||
tab <- sort(table(intron$acc), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="acceptor frequencies (%)")
|
||||
text(0.7, 50, round(tab[1], 2))
|
||||
text(1.9, 3, round(tab[2], 2))
|
||||
text(3.1, 2.3, round(tab[3], 2))
|
||||
|
||||
tab <- sort(table(intron$don), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="donor frequencies (%)")
|
||||
text(0.7, 50, round(tab[1], 1))
|
||||
text(1.9, 40, round(tab[2], 1))
|
||||
text(3.1, 15, round(tab[3], 1))
|
||||
text(4.3, 12, round(tab[4], 1))
|
||||
|
||||
#
|
||||
# consensus all
|
||||
#
|
||||
|
||||
cons$all <- cons.build(intron$acceptor.donor)
|
||||
|
||||
invisible(sapply(rev(names(cons)), function(what) {
|
||||
cons.plot(cons[[what]], paste0("consensus ", what))
|
||||
}))
|
||||
|
||||
#
|
||||
# default consensus score by consensus length
|
||||
#
|
||||
|
||||
cons.def <- cons[["default"]]
|
||||
cons.def <- cons.def[,! is.nan(colSums(cons.def))]
|
||||
seq.def <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
||||
|
||||
epx <- apply(cons.def, 2, function(col) -sum(col * log(col, base=4)))
|
||||
opx <- order(epx)
|
||||
|
||||
conf.def <- lapply(seq(2, length(opx), by=2), function(n) {
|
||||
pos <- head(opx, n)
|
||||
notify(n, "/", length(opx))
|
||||
cons.confusion(cons.def, seq.def, thresh=0, pos=pos)
|
||||
})
|
||||
|
||||
acc <- sapply(conf.def, function(x) x$acc)
|
||||
sen <- sapply(conf.def, function(x) x$sen)
|
||||
sel <- sapply(conf.def, function(x) x$sel)
|
||||
|
||||
plot(sel, ylim=c(0.7, 1), pch=1, type="b", main="accuracy by nb consensus positions", ylab="")
|
||||
lines(sen, type="b", pch=2)
|
||||
lines(acc, type="b", pch=3)
|
||||
legend(1, 0.95, c("sensit.", "select.", "accur."), pch=1:3, horiz=T, bty="n")
|
||||
|
||||
#
|
||||
# default consensus score by genes
|
||||
#
|
||||
|
||||
conf.def <- cons.confusion(cons.def, seq.def, thresh=0)
|
||||
cons.histconf(conf.def)
|
||||
|
||||
sfam <- split(conf.def$l2scor, intron$genefam)
|
||||
sfam <- sfam[order(sapply(sfam, median))]
|
||||
|
||||
boxplot(sfam, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
|
||||
las=2, cex.axis=0.7, main="default junction logr score by genes")
|
||||
abline(h=0)
|
||||
|
||||
#
|
||||
# end
|
||||
#
|
||||
|
||||
if (OUT.DEV) {
|
||||
cat("+ plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
|
||||
invisible(dev.off())
|
||||
}
|
||||
|
||||
quit(save='no')
|
71
detectors/cds/tools/lib/summarize_cmp.r
Executable file
71
detectors/cds/tools/lib/summarize_cmp.r
Executable file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# plot summary graphics of comparisons
|
||||
#
|
||||
#
|
||||
|
||||
LIBDIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIBDIR == "") LIBDIR = "."
|
||||
|
||||
source(paste0(LIBDIR, "/util.plot.r"))
|
||||
|
||||
COLORS <- 2:10
|
||||
|
||||
#
|
||||
|
||||
OUT.DEV <- TRUE
|
||||
OUT.TYPE <- "pdf"
|
||||
OUT.FILE <- "compare"
|
||||
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
|
||||
|
||||
#
|
||||
|
||||
tab <- read.table("compare.txt", header=T, comment.char="", stringsAsFactors=F)
|
||||
|
||||
#
|
||||
|
||||
par(xpd=NA)
|
||||
|
||||
#
|
||||
|
||||
sel <- c("cor", "alcor", "acc", "wrong", "over", "misstot")
|
||||
tab$ptot <- rowSums(tab[,sel])
|
||||
for (s in sel)
|
||||
tab[,paste0("p", s)] <- tab[,s] * 100 / tab$ptot
|
||||
|
||||
colors <- head(COLORS, length(sel))
|
||||
|
||||
cols <- paste0("p", sel)
|
||||
ord <- order(tab$pcor+tab$palcor+tab$pacc, decreasing=T)
|
||||
|
||||
barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord],
|
||||
ylim=c(0,100), col=colors, las=2, cex.names=0.5)
|
||||
|
||||
legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)
|
||||
|
||||
#
|
||||
|
||||
sel <- c("cor", "alcor", "acc", "wrong", "over", "misschlo")
|
||||
tab$rtot <- rowSums(tab[,sel])
|
||||
for (s in sel)
|
||||
tab[,paste0("r", s)] <- tab[,s] * 100 / tab$rtot
|
||||
|
||||
colors <- head(COLORS, length(sel))
|
||||
|
||||
cols <- paste0("r", sel)
|
||||
ord <- order(tab$rcor+tab$ralcor+tab$racc, decreasing=T)
|
||||
|
||||
barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord],
|
||||
ylim=c(0,100), col=colors, las=2, cex.names=0.5)
|
||||
|
||||
legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)
|
||||
|
||||
#
|
||||
|
||||
if (OUT.DEV) {
|
||||
cat("# plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
|
||||
invisible(dev.off())
|
||||
}
|
||||
|
||||
quit(save='no')
|
||||
|
54
detectors/cds/tools/lib/summary.cmp.awk
Normal file
54
detectors/cds/tools/lib/summary.cmp.awk
Normal file
@ -0,0 +1,54 @@
|
||||
#
|
||||
#
|
||||
|
||||
function getOrg(s, _local_, a, na, org) {
|
||||
na = split(s, a, "/")
|
||||
na = split(a[na], a, "\\.")
|
||||
return a[1]
|
||||
}
|
||||
|
||||
|
||||
BEGIN {
|
||||
PROCINFO["sorted_in"] = "@ind_num_asc"
|
||||
print "#org tot cor alcor acc wrong over misstot misschlo missoth"
|
||||
}
|
||||
|
||||
/MISSED in ChloroDB/ {
|
||||
org = getOrg($1)
|
||||
Org[org]++
|
||||
Cnt[org]["MISSCHLORO"] = $2
|
||||
next
|
||||
}
|
||||
|
||||
/MISSED not in ChloroDB/ {
|
||||
org = getOrg($1)
|
||||
Org[org]++
|
||||
Cnt[org]["MISSNOTCHLORO"] = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^#/ { next }
|
||||
|
||||
/^.*:MATCH/ {
|
||||
org = getOrg($1)
|
||||
Org[org]++
|
||||
split($NF, a, "\\.")
|
||||
Cnt[org][a[1]]++
|
||||
}
|
||||
|
||||
END {
|
||||
for (org in Org) {
|
||||
Cnt[org]["TOTAL"] = Cnt[org]["CORRECT"] + Cnt[org]["ALMOST_CORRECT"] \
|
||||
+ Cnt[org]["ACCEPTABLE"] + Cnt[org]["WRONG"] \
|
||||
+ Cnt[org]["MISSED"]
|
||||
}
|
||||
for (org in Org) {
|
||||
print org, Cnt[org]["TOTAL"]+0, Cnt[org]["CORRECT"]+0, \
|
||||
Cnt[org]["ALMOST_CORRECT"]+0, Cnt[org]["ACCEPTABLE"]+0, \
|
||||
Cnt[org]["WRONG"]+0, Cnt[org]["OVERPRED"]+0, \
|
||||
Cnt[org]["MISSED"]+0, \
|
||||
Cnt[org]["MISSCHLORO"]+0, Cnt[org]["MISSNOTCHLORO"]+0
|
||||
|
||||
}
|
||||
|
||||
}
|
12
detectors/cds/tools/lib/util.base.r
Normal file
12
detectors/cds/tools/lib/util.base.r
Normal file
@ -0,0 +1,12 @@
|
||||
#
|
||||
# R basic utilities
|
||||
#
|
||||
|
||||
#
|
||||
# notify on stderr
|
||||
#
|
||||
|
||||
notify <- function(...) cat("+", ..., "\n")
|
||||
|
||||
|
||||
|
109
detectors/cds/tools/lib/util.cons.r
Normal file
109
detectors/cds/tools/lib/util.cons.r
Normal file
@ -0,0 +1,109 @@
|
||||
#
|
||||
# R consensus utilities
|
||||
#
|
||||
|
||||
#
|
||||
# compute consensus
|
||||
#
|
||||
|
||||
cons.build <- function(seqs, backcount=1) {
|
||||
xx <- do.call(rbind, sapply(seqs, strsplit, "", USE.NAMES=F))
|
||||
lv <- c("a", "c", "g", "t", ".", "-")
|
||||
mx <- apply(xx, 2, function(x) table(factor(x, levels=lv)))[1:4,]
|
||||
cx <- colSums(mx)
|
||||
mx <- mx + backcount
|
||||
mx[,cx==0] <- 0
|
||||
apply(mx, 2, function(x) x / sum(x))
|
||||
}
|
||||
|
||||
#
|
||||
# score consensus
|
||||
#
|
||||
|
||||
cons.score <- function(cons, seq, pos=1:ncol(cons)) {
|
||||
seq <- strsplit(seq, "")[[1]]
|
||||
if (length(seq) != ncol(cons)) {
|
||||
warning("incompatible seq and cons size")
|
||||
return(NA)
|
||||
}
|
||||
ppx <- sapply(pos, function(i) cons[seq[i],i])
|
||||
sum(log10(ppx+1e-6))
|
||||
}
|
||||
|
||||
#
|
||||
# logratio to uniform model score
|
||||
#
|
||||
|
||||
cons.logratio <- function(cons, seq, m0=NULL, pos=1:ncol(cons)) {
|
||||
if (is.null(m0)) {
|
||||
m0 <- matrix(rep(0.25, 4), nrow=4, ncol=ncol(cons))
|
||||
rownames(m0) <- c('a', 'c', 'g', 't')
|
||||
}
|
||||
|
||||
sc <- cons.score(cons, seq, pos=pos)
|
||||
sc0 <- cons.score(m0, seq, pos=pos)
|
||||
|
||||
2 * (log(10^sc, base=2) - log(10^sc0, base=2))
|
||||
}
|
||||
|
||||
#
|
||||
# shuffle sequence
|
||||
#
|
||||
|
||||
seq.shuf <- function(seq) {
|
||||
paste0(sample(strsplit(seq, "")[[1]], nchar(seq), replace=F), collapse="")
|
||||
}
|
||||
|
||||
#
|
||||
# compute confusion matrix between actual and shuffled sequences
|
||||
#
|
||||
|
||||
cons.confusion <- function(cons, seq, m0=NULL, pos=1:ncol(cons), thresh=0) {
|
||||
som <- function(x) sum(x, na.rm=T)
|
||||
|
||||
res <- list()
|
||||
res$l2scor <- l2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
|
||||
|
||||
seq <- sapply(seq, seq.shuf)
|
||||
res$r2scor <- r2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
|
||||
|
||||
res$conf <- conf <- matrix(c(som(l2scor >= thresh), som(l2scor < thresh),
|
||||
som(r2scor >= thresh), som(r2scor < thresh)),
|
||||
nrow=2, byrow=T)
|
||||
|
||||
res$acc <- sum(diag(conf)) / sum(conf)
|
||||
res$sen <- conf[1,1] / sum(conf[1,])
|
||||
res$sel <- conf[1,1] / sum(conf[,1])
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#
|
||||
# plot consensus
|
||||
#
|
||||
|
||||
|
||||
cons.plot <- function(cons, main="consensus") {
|
||||
cols <- c("blue", "orange", "red", "green")
|
||||
bp <- barplot(cons, col=cols, ylim=c(0,1), main=main)
|
||||
plx <- apply(cons, 2, function(col) -sum(col * log(col+1e-6, base=4)))
|
||||
lines(bp, plx, type="b", pch=19)
|
||||
legend(0, 1.1, c("a","c","g","t"), fill=cols, horiz=T, xpd=NA, bty="n")
|
||||
legend(20, 1.1, "entropy", pch=19, horiz=T, xpd=NA, bty="n")
|
||||
invisible()
|
||||
}
|
||||
|
||||
#
|
||||
# plot confusion scores histograms
|
||||
#
|
||||
|
||||
cons.histconf <- function(conf, main="junction logr score") {
|
||||
lrh <- hist(c(conf$l2scor, conf$r2scor), br=50, plot=F)
|
||||
lh <- hist(conf$l2scor, br=lrh$breaks, plot=F)
|
||||
rh <- hist(conf$r2scor, br=lrh$breaks, plot=F)
|
||||
xx <- rbind(lh$counts, rh$counts) / sum(lh$counts)
|
||||
colnames(xx) <- lrh$mids
|
||||
barplot(xx, col=c(3,2), beside=T, main=main)
|
||||
legend(0, 0.1, c("true", "shuffled"), fill=c(3,2), horiz=F, xpd=NA)
|
||||
invisible()
|
||||
}
|
93
detectors/cds/tools/lib/util.grid.r
Normal file
93
detectors/cds/tools/lib/util.grid.r
Normal file
@ -0,0 +1,93 @@
|
||||
#
|
||||
# R misc grid plotting
|
||||
#
|
||||
|
||||
require(grid)
|
||||
require(gridExtra)
|
||||
|
||||
#
|
||||
# get line height
|
||||
#
|
||||
|
||||
grd.lineheight <- function(s="X") {
|
||||
convertHeight(unit(1,"strheight", s), "native", valueOnly=T)
|
||||
}
|
||||
|
||||
#
|
||||
# quantile table
|
||||
#
|
||||
|
||||
grd.qtab <- function(df, what, cols, n=5) {
|
||||
df <- df[order(df[,what], decreasing=T),cols]
|
||||
sep <- head(df,1)
|
||||
sep[] <- "-"
|
||||
rbind(head(df, n), sep, tail(df, n))
|
||||
}
|
||||
|
||||
#
|
||||
# histogram with tables
|
||||
#
|
||||
|
||||
grd.hist <- function(df, what, cols = c(1,2, which(colnames(df) == what)),
|
||||
breaks=50, pos.sum=c(0.2,0.6), pos.quant=c(0.7,0.6), cex=0.7,
|
||||
main=paste0("Histogram of ", what), ...) {
|
||||
hist(df[,what], breaks=breaks, xlab=what, main=main, ...)
|
||||
if (! is.null(pos.sum)) {
|
||||
pushViewport(viewport(pos.sum[1], pos.sum[2], gp=gpar(cex=cex)))
|
||||
grid.table(x<-summary(df[,what]), rows=names(x))
|
||||
popViewport()
|
||||
}
|
||||
if (! is.null(pos.quant)) {
|
||||
pushViewport(viewport(pos.quant[1], pos.quant[2], gp=gpar(cex=cex)))
|
||||
grid.table(grd.qtab(df, what, cols), rows=NULL)
|
||||
popViewport()
|
||||
}
|
||||
invisible()
|
||||
}
|
||||
|
||||
#
|
||||
# plot with fit
|
||||
#
|
||||
|
||||
grd.fplot <- function(df, what.x, what.y, linfit=T, pos=c(0.2, 0.8), ablin=NULL, ...) {
|
||||
plot(df[,what.x], df[,what.y], xlab=what.x, ylab=what.y, ...)
|
||||
if (linfit) {
|
||||
fit <- lm(df[,what.y] ~ df[,what.x])
|
||||
abline(fit, col=2)
|
||||
pushViewport(viewport(gp=gpar(col=2)))
|
||||
a <- sprintf("%.2e", coef(fit)[2])
|
||||
b <- sprintf("%.2e", coef(fit)[1])
|
||||
grid.text(paste0(what.y, " = ", a, " * ", what.x, " + ", b),
|
||||
pos[1], pos[2], just="left")
|
||||
pos[2] = pos[2] - 2 * grd.lineheight()
|
||||
grid.text(paste0("R2=", round(summary(fit)$r.squared, 3)),
|
||||
pos[1], pos[2], just="left")
|
||||
popViewport()
|
||||
}
|
||||
if (! is.null(ablin))
|
||||
do.call(abline, ablin)
|
||||
invisible()
|
||||
}
|
||||
|
||||
#
|
||||
# write text
|
||||
#
|
||||
|
||||
grd.textpage <- function(..., lineno=0, left=0.1, top=0.9, cex=1, fact=1.4) {
|
||||
txt <- do.call(paste0, list(...))
|
||||
pushViewport(viewport(gp=gpar(cex=cex)))
|
||||
grid.text(txt, left, top-lineno*grd.lineheight()*fact, just="left")
|
||||
popViewport()
|
||||
invisible(txt)
|
||||
}
|
||||
|
||||
#
|
||||
# title page
|
||||
#
|
||||
|
||||
grd.titlepage <- function(title, x=0.5, y=0.7, cex=3, ...) {
|
||||
notify("processing", title)
|
||||
grid.newpage()
|
||||
grid.text(title, x, y, gp=gpar(cex=cex), ...)
|
||||
invisible()
|
||||
}
|
90
detectors/cds/tools/lib/util.modelio.r
Normal file
90
detectors/cds/tools/lib/util.modelio.r
Normal file
@ -0,0 +1,90 @@
|
||||
#
|
||||
# R models I/O utilities
|
||||
#
|
||||
|
||||
#
|
||||
# write start model
|
||||
#
|
||||
|
||||
write.model.start <- function(frq, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/start.", what, ".frq")
|
||||
notify("writing start model:", fil)
|
||||
cat("# start model :", what, "\n", file=fil)
|
||||
for (x in names(frq))
|
||||
cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write stop model
|
||||
#
|
||||
|
||||
write.model.stop <- function(frq, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/stop.", what, ".frq")
|
||||
notify("writing stop model:", fil)
|
||||
cat("# stop model :", what, "(freq. ignored)\n", file=fil)
|
||||
for (x in names(frq))
|
||||
cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write splice3 model
|
||||
# [FIXME] positions are hard-coded
|
||||
#
|
||||
|
||||
write.model.splice3 <- function(cons, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/splice3.", what, ".frq")
|
||||
notify("writing splice3 model:", fil)
|
||||
.catcons <- function(i) {
|
||||
cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
|
||||
file=fil, append=T)
|
||||
}
|
||||
cat("# 3' splice model :", what, "\n", file=fil)
|
||||
cat("# A C G T\n", file=fil, append=T)
|
||||
sapply(seq.int(1, 4), .catcons)
|
||||
cat("splice\n", file=fil, append=T)
|
||||
sapply(seq.int(6, 11), .catcons)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write splice5 model
|
||||
# [FIXME] positions are hard-coded
|
||||
#
|
||||
|
||||
write.model.splice5 <- function(cons, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/splice5.", what, ".frq")
|
||||
notify("writing splice5 model:", fil)
|
||||
.catcons <- function(i) {
|
||||
cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
|
||||
file=fil, append=T)
|
||||
}
|
||||
cat("# 5' splice model :", what, "\n", file=fil)
|
||||
cat("# A C G T\n", file=fil, append=T)
|
||||
sapply(seq.int(13, 18), .catcons)
|
||||
cat("splice\n", file=fil, append=T)
|
||||
sapply(seq.int(20, 23), .catcons)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write splice3/5 uniform model
|
||||
#
|
||||
|
||||
write.unif.splice <- function(pos, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/splice", pos, ".", what, ".frq")
|
||||
notify("writing uniform splice", pos, "model:", fil)
|
||||
cat("# 3'/5' splice null model", file=fil)
|
||||
cat("# A C G T\n", file=fil, append=T)
|
||||
cat("25 25 25 25\n", file=fil, append=T)
|
||||
cat("splice\n", file=fil, append=T)
|
||||
cat("25 25 25 25\n", file=fil, append=T)
|
||||
invisible(fil)
|
||||
}
|
||||
|
105
detectors/cds/tools/lib/util.plot.r
Normal file
105
detectors/cds/tools/lib/util.plot.r
Normal file
@ -0,0 +1,105 @@
|
||||
#
|
||||
# R plot utilities
|
||||
#
|
||||
|
||||
#
|
||||
# setup graphic device
|
||||
# tiff: high resolution 600 dpi
|
||||
# pdf
|
||||
#
|
||||
|
||||
uplot.init.dev <- function(fname, type="pdf", width=7, height=7, resol=600, ...) {
|
||||
fname <- paste0(fname, ".", type)
|
||||
res <- NULL
|
||||
if (type == "tiff") {
|
||||
res <- tiff(fname, width=width, height=height, units="in", res=resol, ...)
|
||||
}
|
||||
if (type == "pdf") {
|
||||
res <- pdf(fname, width=width, height=height, ...)
|
||||
}
|
||||
invisible(res)
|
||||
}
|
||||
|
||||
#
|
||||
# convert pdf to tiff using ghostscript
|
||||
#
|
||||
|
||||
uplot.convert2tiff <- function(fname, resol=600) {
|
||||
infile <- paste0(fname, ".pdf")
|
||||
oufile <- paste0(fname, ".tif")
|
||||
cmd <- paste0("echo quit | gs -r", resol, "-dBATCH -dNOPAUSE -sDEVICE=tiff12nc -sCompression=lzw -sOutputFile=", oufile, " ", infile)
|
||||
system(cmd)
|
||||
}
|
||||
|
||||
#
|
||||
# default plot setup
|
||||
#
|
||||
|
||||
uplot.setup <- function(mfrow=c(1,1),
|
||||
las=1,
|
||||
mgp=c(2, 0.7, 0),
|
||||
oma=c(0, 0, 0, 0),
|
||||
mar=c(4, 3, 3, 2),
|
||||
cex.main=1,
|
||||
font.main=1,
|
||||
family='Helvetica', ...) {
|
||||
par(mfrow=mfrow, las=las, mgp=mgp, oma=oma, mar=mar, cex.main=cex.main, font.main=font.main, family=family, ...)
|
||||
}
|
||||
|
||||
#
|
||||
# pie plot
|
||||
#
|
||||
|
||||
uplot.pie <- function(tab, main="", labels=c("name", "val", "per"), text.r=0.5, text.col="black", text.cex=1, main.pos=c(0,0), main.col="black", ...) {
|
||||
pie(tab, edges=2000, main="", labels="", ...)
|
||||
text(main.pos[1], main.pos[2], main, cex=1.5, col=main.col)
|
||||
prop <- tab/sum(tab)
|
||||
theta <- 2*pi * (cumsum(prop) - prop/2)
|
||||
lab <- list(name=names(tab), val=tab, per=sprintf("%d%%", round(prop*100)))
|
||||
lab <- apply(data.frame(lab[labels]), 1, paste, collapse="\n")
|
||||
if (length(lab) > 0)
|
||||
text(text.r*cos(theta), text.r*sin(theta), lab, cex=text.cex, col=text.col)
|
||||
invisible(NULL)
|
||||
}
|
||||
|
||||
#
|
||||
# plot utility : color representation of a table
|
||||
#
|
||||
|
||||
uplot.table <- function(tab, col=heat.colors(100), with.lines=TRUE) {
|
||||
image(as.matrix(tab), xaxt="n", yaxt="n", col=col)
|
||||
nli <- nrow(tab)
|
||||
nco <- ncol(tab)
|
||||
dx <- 0.5 / (nli-1)
|
||||
dy <- 0.5 / (nco-1)
|
||||
xf <- (seq_len(nli)-1)/(nli-1) - dx
|
||||
yf <- (seq_len(nco)-1)/(nco-1) - dy
|
||||
if (with.lines) {
|
||||
segments(xf, -dy, xf, 1+dy)
|
||||
segments(-dx, yf, 1+dx, yf)
|
||||
}
|
||||
Axis(c(0,1), at=xf+dx, side=1, labels=rownames(tab), las=2, cex.axis=0.5, padj=0)
|
||||
Axis(c(0,1), at=yf+dy, side=2, labels=colnames(tab), las=2, cex.axis=0.5, padj=0)
|
||||
invisible(NULL)
|
||||
}
|
||||
|
||||
#
|
||||
# plot utility : identify points within user's rectangle
|
||||
#
|
||||
|
||||
rect.identify <- function(data) {
|
||||
if (is.null(dim(data))) data <- cbind(seq_along(data), data)
|
||||
xy <- locator(n=2, type='n')
|
||||
r <- matrix(c(range(xy$x), range(xy$y)), ncol=2, byrow=TRUE)
|
||||
rect(r[1,1], r[2,1], r[1,2], r[2,2], border='red')
|
||||
.in.range <- function(p, r) {
|
||||
.in.int <- function(i) {
|
||||
(p[i] >= r[i,1]) && (p[i] <= r[i,2])
|
||||
}
|
||||
.in.int(1) && .in.int(2)
|
||||
}
|
||||
isel <- which(apply(data, 1, .in.range, r))
|
||||
points(data[isel,], col='red', pch=19)
|
||||
isel
|
||||
}
|
||||
|
Reference in New Issue
Block a user