e4d6a8484da5a00bbbbbce0d27957f2fb9ba0ed9

@ -0,0 +1,254 @@
#!/bin/csh -f
#
# make ChloroDB's
#
# usage: copy genbank/embl files into 'DB_DIR/download'
# usage: [create a paramter.sh file in 'DB_DIR']
# usage: go_chlorodb [DB_DIR]
#
unsetenv ORG_SOURCED

setenv ORG_HOME `dirname $0`/../../../..
source $ORG_HOME/scripts/csh_init.sh

#
# which DB to process
#

set DB_BASE = $DATA_DIR/cds/chlorodb  # default location

if ($#Argv > 0) then
  set DB_BASE = $Argv[1]; Shift
endif

set DB_BASE = `cd $DB_BASE && pwd -P`

NeedDir $DB_BASE/download

if (! -d $DB_BASE/info)  mkdir $DB_BASE/info
if (! -d $DB_BASE/fasta) mkdir $DB_BASE/fasta

cd $DB_BASE/info

#
# params
#

if (! -e $DB_BASE/parameters.sh) then
  @ n = `find $DB_BASE/download -depth 1 -type f -print | wc -l`
  @ cor_cutoff = $n / 2
  @ atg_cutoff = $n / 10
  @ dbs_cutoff = $n / 4
  if ($cor_cutoff == 0) @ cor_cutoff = 1
  if ($atg_cutoff == 0) @ atg_cutoff = 1
  if ($dbs_cutoff == 0) @ dbs_cutoff = 1
  echo "# sourced file"                          >  $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set CORE_NCDS_CUTOFF      = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_ATG_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_DFT_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_OTH_CUTOFF = 10"          >> $DB_BASE/parameters.sh
  echo "set CORE_STOP_CUTOFF      = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_SPLICE_CUTOFF    = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set SHEL_NCDS_CUTOFF      = 10"          >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set CORE_DELTA            = Inf"         >> $DB_BASE/parameters.sh
  echo "set CORE_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set CORE_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set CORE_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set CORE_SIZMIN           = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set SHEL_DELTA            = 0.5"         >> $DB_BASE/parameters.sh
  echo "set SHEL_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set SHEL_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set SHEL_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set SHEL_SIZMIN           = $dbs_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set DUST_DELTA            = 0.5"         >> $DB_BASE/parameters.sh
  echo "set DUST_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set DUST_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set DUST_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set DUST_SIZMIN           = 10"          >> $DB_BASE/parameters.sh
  
endif

source $DB_BASE/parameters.sh

##set CMIN_COD = 0
##set FMIN_COD = 0.01

#
# temporarily uncompress
#

set ff = `find $DB_BASE/download -depth 1 -name \*.gz -print`

if ($#ff != 0) then
  Notify "uncompressing $#ff entries"
  foreach f ($ff)
    gunzip -f $f
  end
endif

#
# convert gbk/embl to fasta
#

set ff = `find $DB_BASE/download -depth 1 \( -name \*.gbk -or -name \*.embl \) -print`

Notify "convert $#ff gbk/embl entries to fasta"

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.tofasta.awk $f > $DB_BASE/fasta/$nom.fst
end

#
# get gbk/embl info
#

Notify "get gbk/embl info for $#ff entries"

echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt  # just get header

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
  egrep -v '^#' >> db.info.txt
end

#
# get cds info
#

Notify "get gbk/embl cds for $#ff entries"

echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt  # just get header

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
          -f $LIB_DIR/$typ.cds_long.awk |\
  egrep -v '^#' >> db.cds.txt
end

#
# get fasta for  prots
#

Notify "get prots"
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst

#
# get introns
#

Notify "get gbk/embl introns for $#ff entries"

echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt  # just get header

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
          -f $LIB_DIR/$typ.intron.awk |\
  egrep -v '^#' >> db.intron.txt
end

#
# make models
#

Notify "Making models"

echo -n ""                                              >  db.models.params.txt
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF"            >> db.models.params.txt
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF"  >> db.models.params.txt
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF"  >> db.models.params.txt
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF"  >> db.models.params.txt
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF"            >> db.models.params.txt
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF"        >> db.models.params.txt
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF"            >> db.models.params.txt

$LIB_DIR/make.models.r |& Cat

GetStatus
OnError then 
  Error 2 "model parameter too stringent"
endif

#
# add matrices
#

cp -f $PROG_DIR/matrices/* models

#
# make subDBs
#

if (-e db.core.pat.txt) then
  Notify "Making core DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
    $CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN
endif

if (-e db.shell.pat.txt) then
  Notify "Making shell DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
    $SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN
endif

if (-e db.dust.pat.txt) then
  Notify "Making dust DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
    $DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
endif

#
# recompress entries
#

set ff = `find $DB_BASE/download -depth 1 -type f -print`

if ($#ff != 0) then
  Notify "recompressing $#ff entries"
  foreach f ($ff)
    gzip -f $f
  end
endif

# compress fasta

set ff = `find $DB_BASE/fasta -depth 1 -name \*.fst -print`

if ($#ff != 0) then
  Notify "compressing $#ff fasta entries"
  foreach f ($ff)
    gzip -f $f
  end
endif

# install everything in proper directory

foreach dir ("core" "shell" "dust")
  if (-e $DB_BASE/$dir) \rm -r $DB_BASE/$dir
  if ((-d db.$dir.pat.db) && (-e db.$dir.pat.db/Annot.lst)) then
    Notify "installing $DB_BASE/$dir"
    \mv -f db.$dir.pat.db $DB_BASE/$dir
  endif
end

if (-e $DB_BASE/models) \rm -r $DB_BASE/models
if (-d models) \mv -f models $DB_BASE

Notify "Done"
exit 0

@ -0,0 +1,29 @@
#
# blosum62 substitution matrix
# with larger penalty for stops
#
  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A 4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -50
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -50
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -50
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -50
C 0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -50
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -50
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -50
G 0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -50
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -50
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -50
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -50
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -50
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -50
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -50
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -50
S 1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -50
T 0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -50
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -50
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -50
V 0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -50
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -50
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -50
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -50
* -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 1
@ -0,0 +1,195 @@
#!/bin/csh -f
#
# usage: go_subdb.sh prot.fst pat.txt [deltalen covmin pmax idmin sizmin]
# usage: prot.fst : proteins fasta file
# usage: pat.txt  : text file containing patterns and names for families to extract
# usage: output directory containig subdbs : basename <pat:r>.db
#

unsetenv ORG_SOURCED

setenv ORG_HOME `dirname $0`/../../../../..
source $ORG_HOME/scripts/csh_init.sh

NeedArg 2

set ProtFile = $Argv[1]; Shift
set PatFile  = $Argv[1]; Shift

NeedFile $ProtFile
NeedFile $PatFile

#
# parameters
#

set Delta  = 0.5
set Covmin = 30
set Pmax   = 1e-6
set Idmin  = 30
set Sizmin = 5

if ($#Argv > 0) then
  set Delta = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Covmin = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Pmax = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Idmin = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Sizmin = $Argv[1]; Shift
endif

#
# output directory
#

set OutDir = `basename $PatFile:r`.db

if (-d $OutDir) \rm -r $OutDir
mkdir $OutDir

set OutLog = `basename $PatFile:r`.log

echo -n '' > $OutLog

alias Report 'egrep "^>" \!:1 | wc -l | awk -v P=`basename \!:1` -v H=\!:2 '"'{print H,P,"'$1}'"'"' >> $OutLog'

#
# remove entries with bad symbols
#

Notify "cleanup $ProtFile"

Report $ProtFile "init_size"

$AwkCmd -f $LIB_DIR/db.filter.sym.awk $ProtFile > P_$$

Report $ProtFile "cleanup_size"

#
# select by name pattern
#

Notify "select by patterns"

mkdir D_$$
mkdir E_$$
mkdir F_$$

set noms = `awk '{print $1}' $PatFile`

foreach nom ($noms)
  set pat = `egrep "^$nom " $PatFile | awk '{print $2}'`
  $AwkCmd -f $LIB_DIR/db.filter.pat.awk -v PAT="$pat" P_$$ > D_$$/$nom.fst
  set n = `egrep '^>' D_$$/$nom.fst | wc -l`
  Notify "  pattern : $nom : $n"
  Report D_$$/$nom.fst "pattern_filter"
  if ($n <= $Sizmin) \rm -f D_$$/$nom.fst
end

set ok = `ls D_$$ | wc -l`
if ($ok == 0) goto fin

#
# select by length
#

Notify "select by length"

foreach f (D_$$/*.fst) 
  set nom = `basename $f:r`
  $AwkCmd -f $LIB_DIR/db.getlen.awk $f > L_$$
  $LIB_DIR/db.filter.len.r L_$$ $Delta |\
    $AwkCmd '($NF == "TRUE") {print $2}' > M_$$
  $AwkCmd -v FILE=M_$$ -f $LIB_DIR/db.subdb.awk $f > E_$$/$nom.fst
  Report E_$$/$nom.fst "length_filter"
  set n = `egrep '^>' E_$$/$nom.fst | wc -l`
  Notify "  length filter : $nom : $n"
  if ($n <= $Sizmin) \rm -f E_$$/$nom.fst
end

set ok = `ls E_$$ | wc -l`
if ($ok == 0) goto fin


#
# select by similarity
#

Notify "select by similarity"

foreach f (E_$$/*.fst) 
  set nom = `basename $f:r`

  Notify "  blasting $nom"
  
  makeblastdb -dbtype 'prot' -in $f >>& db.log
  blastp -db $f -query $f -outfmt 7 > $f.blast.out
  \rm -f $f.p??
  
  $AwkCmd -v COVMIN=$Covmin -v PMAX=$Pmax -v IDMIN=$Idmin \
      -f $LIB_DIR/db.blastlink.awk $f.blast.out |\
      $AwkCmd -f $LIB_DIR/db.todl.awk > G_$$
      
  ($LIB_DIR/db.cc.r G_$$ > $f.cc.txt) >>& db.log
  
  awk -v NAME=$nom -f $LIB_DIR/db.reportcc.awk $f.cc.txt >> $OutLog 
  
  $AwkCmd -f $LIB_DIR/db.selcc.awk $f.cc.txt > S_$$
  $AwkCmd -v FILE=S_$$ -f $LIB_DIR/db.subdb.awk $f > F_$$/$nom.fst

  Report F_$$/$nom.fst "similarity_filter"

  set n = `egrep '^>' F_$$/$nom.fst | wc -l`
  Notify "  blast filter : $nom : $n"
  if ($n <= $Sizmin) \rm -f F_$$/$nom.fst
  
end

set ok = `ls D_$$ | wc -l`
if ($ok == 0) goto fin

#
# annotations
#

echo -n "" > J_$$

foreach f (F_$$/*.fst) 
  $AwkCmd -f $LIB_DIR/db.annot.awk $f >> J_$$
end

awk '(NF >= 3) {print $1, $NF}' $PatFile | sort > A_$$
sort J_$$ | egrep -v '^ *$' > B_$$
join A_$$ B_$$ > F_$$/Annot.lst

#
# copy files
#

set n = `ls F_$$/* | wc -l`
Notify "copy $n files to $OutDir"

\mv -f F_$$/* $OutDir

#
# end
#

fin:
Notify "output directory : $OutDir"

\rm -r ?_$$


exit 0
@ -0,0 +1,39 @@
#

/^>/ {
  N++
  na = split($1, a, "@")
  if (a[na-1] > NEXMAX) NEXMAX = a[na-1]
  NEX[a[na-1]]++
  ANNOT[$NF]++
}

END {
  na = split(FILENAME, a, "/")
  na = split(a[na], a, "\\.")
  printf("%s %d ", a[1], N)
  s = ""
  for (i = 1 ; i <= NEXMAX ; i ++) {
    if (NEX[i] != 0)
      s = s "" i ":" NEX[i] "_"
  }
  gsub("_+$", "", s)
  printf("%s ", s)
  
  s = (NEXMAX == 1) ? "MONEX" : "POLYEX"
  printf("%s ", s)
  
  nmax = 0
  amax = "none"
  for (e in ANNOT) {
    if (ANNOT[e] > nmax) {
      nmax = ANNOT[e]
      amax = e
    }
  }
  print amax
  
}


@ -0,0 +1,48 @@
#

function min(x, y) {
  return ((x < y) ? x : y)
}

BEGIN {
  if (COVMIN == "") COVMIN = 50
  if (PMAX == "")   PMAX   = 1e-6
  if (IDMIN == "")  IDMIN  = 30
}

/^#/ {
  hitnum = 0;
  next;
}

{
  if ($1 == $2) next
  
  hitnum++;
  
  na = split($1, a, "@");
  if (na < 2) {
    print "query file not properly formatted" > "/dev/stderr"
    exit(1);
  }
  len1  = a[na];

  na = split($2, a, "@");
  if (na < 2) {
    print "bank file not properly formatted" > "/dev/stderr"
    exit(1);
  }
  len2 = a[na];
  
  id  = $3 + 0.0;
  ali = $4;

  covmin = ali * 100. / min(len1, len2);
  
  proba = $11 + 0.0;
  
  if ((covmin > COVMIN) && ((proba < PMAX) || (proba == 0)) && (id > IDMIN)) {
    print $1, $2, hitnum, id, covmin, proba, ali, len1, len2;
  }
}

@ -0,0 +1,18 @@
#!/usr/bin/env Rscript
#

require(igraph, warn.conflicts=F)

args <- commandArgs(T)
path <- if(length(args) > 0) args[1] else 'graph.dl'

g <- read.graph(path, format='dl')

cc <- clusters(g)

res <- cbind(V(g)$name, membership(cc))

write.table(res, quote=FALSE, row.names=FALSE, col.names=FALSE)

quit(save="no")

@ -0,0 +1,19 @@
#!/usr/bin/env Rscript
#

args <- commandArgs(T)
path  <- if(length(args) > 0) args[1] else 'len.txt'
delta <- if(length(args) > 1) args[2] else 0.5

tab <- read.table(path, header=T)

lmed <- median(tab$len)

dlen <- lmed * as.numeric(delta)

tab$ok <- (abs(tab$len-lmed)/lmed) <= delta

write.table(tab, quote=F)

quit(save='no')

@ -0,0 +1,10 @@
#

/^>/ {
  split($1, a, "@")
  ok = a[3] ~ PAT 
}

ok {
  print $0
}
@ -0,0 +1,30 @@
#
#
#

function Check(seq) {
  if (seq == "") return 0
  gsub("[ACDEFGHIKLMNPQRSTVWXY\n]+", "", seq)
  return (length(seq) == 0)
}

/^>/ {
  if (Check(Seq)) {
    print Name
    printf("%s", Seq)
  }
  Name = $0
  Seq = ""
  next
}

{
  Seq = Seq "" $0 "\n"
}

END {
  if (Check(Seq)) {
    print Name
    printf("%s", Seq)
  }
}
@ -0,0 +1,10 @@
#
BEGIN {
  print "id len"
}

/^>/ {
 na = split($1, a, "@")
 print substr($1, 2), a[na]
}

@ -0,0 +1,15 @@
#
#

{
  cnt[$NF]++
}

END {
  n = asort(cnt)
  printf("cc_size %s", NAME)
  for (i = n ; i >= 1 ; i--)
    printf(" %d", cnt[i])
  print ""
}

@ -0,0 +1,19 @@
#

{
  N[$NF]++
  E[$NF, N[$NF]] = $1
}

END {
  cmax = 1
  nmax = N[1]
  for (i in N) {
    if (N[i] > nmax) {
      nmax = N[i]
      cmax = i
    }
  }
  for (i = 1 ; i <= nmax ; i++)
    print E[cmax, i]
}
@ -0,0 +1,17 @@
#

BEGIN {
  if (FILE == "") FILE = "db.sel.txt"
  while (getline < FILE)
    INC[$1] = $1
  close(FILE)
}

/^>/ {
  name = substr($1, 2)
  ok = name in INC
}

ok {
  print $0
}
@ -0,0 +1,21 @@
#

{
  node[$1]++
  node[$2]++
  link[++M] = $1 " " $2
}


END {
 for (n in node)
   N++
 print "DL n=" N
 print "format = edgelist1"
 print "labels embedded:"
 print "data:"
 for (i = 1 ; i <= M ; i++)
   print link[i]
}


@ -47,8 +47,8 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$PrdType.cds_short.awk > P_$$

Notify "compare bank to predictions"

$AwkCmd -f $LIB_DIR/libnws.awk      \
        -f $LIB_DIR/compareCds.awk  \
$AwkCmd -f $LIB_DIR/libnws.awk       \
        -f $LIB_DIR/compare.cds.awk  \
        R_$$ P_$$ > S_$$

# base statistics

@ -15,6 +15,8 @@ NeedArg 1

egrep '^#|^MATCH' $* | awk -f $LIB_DIR/summary.cmp.awk > compare.txt

Notify "text file: compare.txt"

$LIB_DIR/summarize_cmp.r 


@ -0,0 +1,19 @@
#
# get fasta sequence from cds list 
#
# [-v FIELD=13] CDS sequence
# [-v FIELD=14] Prot Sequence
#

BEGIN {
  if (CHARPERLINE == "") CHARPERLINE = 50
  if (FIELD == "") FIELD = 14
}

/^#/ { next }

{
  name = $1 "@" $2 "@" $3 "@" $5 "@" $6 "@" $7 "@" $8 "@" int($9/3)
  comment = $NF
  PrintFasta($FIELD, name " " comment)
}
@ -0,0 +1,97 @@
#
# get cds features from embl (long version)
#
# -v FASTA

# @include lib.embl.awk


BEGIN {
  print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"

  if (HEADONLY != "") exit(0)

  if (MAXSPAN == "") MAXSPAN = 10000

  if (FASTA == "") Error("No FASTA file specified", 1)
  
  if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)

  Seq = tolower(ReadFasta(FASTA))
}

/^ID / {
  locus = $2
  gsub(";", "", locus)
  incds = 0
  next
}

/^FT   CDS/ {
  revstrand = match($3, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  nexon = Nexons(s)
  SpanLocation(s, sloc)
  spanlen = sloc[2] - sloc[1] + 1
  len = LenLocation(s)
  ok = ok && (len < MAXSPAN)
  cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
  cstart = substr(cdsseq, 1,3)
  cstop  = substr(cdsseq, length(cdsseq)-2)

  gene = "none"
  locustag = "none"
  product = "none"
  translation = "X"
  incds = 1
  next
}

(incds && /^FT   [^ ]/) {
  print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
        nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
  incds = 0
  next
}

/^FT                   \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^FT                   \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^FT                   \/product=/ {
  split($0, a, "=")
  product = a[2]
  gsub("\"", "", product)
  gsub(" ", "_", product)
  next
}

/^FT                   \/translation=/ {
  split($0, a, "=")
  translation = a[2]
  gsub("\"", "", translation)
  gsub(" ", "", translation)
  next
}

END {
  if (incds) {
    print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
          nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
  }
}
@ -0,0 +1,97 @@
#
# get feature info from embl
#

# @include libgbk.awk

function GC(s, _local_, i, len) {
  s = toupper(s)
  len = length(s)
  gsub("G|C", "", s)
  return ((len - length(s)) * 100 / (len ? len : 1))
}

#
# rules
#

BEGIN {
  print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
}

/^ID/ {
  locus = $2
  gsub(";", "", locus)
  next
}

/^OS/ {
  orga = substr($0, 6)
  gsub(" ", "_", orga)
  next
}

/^FT   source/ {
  GetLoc($3, loc);
  len = loc[2];
  next
}

/^FT   CDS/ {
  meanCds = meanCds * nbCds + LenLocation($3)
  nbCds++
  meanCds /= nbCds
  n = Nexons($3)  
  if (n > 3) n = 3
  nbCdx[n]++
  next
}

/^FT   tRNA/ {
  nbTrna++
  next
}

/^FT   rRNA/ {
  nbRrna++
  next
}

/^FT   mRNA/ {
  next
}

/^FT   .*RNA/ {
  nbOrna++
  next
}

/^SQ / {
  inseq = 1
  seq = ""
  next
}

inseq && /^     / {
  s = $0
  gsub("[0-9]+", "", s)
  gsub(" ", "", s)
  seq = seq "" s
  next
}

/^\/\// {
  oklen = (len == length(seq) ? "ok" : "wrong")
  gc = GC(seq)
  print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
         nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
         meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
  nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
  delete nbCdx
  orga = locus = "?"
}

  
@ -0,0 +1,97 @@
#
# get intron features from embl
#

# @include libembl.awk

BEGIN {
  print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"

  if (HEADONLY != "") exit(0)

  if (FASTA == "") Error("No FASTA file specified", 1)
  
  if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)

  Seq = tolower(ReadFasta(FASTA))
}

/^ID / {
  locus = $2
  gsub(";", "", locus)
  next
}

/^FT   CDS/ {
  revstrand = match($3, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  if (! ok) next

  na = ParseLocation(s, locs)
  if (na < 2) next

  delete SINfo
  Ninfo = 0
  
  val = locs[1][1]
  for (i = 2 ; i <= na ; i++) {
    if (locs[i][1] < val) ok = 0
    val = locs[i][1]
  }
  if (! ok) next

  val = locs[1][2]
  for (i = 2 ; i <= na ; i++) {
    if (locs[i][2] < val) ok = 0
    val = locs[i][2]
  }
  if (! ok) next

  from = locs[1][2] + 1
  for (i = 2 ; i <= na ; i++) {
    to = locs[i][1] - 1
    inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
    SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
                     (revstrand ? na-i+1 : i-1) " " na-1 " "\
                     substr(inseq, 1,4) "."\
                     substr(inseq, 5,6) "-"\
                     substr(inseq, length(inseq)-9, 6) "."\
                     substr(inseq, length(inseq)-3, 4) " "\
                     "ok"
    from = locs[i][2] + 1
  }
  
  gene = "none"
  locustag = "none"
  next
}

/^FT                   \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^FT                   \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^FT                   \/translation=/ {
  for (i = 1 ; i <= Ninfo ; i++)
    print locus, locustag, GeneFamily(gene), gene, SINfo[i]
  Ninfo = 0
  next
}

/^\/\// {
  locus = "?"
}
@ -0,0 +1,36 @@
#
# get fasta sequence from embl
#

/^ID / {
  locus = $2
  gsub(";", "", locus)
  next
}

/^SQ / {
  inseq = 1
  nln = 0
  delete seq
  next
}

/^\/\// {
  inseq = 0
  print ">" locus
  for (i = 1 ; i <= nln ; i++)
    print seq[i]
  next
}

inseq {
  s = $0
  gsub(" ", "", s)
  gsub("[0-9]+", "", s)
  seq[++nln] = s
  next
}
  

@ -0,0 +1,99 @@
#
# get cds features from genbank (long version)
#
# -v FASTA

# @include libgbk.awk

BEGIN {
  print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"

  if (HEADONLY != "") exit(0)
  
  if (MAXSPAN == "") MAXSPAN = 10000

  if (FASTA == "") Error("No FASTA file specified", 1)
  
  if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)

  Seq = tolower(ReadFasta(FASTA))
}

/^LOCUS/ {
  locus = $2
  incds = 0
  next
}

/^     CDS/ {
  revstrand = match($2, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  nexon = Nexons(s)
  SpanLocation(s, sloc)
  spanlen = sloc[2] - sloc[1] + 1
  len = LenLocation(s)
  ok = ok && (len < MAXSPAN)
  cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
  cstart = substr(cdsseq, 1,3)
  cstop  = substr(cdsseq, length(cdsseq)-2)

  gene = "none"
  locustag = "none"
  product = "none"
  translation = "X"
  incds = 1
  next
}

(incds && /^     [^ ]/) {
  print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
        nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
  incds = 0
  next
}

/^                     \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^                     \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^                     \/product=/ {
  split($0, a, "=")
  product = a[2]
  gsub("\"", "", product)
  gsub(" ", "_", product)
  next
}

/^                     \/translation=/ {
  split($0, a, "=")
  translation = a[2]
  gsub("\"", "", translation)
  gsub(" ", "", translation)
  next
}

/^\/\// {
  locus = "?"
}

END {
  if (incds) {
    print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
          nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
  }
}
@ -0,0 +1,97 @@
#
# get feature info from genbank
#

# @include libgbk.awk

function GC(s, _local_, i, len) {
  s = toupper(s)
  len = length(s)
  gsub("G|C", "", s)
  return ((len - length(s)) * 100 / len)
}

#
# rules
#

BEGIN {
  print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
}

/^LOCUS/ {
  locus = $2
  next
}

/^  ORGANISM/ {
  orga = substr($0, 13)
  split(orga, a, ";")
  orga = a[1]
  gsub(" ", "_", orga)
  next
}

/^     source/ {
  GetLoc($2, loc);
  len = loc[2];
  next
}

/^     CDS/ {
  meanCds = meanCds * nbCds + LenLocation($2)
  nbCds++
  meanCds /= nbCds
  n = Nexons($2)  
  if (n > 3) n = 3
  nbCdx[n]++
  next
}

/^     tRNA/ {
  nbTrna++
  next
}

/^     rRNA/ {
  nbRrna++
  next
}

/^     mRNA/ {
  next
}

/^     .*RNA/ {
  nbOrna++
  next
}

/^ORIGIN/ {
  inseq = 1
  seq = ""
  next
}

inseq && /^ +[1-9][0-9]*/ {
  s = substr($0, 11)
  gsub(" ", "", s)
  seq = seq "" s
  next
}

/^\/\// {
  oklen = (len == length(seq) ? "ok" : "wrong")
  gc = GC(seq)
  print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
         nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
         meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
  nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
  delete nbCdx
  orga = locus = "?"
}

  
@ -0,0 +1,96 @@
#
# get intron features from genbank
#

# @include libgbk.awk

BEGIN {
  print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"

  if (HEADONLY != "") exit(0)
  
  if (FASTA == "") Error("No FASTA file specified", 1)
  
  if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)

  Seq = tolower(ReadFasta(FASTA))
}

/^LOCUS/ {
  locus = $2
  next
}

/^     CDS/ {
  revstrand = match($2, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  if (! ok) next

  na = ParseLocation(s, locs)
  if (na < 2) next

  delete SINfo
  Ninfo = 0
  
  val = locs[1][1]
  for (i = 2 ; i <= na ; i++) {
    if (locs[i][1] < val) ok = 0
    val = locs[i][1]
  }
  if (! ok) next

  val = locs[1][2]
  for (i = 2 ; i <= na ; i++) {
    if (locs[i][2] < val) ok = 0
    val = locs[i][2]
  }
  if (! ok) next

  from = locs[1][2] + 1
  for (i = 2 ; i <= na ; i++) {
    to = locs[i][1] - 1
    inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
    SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
                     (revstrand ? na-i+1 : i-1) " " na-1 " "\
                     substr(inseq, 1,4) "."\
                     substr(inseq, 5,6) "-"\
                     substr(inseq, length(inseq)-9, 6) "."\
                     substr(inseq, length(inseq)-3, 4) " "\
                     "ok"
    from = locs[i][2] + 1
  }
  
  gene = "none"
  locustag = "none"
  next
}

/^                     \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^                     \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^                     \/translation=/ {
  for (i = 1 ; i <= Ninfo ; i++)
    print locus, locustag, GeneFamily(gene), gene, SINfo[i]
  Ninfo = 0
  next
}

/^\/\// {
  locus = "?"
}
@ -0,0 +1,32 @@
#
# get fasta sequence from genbank
#

/^LOCUS/ {
  locus = $2
  next
}

/^ORIGIN/ {
  inseq = 1
  nln = 0
  delete seq
}

inseq && /^ +[1-9][0-9]*/ {
  s = substr($0, 11)
  gsub(" ", "", s)
  seq[++nln] = s
  next
}

/^\/\// {
  print ">" locus
  for (i = 1 ; i <= nln ; i++)
    print seq[i]
}

  
@ -0,0 +1,31 @@
#!/usr/bin/env Rscript
#
# check and install required packages
#

out <- function(...) {
  cat(paste0('+ ', ..., '\n'), file=stderr())
}

installed <- function(package) {
  package %in% rownames(installed.packages())
}

check <- function(package, repos="http://cran.univ-lyon1.fr") {
  if (installed(package)) {
    out("R package ", package, " installed")
  } else {
    out("Installing R package ", package, " from ", repos)
    install.packages(package, repos=repos)
  }
  invisible(installed(package))
}

check("grid")
check("gridExtra")
check("vcd")
check("plotrix")
check("igraph")

quit(save='no', status=0)

@ -0,0 +1,224 @@
#!/usr/bin/env Rscript
#
# compute start, stop, splice-junctions models for core DB
#
# source("make.models.r")
#

LIB_DIR <- Sys.getenv("LIB_DIR")
if (LIB_DIR == "") LIB_DIR = "."

source(paste0(LIB_DIR, "/util.base.r"))
source(paste0(LIB_DIR, "/util.cons.r"))
source(paste0(LIB_DIR, "/util.modelio.r"))

# -------------------------------
# parameters
# -------------------------------

# core cutoffs

source("db.models.params.txt")

# -------------------------------
# genome infos
# -------------------------------

notify("loading info table")
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)

# -------------------------------
# CDS
# -------------------------------

notify("loading cds table")
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)

cds$start <- as.factor(cds$start)
cds$stop <- as.factor(cds$stop)

cds <- cds[cds$status=="Ok",]
cds <- cds[cds$genefam!="none",]

cds$categ <- "dust"

x <- sort(table(cds$genefam), dec=T)
ok <- names(x[x >= CORE_NCDS_CUTOFF])

cds$categ[cds$genefam %in% ok] <- "core"

x <- x[! names(x) %in% ok]
ok <- names(x[x >= SHEL_NCDS_CUTOFF])

cds$categ[cds$genefam %in% ok] <- "shell"

#

cds.ori <- cds

cds.lst <- split(cds.ori, cds.ori$categ)

#
# write out families 
#

# patterns & names

invisible(lapply(cds.lst, function(cds) {

  x <- sort(table(cds$genefam), decreasing=T)
  tab <- paste0("^", names(x), "$")
  names(tab) <- names(x)

  y <- sapply(split(cds$gene, cds$genefam), function(g) {
    head(names(sort(table(g), decreasing=T)), 1)
  })
  
  tab <- cbind(tab, y[names(x)])

  categ <- unique(cds$categ)
  f <- paste0("db.", categ, ".pat.txt")
  notify("writing patterns for", categ, ":", f)
  write.table(tab, file=f, quote=F, col.names=F, row.names=T)
}))

# -------------------------------
# Start models (core only)
# -------------------------------

if (! "core" %in% names(cds.lst)) {
  notify("*** no gene found in core")
  notify("*** please change parameters")
  quit(save='no', status=1)
}

cds <- cds.lst[["core"]]

#
# start by genes
#

tab <- split(cds$start, cds$genefam)

fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(tab)

start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))

tab <- cds[cds$genefam %in% start.dft,]
tab <- table(tab$start)

# default model

x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
write.model.start(x, "default")

# gene specific models

invisible(sapply(start.spc, function(g) {
  x <- cds[cds$genefam == g,]
  tx <- table(x$start)
  tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
  write.model.start(tx, g)
}))

# -------------------------------
# Stop models (core only)
# -------------------------------

# write default stop model

tab <- table(cds$stop)
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
write.model.stop(x, "default")

# -------------------------------
# splice junctions
# -------------------------------

notify("loading intron table")
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)

# remove invalid sequences

intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)

lseq <- nchar(gsub("[^acgt]", "", intron$seq))

intron <- intron[lseq == 20,]

# remove genes out of core

intron <- intron[intron$genefam %in% cds$genefam,]

# acceptors / donors

intron$acc <- substr(intron$seq, 5, 6)
intron$don <- substr(intron$seq, 15, 16)

# consensus

cons.px <- cons.build(intron$acceptor.donor)
cons.px <- cons.px[,! is.nan(colSums(cons.px))]

seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))

conf.px <- cons.confusion(cons.px, seq.px)

sfam <- split(conf.px$l2scor, intron$genefam)
sfam <- sfam[order(sapply(sfam, median))]

# extract splice exceptions

name.bad  <- names(which(sapply(sfam, median) < 0))
name.spc  <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
name.ok   <- setdiff(unique(intron$genefam), name.bad)
name.bad  <- setdiff(name.bad, name.spc)
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))

cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))

# write junction models

invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))

# use uniform model for bad guys

invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
  
invisible(write.unif.splice('', "none"))

# -------------------------------
# keep data for plotting
# -------------------------------

DB <- list()

params <- list()

params$CORE_NCDS_CUTOFF      <- CORE_NCDS_CUTOFF
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
params$CORE_STOP_CUTOFF      <- CORE_STOP_CUTOFF
params$CORE_SPLICE_CUTOFF    <- CORE_SPLICE_CUTOFF

params$SHEL_NCDS_CUTOFF      <- SHEL_NCDS_CUTOFF

DB$params  <- params
DB$chromo  <- chromo
DB$cds.lst <- cds.lst
DB$intron  <- intron
DB$cons    <- cons

notify("saving db.data.Rdata")
save(DB, file="db.data.Rdata")

# -------------------------------
# end
# -------------------------------

quit(save='no')
@ -0,0 +1,424 @@
#!/usr/bin/env Rscript
#
# plots models previously computed by make.models.r
#
# source("plot.models.r")
#

require(vcd)
require(plotrix)

LIBDIR <- Sys.getenv("LIB_DIR")
if (LIBDIR == "") LIBDIR = "."

source(paste0(LIBDIR, "/util.base.r"))
source(paste0(LIBDIR, "/util.plot.r"))
source(paste0(LIBDIR, "/util.cons.r"))
source(paste0(LIBDIR, "/util.grid.r"))

# -------------------------------
# setup
# -------------------------------

OUT.DEV <- TRUE
OUT.TYPE <- "pdf"
OUT.FILE <- "models"

if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)

# -------------------------------
# Load data
# -------------------------------

notify("loading DB data")
load("db.data.Rdata")

params  <- DB$params
chromo  <- DB$chromo
cds.lst <- DB$cds.lst
intron  <- DB$intron
cons    <- DB$cons

# -------------------------------
# Genomes infos
# -------------------------------

grd.titlepage("Species")
grd.textpage(lineno=1, "# org: ", nrow(chromo))

#
# general stats
#

grd.hist(chromo, "len", main="Histogram of chromosome length")
grd.hist(chromo, "gc", pos.quant=c(0.75, 0.6), main="Histogram of chromosome GC")
grd.hist(chromo, "nbCds")
grd.fplot(chromo, "len", "nbCds")

#
# nb cds no introns
#

chromo$nbCds_Mono <- chromo$nbCds_int0
chromo$nbCds_Poly <- chromo$nbCds_int1 + chromo$nbCds_intsup1
chromo$percentPoly <- round(chromo$nbCds_Poly * 100 / (chromo$nbCds_Poly + chromo$nbCds_Mono))

grd.hist(chromo, "nbCds_Mono", main="Histogram of monoexonic Cds")
grd.hist(chromo, "nbCds_Poly", main="Histogram of polyexonic Cds")
grd.hist(chromo, "percentPoly", pos.sum=c(0.23,0.6), main="Histogram of % polyexonic")

grd.fplot(chromo, "nbCds", "nbCds_Mono", TRUE, ablin=list(a=0, b=1, col=3))
grd.fplot(chromo, "nbCds", "nbCds_Poly")

#
# cds size
#

grd.hist(chromo, "meanCdsSize", pos.quant=NULL, main="Histogram of Cds size")

# -------------------------------
# CDS
# -------------------------------

cds.all <- do.call(rbind, cds.lst)

grd.titlepage("CDS")

grd.textpage(lineno=1, "# core cds core cutoff: ", params$CORE_NCDS_CUTOFF)
grd.textpage(lineno=2, "# core cds shell cutoff: ", params$SHEL_NCDS_CUTOFF)

grd.textpage(lineno=4, "# total cds: ", nrow(cds.all))
grd.textpage(lineno=5, "# core cds: ", nrow(cds.lst[["core"]]))
grd.textpage(lineno=6, "# shell cds: ", nrow(cds.lst[["shell"]]))
grd.textpage(lineno=7, "# dust cds: ", nrow(cds.lst[["dust"]]))

grd.textpage(lineno=9,  "# total org: ", length(unique(cds.all$X.locus)))
grd.textpage(lineno=10, "# core org: ",  length(unique(cds.lst[["core"]]$X.locus)))
grd.textpage(lineno=11, "# shell org: ", length(unique(cds.lst[["shell"]]$X.locus)))
grd.textpage(lineno=12, "# dust org: ",  length(unique(cds.lst[["dust"]]$X.locus)))


grd.textpage(lineno=14, "# total families: ", length(unique(cds.all$genefam)))
grd.textpage(lineno=15, "# core families: ",  length(unique(cds.lst[["core"]]$genefam)))
grd.textpage(lineno=16, "# shell families: ", length(unique(cds.lst[["shell"]]$genefam)))
grd.textpage(lineno=17, "# dust families: ",  length(unique(cds.lst[["dust"]]$genefam)))

uplot.setup(mfrow=c(2,2), xpd=NA)

x <- sapply(cds.lst, nrow)
uplot.pie(x, main="CDS", text.r=1.1, col=c(3,2,4))

x <- sapply(cds.lst, function(cds) length(unique(cds$X.locus)))
uplot.pie(x, main="ORG", text.r=1.1, col=c(3,2,4))

x <- sapply(cds.lst, function(cds) length(unique(cds$genefam)))
uplot.pie(x, main="FAM", text.r=1.1, col=c(3,2,4))

uplot.setup(xpd=F)

#
# plot genes cutoff
#

cds.all <- do.call(rbind, cds.lst)
cds.byfam <- split(cds.all, cds.all$genefam)

tab <- sort(sapply(cds.byfam, nrow), decreasing=T)
cols <- rep("red", length(tab))
cols[tab >= params$SHEL_NCDS_CUTOFF] <- "blue"
cols[tab >= params$CORE_NCDS_CUTOFF] <- "green"
barplot(tab, col=cols, border=NA, main="# genes")

cols <- cols[tab >= 50]
tab <- tab[tab >= 50]
barplot(tab, col=cols, border=NA, las=2, cex.names=0.5, main="# genes in core")
abline(h=params$CORE_NCDS_CUTOFF, col=1)
text(50, 200, "CORE_NCDS_CUTOFF", pos=3)

#
# cds length for core
#

invisible(sapply(c("core", "shell", "dust"), function(what) {

  cds <- cds.lst[[what]]

  x <- split(cds$length, cds$genefam)
  x <- x[order(sapply(x, mean))]
  
  uplot.setup(mfrow=c(2,1))
  
  boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
          las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution"))
  
  boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1), ylim=c(0,2000),
          las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution zoom"))
  
  uplot.setup()
}))

# -------------------------------
# starts & stops
# -------------------------------

cds <- cds.lst[["core"]]

grd.titlepage("Starts and Stops")

tab <- sort(table(cds$start), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100

barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="start frequencies (%)")
text(0.7, 50, round(tab[1], 2))
text(1.9, 2.3, round(tab[2], 2))
text(3.1, 2.3, round(tab[3], 2))

#
# start by org and gc
#

x <- split(cds$start, cds$X.locus)

fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(x)
chromo$fatg <- round(fatg[chromo$X.locus], 2)

fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
names(fgtg) <- names(x)
chromo$fgtg <- round(fgtg[chromo$X.locus], 2)

facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
names(facg) <- names(x)
chromo$facg <- round(facg[chromo$X.locus], 2)

grd.hist(chromo, "fatg", pos.quant=c(0.5, 0.6), main="Histogram of atg freq. by org")
grd.hist(chromo, "fgtg", main="Histogram of gtg freq. by org")
grd.hist(chromo, "facg", pos.sum=c(0.3, 0.6), main="Histogram of acg freq. by org")

grd.fplot(chromo, "gc", "fatg", main="atg freq. by org GC", pos=c(0.2, 0.3))
grd.fplot(chromo, "gc", "fgtg", main="gtg freq. by org GC")
grd.fplot(chromo, "gc", "facg", main="acg freq. by org GC")

ter <- cbind(fatg, fgtg, facg)
colnames(ter) <- c("ATG", "GTG", "ACG")
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
cols <- rainbow(10)[igc]
ternaryplot(ter, col=cols, cex=0.2, main="Start by org", labels="outside")

#
# start by common genes
#

x <- split(cds$start, cds$genefam)

fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(x)

fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
names(fgtg) <- names(x)

facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
names(facg) <- names(x)

barplot(sort(fatg)[1:10], las=2, main="atg freq. by gene")
barplot(sort(fgtg, dec=T)[1:10], las=2, main="gtg freq. by gene")
barplot(sort(facg, dec=T)[1:10], las=2, main="acg freq. by gene")

ter <- cbind(fatg, fgtg, facg)
colnames(ter) <- c("ATG", "GTG", "ACG")
ternaryplot(ter, col=1, cex=0.5, id=rownames(ter), main="Starts by genes", labels="outside")

# -------------------------------
# stops
# -------------------------------

#
# stop by org and gc
#

tab <- sort(table(cds$stop), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100

barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="stop frequencies (%)")
text(0.7, 80, round(tab[1], 2))
text(1.9, 30, round(tab[2], 2))
text(3.1, 25, round(tab[3], 2))

x <- split(cds$stop, cds$X.locus)

ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
names(ftaa) <- names(x)
chromo$ftaa <- round(ftaa[chromo$X.locus], 2)

ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
names(ftag) <- names(x)
chromo$ftag <- round(ftag[chromo$X.locus], 2)

ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
names(ftga) <- names(x)
chromo$ftga <- round(ftga[chromo$X.locus], 2)

grd.hist(chromo, "ftaa", pos.quant=c(0.7, 0.6), main="Histogram of taa freq. by org")
grd.hist(chromo, "ftag", pos.quant=c(0.8, 0.6), main="Histogram of tag freq. by org")
grd.hist(chromo, "ftga", pos.quant=c(0.8, 0.6), main="Histogram of tga freq. by org")

grd.fplot(chromo, "gc", "ftaa", main="taa freq. by org GC", pos=c(0.2, 0.3))
grd.fplot(chromo, "gc", "ftag", main="tag freq. by org GC")
grd.fplot(chromo, "gc", "ftga", main="tga freq. by org GC")

ter <- cbind(ftaa, ftag, ftga)
colnames(ter) <- c("TAA", "TAG", "TGA")
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
cols <- rainbow(10)[igc]
ternaryplot(ter, col=cols, cex=0.2, main="Stops by org", labels="outside")

#
# stop by common genes
#

x <- split(cds$stop, cds$genefam)

ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
names(ftaa) <- names(x)

ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
names(ftag) <- names(x)

ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
names(ftga) <- names(x)

barplot(sort(ftaa), las=2, cex.names=0.5, ylim=c(0,100), main="taa freq. by gene")
barplot(sort(ftag), las=2, cex.names=0.5, ylim=c(0,100), main="tag freq. by gene")
barplot(sort(ftga), las=2, cex.names=0.5, ylim=c(0,100), main="tga freq. by gene")

ter <- cbind(ftaa, ftag, ftga)
colnames(ter) <- c("TAA", "TAG", "TGA")
ternaryplot(ter, col=1, cex=0.3, id=rownames(ter), main="Stops by genes", labels="outside")

# -------------------------------
# splice junctions
# -------------------------------

grd.titlepage("Splice Junctions")

grd.textpage(lineno=1, "# intron in core: ", nrow(intron))

#
# intron size
#

intron$size <- intron$to - intron$from + 1

grd.hist(intron, "size", pos.quant=NULL, main="Histogram of intron size", br=1000, xlim=c(0,2000))

#
# nb intron / gene
#

x <- split(intron, intron$genefam)
x <- x[order(sapply(x, function(x) mean(x$intron_nb)), decreasing=T)]

nmax <- max(intron$intron_nb)
lintron <- lapply(x, function(x) x$intron_nb)
mintron <- sapply(lintron, function(x) table(factor(x, levels=1:nmax)))

lintron0 <- table(cds[cds$nexon == 1,"genefam"])[names(lintron)]
mintron <- rbind("0"=lintron0, mintron)
mintron <- t(t(mintron)/colSums(mintron))

mintron[mintron==0] <- NA

nn <- nrow(mintron)
xx <- mintron[nn:1,]
ll <- lapply(1:nn, function(i) xx[i,])
mintron <- mintron[,do.call(order, c(ll, decreasing=T))]

battleship.plot(mintron, maxxspan=0.3, maxyspan=0.3, 
                cex.labels=0.7,
                main="% intron per polyexonic gene")

#
# acceptors / donors
#

tab <- sort(table(intron$acc), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100

barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="acceptor frequencies (%)")
text(0.7, 50, round(tab[1], 2))
text(1.9, 3, round(tab[2], 2))
text(3.1, 2.3, round(tab[3], 2))

tab <- sort(table(intron$don), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100

barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="donor frequencies (%)")
text(0.7, 50, round(tab[1], 1))
text(1.9, 40, round(tab[2], 1))
text(3.1, 15, round(tab[3], 1))
text(4.3, 12, round(tab[4], 1))

#
# consensus all
#

cons$all <- cons.build(intron$acceptor.donor)

invisible(sapply(rev(names(cons)), function(what) {
  cons.plot(cons[[what]], paste0("consensus ", what))
}))

#
# default consensus score by consensus length
#

cons.def <- cons[["default"]]
cons.def <- cons.def[,! is.nan(colSums(cons.def))]
seq.def <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))

epx <- apply(cons.def, 2, function(col) -sum(col * log(col, base=4)))
opx <- order(epx)

conf.def <- lapply(seq(2, length(opx), by=2), function(n) {
         pos <- head(opx, n)
         notify(n, "/", length(opx))
         cons.confusion(cons.def, seq.def, thresh=0, pos=pos)
})

acc <- sapply(conf.def, function(x) x$acc)
sen <- sapply(conf.def, function(x) x$sen)
sel <- sapply(conf.def, function(x) x$sel)

plot(sel, ylim=c(0.7, 1), pch=1, type="b", main="accuracy by nb consensus positions", ylab="")
lines(sen, type="b", pch=2)
lines(acc, type="b", pch=3)
legend(1, 0.95, c("sensit.", "select.", "accur."), pch=1:3, horiz=T, bty="n")

#
# default consensus score by genes
#

conf.def <- cons.confusion(cons.def, seq.def, thresh=0)
cons.histconf(conf.def)

sfam <- split(conf.def$l2scor, intron$genefam)
sfam <- sfam[order(sapply(sfam, median))]

boxplot(sfam, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
        las=2, cex.axis=0.7, main="default junction logr score by genes")
abline(h=0)

#
# end
#

if (OUT.DEV) {
  cat("+ plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
  invisible(dev.off())
}

quit(save='no')
@ -0,0 +1,71 @@
#!/usr/bin/env Rscript
#
# plot summary graphics of comparisons
#
#

LIBDIR <- Sys.getenv("LIB_DIR")
if (LIBDIR == "") LIBDIR = "."

source(paste0(LIBDIR, "/util.plot.r"))

COLORS <- 2:10

#

OUT.DEV <- TRUE
OUT.TYPE <- "pdf"
OUT.FILE <- "compare"
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)

#

tab <- read.table("compare.txt", header=T, comment.char="", stringsAsFactors=F)

#

par(xpd=NA)

#

sel  <- c("cor", "alcor", "acc", "wrong", "over", "misstot")
tab$ptot <- rowSums(tab[,sel])
for (s in sel)
  tab[,paste0("p", s)] <- tab[,s] * 100 / tab$ptot

colors <- head(COLORS, length(sel))

cols <- paste0("p", sel)
ord  <- order(tab$pcor+tab$palcor+tab$pacc, decreasing=T)

barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord], 
        ylim=c(0,100), col=colors, las=2, cex.names=0.5)

legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)

#

sel  <- c("cor", "alcor", "acc", "wrong", "over", "misschlo")
tab$rtot <- rowSums(tab[,sel])
for (s in sel)
  tab[,paste0("r", s)] <- tab[,s] * 100 / tab$rtot

colors <- head(COLORS, length(sel))

cols <- paste0("r", sel)
ord  <- order(tab$rcor+tab$ralcor+tab$racc, decreasing=T)

barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord], 
        ylim=c(0,100), col=colors, las=2, cex.names=0.5)

legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)

#

if (OUT.DEV) {
  cat("# plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
  invisible(dev.off())
}

quit(save='no')

@ -0,0 +1,54 @@
#
#

function getOrg(s, _local_, a, na, org) {
  na = split(s, a, "/")
  na = split(a[na], a, "\\.")
  return a[1]  
}


BEGIN {
  PROCINFO["sorted_in"] = "@ind_num_asc"
  print "#org tot cor alcor acc wrong over misstot misschlo missoth"
}

/MISSED in ChloroDB/ {
  org = getOrg($1)
  Org[org]++
  Cnt[org]["MISSCHLORO"] = $2
  next
}

/MISSED not in ChloroDB/ {
  org = getOrg($1)
  Org[org]++
  Cnt[org]["MISSNOTCHLORO"] = $2
  next
}

/^#/ { next }

/^.*:MATCH/ {
  org = getOrg($1)
  Org[org]++
  split($NF, a, "\\.")
  Cnt[org][a[1]]++
}

END {
  for (org in Org) {
    Cnt[org]["TOTAL"] =   Cnt[org]["CORRECT"] + Cnt[org]["ALMOST_CORRECT"] \
                        + Cnt[org]["ACCEPTABLE"] + Cnt[org]["WRONG"]       \
                        + Cnt[org]["MISSED"]
  }
  for (org in Org) {
    print org, Cnt[org]["TOTAL"]+0, Cnt[org]["CORRECT"]+0,          \
          Cnt[org]["ALMOST_CORRECT"]+0, Cnt[org]["ACCEPTABLE"]+0,   \
          Cnt[org]["WRONG"]+0, Cnt[org]["OVERPRED"]+0,              \
          Cnt[org]["MISSED"]+0,                                     \
          Cnt[org]["MISSCHLORO"]+0, Cnt[org]["MISSNOTCHLORO"]+0
          
  }

}
@ -0,0 +1,12 @@
#
# R basic utilities
#

#
# notify on stderr
#

notify <- function(...) cat("+", ..., "\n")


@ -0,0 +1,109 @@
#
# R consensus utilities
#

#
# compute consensus
#

cons.build <- function(seqs, backcount=1) {
  xx <- do.call(rbind, sapply(seqs, strsplit, "", USE.NAMES=F))
  lv <- c("a", "c", "g", "t", ".", "-")
  mx <- apply(xx, 2, function(x) table(factor(x, levels=lv)))[1:4,]
  cx <- colSums(mx)
  mx <- mx + backcount
  mx[,cx==0] <- 0
  apply(mx, 2, function(x) x / sum(x))
}

#
# score consensus
#
  
cons.score <- function(cons, seq, pos=1:ncol(cons)) {
   seq <- strsplit(seq, "")[[1]]
   if (length(seq) != ncol(cons)) {
     warning("incompatible seq and cons size")
     return(NA)
   }
   ppx <- sapply(pos, function(i) cons[seq[i],i])
   sum(log10(ppx+1e-6))
}

#
# logratio to uniform model score
#

cons.logratio <- function(cons, seq, m0=NULL, pos=1:ncol(cons)) {
  if (is.null(m0)) {
    m0 <- matrix(rep(0.25, 4), nrow=4, ncol=ncol(cons))
    rownames(m0) <- c('a', 'c', 'g', 't')
  }
  
  sc <- cons.score(cons, seq, pos=pos)
  sc0 <- cons.score(m0, seq, pos=pos)
  
  2 * (log(10^sc, base=2) - log(10^sc0, base=2))
}

#
# shuffle sequence
#

seq.shuf <- function(seq) {
  paste0(sample(strsplit(seq, "")[[1]], nchar(seq), replace=F), collapse="")
}

#
# compute confusion matrix between actual and shuffled sequences
#

cons.confusion <- function(cons, seq, m0=NULL, pos=1:ncol(cons), thresh=0) {
  som <- function(x) sum(x, na.rm=T)

  res <- list()
  res$l2scor <- l2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))

  seq <- sapply(seq, seq.shuf)
  res$r2scor <- r2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))

  res$conf <- conf <- matrix(c(som(l2scor >= thresh), som(l2scor < thresh),
                               som(r2scor >= thresh), som(r2scor < thresh)),
                             nrow=2, byrow=T)

  res$acc <- sum(diag(conf)) / sum(conf)
  res$sen <- conf[1,1] / sum(conf[1,])
  res$sel <- conf[1,1] / sum(conf[,1])

  res  
}

#
# plot consensus
#


cons.plot <- function(cons, main="consensus") {
  cols <- c("blue", "orange", "red", "green")
  bp <- barplot(cons, col=cols, ylim=c(0,1), main=main)
  plx <- apply(cons, 2, function(col) -sum(col * log(col+1e-6, base=4)))
  lines(bp, plx, type="b", pch=19)
  legend(0, 1.1, c("a","c","g","t"), fill=cols, horiz=T, xpd=NA, bty="n")
  legend(20, 1.1, "entropy", pch=19, horiz=T, xpd=NA, bty="n")
  invisible()
}

#
# plot confusion scores histograms
#

cons.histconf <- function(conf, main="junction logr score") {
  lrh <- hist(c(conf$l2scor, conf$r2scor), br=50, plot=F)
  lh <- hist(conf$l2scor, br=lrh$breaks, plot=F)
  rh <- hist(conf$r2scor, br=lrh$breaks, plot=F)
  xx <- rbind(lh$counts, rh$counts) / sum(lh$counts)
  colnames(xx) <- lrh$mids
  barplot(xx, col=c(3,2), beside=T, main=main)
  legend(0, 0.1, c("true", "shuffled"), fill=c(3,2), horiz=F, xpd=NA)
  invisible()
}
@ -0,0 +1,93 @@
#
# R misc grid plotting
#

require(grid)
require(gridExtra)

#
# get line height
#

grd.lineheight <- function(s="X") {
  convertHeight(unit(1,"strheight", s), "native", valueOnly=T)
}

#
# quantile table
#

grd.qtab <- function(df, what, cols, n=5) {
  df <- df[order(df[,what], decreasing=T),cols]
  sep <- head(df,1)
  sep[] <- "-"
  rbind(head(df, n), sep, tail(df, n))
}

#
# histogram with tables
#

grd.hist <- function(df, what, cols = c(1,2, which(colnames(df) == what)),
                      breaks=50, pos.sum=c(0.2,0.6), pos.quant=c(0.7,0.6), cex=0.7,
                      main=paste0("Histogram of ", what), ...) {
  hist(df[,what], breaks=breaks, xlab=what, main=main, ...)
  if (! is.null(pos.sum)) {
    pushViewport(viewport(pos.sum[1], pos.sum[2], gp=gpar(cex=cex)))
    grid.table(x<-summary(df[,what]), rows=names(x))
    popViewport()
  }
  if (! is.null(pos.quant)) {
    pushViewport(viewport(pos.quant[1], pos.quant[2], gp=gpar(cex=cex)))
    grid.table(grd.qtab(df, what, cols), rows=NULL)
    popViewport()
  }
  invisible()  
}

#
# plot with fit
#

grd.fplot <- function(df, what.x, what.y, linfit=T, pos=c(0.2, 0.8), ablin=NULL, ...) {
  plot(df[,what.x], df[,what.y], xlab=what.x, ylab=what.y, ...)
  if (linfit) {
    fit <- lm(df[,what.y] ~ df[,what.x])
    abline(fit, col=2)
    pushViewport(viewport(gp=gpar(col=2)))
    a <- sprintf("%.2e", coef(fit)[2])
    b <- sprintf("%.2e", coef(fit)[1])
    grid.text(paste0(what.y, " = ", a, " * ", what.x, " + ", b),
              pos[1], pos[2], just="left")
    pos[2] = pos[2] - 2 * grd.lineheight()
    grid.text(paste0("R2=", round(summary(fit)$r.squared, 3)),
              pos[1], pos[2], just="left")
    popViewport()
  }
  if (! is.null(ablin))
    do.call(abline, ablin)
  invisible()  
}

#
# write text
#

grd.textpage <- function(..., lineno=0, left=0.1, top=0.9, cex=1, fact=1.4) {
  txt <- do.call(paste0, list(...))
  pushViewport(viewport(gp=gpar(cex=cex)))
  grid.text(txt, left, top-lineno*grd.lineheight()*fact, just="left")
  popViewport()
  invisible(txt)
}

#
# title page
#

grd.titlepage <- function(title, x=0.5, y=0.7, cex=3, ...) {
  notify("processing", title)
  grid.newpage()
  grid.text(title, x, y, gp=gpar(cex=cex), ...)
  invisible()  
}
@ -0,0 +1,90 @@
#
# R models I/O utilities
#

#
# write start model
#

write.model.start <- function(frq, what) {
  dir.create("models", showWarnings=F)
  fil <- paste0("models/start.", what, ".frq")
  notify("writing start model:", fil)
  cat("# start model :", what, "\n", file=fil)
  for (x in names(frq))
    cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
  invisible(fil)
}

#
# write stop model
#

write.model.stop <- function(frq, what) {
  dir.create("models", showWarnings=F)
  fil <- paste0("models/stop.", what, ".frq")
  notify("writing stop model:", fil)
  cat("# stop model :", what, "(freq. ignored)\n", file=fil)
  for (x in names(frq))
    cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
  invisible(fil)
}

#
# write splice3 model
# [FIXME] positions are hard-coded
#

write.model.splice3 <- function(cons, what) {
  dir.create("models", showWarnings=F)
  fil <- paste0("models/splice3.", what, ".frq")
  notify("writing splice3 model:", fil)
  .catcons <- function(i) {
     cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
         file=fil, append=T)
  }
  cat("# 3' splice model :", what, "\n", file=fil)
  cat("# A C G T\n", file=fil, append=T)
  sapply(seq.int(1, 4), .catcons)
  cat("splice\n", file=fil, append=T)
  sapply(seq.int(6, 11), .catcons)
  invisible(fil)
}

#
# write splice5 model
# [FIXME] positions are hard-coded
#

write.model.splice5 <- function(cons, what) {
  dir.create("models", showWarnings=F)
  fil <- paste0("models/splice5.", what, ".frq")
  notify("writing splice5 model:", fil)
  .catcons <- function(i) {
    cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
        file=fil, append=T)
  }
  cat("# 5' splice model :", what, "\n", file=fil)
  cat("# A C G T\n", file=fil, append=T)
  sapply(seq.int(13, 18), .catcons)
  cat("splice\n", file=fil, append=T)
  sapply(seq.int(20, 23), .catcons)
  invisible(fil)
}

#
# write splice3/5 uniform model
#

write.unif.splice <- function(pos, what) {
  dir.create("models", showWarnings=F)
  fil <- paste0("models/splice", pos, ".", what, ".frq")
  notify("writing uniform splice", pos, "model:", fil)
  cat("# 3'/5' splice null model", file=fil)
  cat("# A C G T\n", file=fil, append=T)
  cat("25 25 25 25\n", file=fil, append=T)
  cat("splice\n", file=fil, append=T)
  cat("25 25 25 25\n", file=fil, append=T)
  invisible(fil)
}

@ -0,0 +1,105 @@
#
# R plot utilities
#

#
# setup graphic device
# tiff: high resolution 600 dpi
# pdf
#

uplot.init.dev <- function(fname, type="pdf", width=7, height=7, resol=600, ...) {
  fname <- paste0(fname, ".", type)
  res <- NULL
  if (type == "tiff") {
    res <- tiff(fname, width=width, height=height, units="in", res=resol, ...)
  }
  if (type == "pdf") {
    res <- pdf(fname, width=width, height=height, ...)
  }
  invisible(res)
}

#
# convert pdf to tiff using ghostscript
#

uplot.convert2tiff <- function(fname, resol=600) {
  infile <- paste0(fname, ".pdf")
  oufile <- paste0(fname, ".tif")
  cmd <- paste0("echo quit | gs -r", resol, "-dBATCH -dNOPAUSE -sDEVICE=tiff12nc -sCompression=lzw -sOutputFile=", oufile, " ", infile)
  system(cmd)
}

#
# default plot setup
#

uplot.setup <- function(mfrow=c(1,1),
                       las=1,
                       mgp=c(2, 0.7, 0),
                       oma=c(0, 0, 0, 0),
                       mar=c(4, 3, 3, 2),
                       cex.main=1,
                       font.main=1,
                       family='Helvetica', ...) {
  par(mfrow=mfrow, las=las, mgp=mgp, oma=oma, mar=mar, cex.main=cex.main, font.main=font.main, family=family, ...)
}

#
# pie plot
#

uplot.pie <- function(tab, main="", labels=c("name", "val", "per"), text.r=0.5, text.col="black", text.cex=1, main.pos=c(0,0), main.col="black", ...) {
  pie(tab, edges=2000, main="", labels="", ...)
  text(main.pos[1], main.pos[2], main, cex=1.5, col=main.col)
  prop <- tab/sum(tab)
  theta <- 2*pi * (cumsum(prop) - prop/2)
  lab <- list(name=names(tab), val=tab, per=sprintf("%d%%", round(prop*100)))
  lab <- apply(data.frame(lab[labels]), 1, paste, collapse="\n") 
  if (length(lab) > 0)
    text(text.r*cos(theta), text.r*sin(theta), lab, cex=text.cex, col=text.col)
  invisible(NULL)
}

#
# plot utility : color representation of a table
#

uplot.table <- function(tab, col=heat.colors(100), with.lines=TRUE) {
  image(as.matrix(tab), xaxt="n", yaxt="n", col=col)
  nli <- nrow(tab)
  nco <- ncol(tab)
  dx  <- 0.5 / (nli-1)
  dy  <- 0.5 / (nco-1)
  xf <- (seq_len(nli)-1)/(nli-1) - dx
  yf <- (seq_len(nco)-1)/(nco-1) - dy
  if (with.lines) {
    segments(xf, -dy, xf, 1+dy)
    segments(-dx, yf, 1+dx, yf)
  }
  Axis(c(0,1), at=xf+dx, side=1, labels=rownames(tab), las=2, cex.axis=0.5, padj=0)
  Axis(c(0,1), at=yf+dy, side=2, labels=colnames(tab), las=2, cex.axis=0.5, padj=0)
  invisible(NULL)
}

#
# plot utility : identify points within user's rectangle
#

rect.identify <- function(data) {
  if (is.null(dim(data))) data <- cbind(seq_along(data), data)
  xy <- locator(n=2, type='n')
  r <- matrix(c(range(xy$x), range(xy$y)), ncol=2, byrow=TRUE)
  rect(r[1,1], r[2,1], r[1,2], r[2,2], border='red')
  .in.range <- function(p, r) {
    .in.int <- function(i) {
      (p[i] >= r[i,1]) && (p[i] <= r[i,2])
    }
    .in.int(1) && .in.int(2)
  }
  isel <- which(apply(data, 1, .in.range, r))
  points(data[isel,], col='red', pch=19)
  isel
}