e4d6a8484da5a00bbbbbce0d27957f2fb9ba0ed9

@ -0,0 +1,254 @@
#!/bin/csh -f
#
# make ChloroDB's
#
# usage: copy genbank/embl files into 'DB_DIR/download'
# usage: [create a parameters.sh file in 'DB_DIR']
# usage: go_chlorodb [DB_DIR]
#
# Clear the ORG_SOURCED marker before sourcing the init script
# (presumably forces csh_init.sh to redo its full setup - TODO confirm).
unsetenv ORG_SOURCED

# ORG_HOME is located relative to this script, four directory levels up.
setenv ORG_HOME `dirname $0`/../../../..
source $ORG_HOME/scripts/csh_init.sh

#
# which DB to process
#

set DB_BASE = $DATA_DIR/cds/chlorodb  # default location

# An optional first argument overrides the default DB directory.
# NOTE(review): Argv and Shift are not csh builtins; presumably they are
# provided by csh_init.sh - confirm.
if ($#Argv > 0) then
  set DB_BASE = $Argv[1]; Shift
endif

# Turn DB_BASE into an absolute physical path (pwd -P resolves symlinks).
set DB_BASE = `cd $DB_BASE && pwd -P`

# NeedDir is an external helper; presumably it aborts when the download
# directory is missing - TODO confirm.
NeedDir $DB_BASE/download

if (! -d $DB_BASE/info)  mkdir $DB_BASE/info
if (! -d $DB_BASE/fasta) mkdir $DB_BASE/fasta

# All db.* files written by the following steps are relative to info.
cd $DB_BASE/info

#
# params
#

# Generate a default parameters.sh on first run, then source it.
# The cutoffs are derived from n, the number of regular files located
# directly in the download directory:
#   cor_cutoff = n / 2, atg_cutoff = n / 10, dbs_cutoff = n / 4
# (integer division, each one forced to at least 1).
# An existing parameters.sh is never overwritten, so it can be hand-edited.
if (! -e $DB_BASE/parameters.sh) then
  # -mindepth 1 -maxdepth 1 selects the direct children only; unlike the
  # BSD-only -depth 1 primary it is accepted by both BSD and GNU find.
  @ n = `find $DB_BASE/download -mindepth 1 -maxdepth 1 -type f -print | wc -l`
  @ cor_cutoff = $n / 2
  @ atg_cutoff = $n / 10
  @ dbs_cutoff = $n / 4
  if ($cor_cutoff == 0) @ cor_cutoff = 1
  if ($atg_cutoff == 0) @ atg_cutoff = 1
  if ($dbs_cutoff == 0) @ dbs_cutoff = 1
  echo "# sourced file"                          >  $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set CORE_NCDS_CUTOFF      = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_ATG_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_DFT_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_OTH_CUTOFF = 10"          >> $DB_BASE/parameters.sh
  echo "set CORE_STOP_CUTOFF      = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_SPLICE_CUTOFF    = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set SHEL_NCDS_CUTOFF      = 10"          >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set CORE_DELTA            = Inf"         >> $DB_BASE/parameters.sh
  echo "set CORE_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set CORE_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set CORE_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set CORE_SIZMIN           = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set SHEL_DELTA            = 0.5"         >> $DB_BASE/parameters.sh
  echo "set SHEL_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set SHEL_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set SHEL_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set SHEL_SIZMIN           = $dbs_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set DUST_DELTA            = 0.5"         >> $DB_BASE/parameters.sh
  echo "set DUST_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set DUST_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set DUST_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set DUST_SIZMIN           = 10"          >> $DB_BASE/parameters.sh
endif

# Defines the CORE_*, SHEL_* and DUST_* variables used by the next steps.
source $DB_BASE/parameters.sh

##set CMIN_COD = 0
##set FMIN_COD = 0.01

#
# temporarily uncompress
#

# Gunzip in place every .gz file located directly in the download
# directory (they are compressed again at the end of this script).
# -mindepth 1 -maxdepth 1 replaces the BSD-only -depth 1 primary, which
# GNU find rejects; both select the direct children only.
set ff = `find $DB_BASE/download -mindepth 1 -maxdepth 1 -name \*.gz -print`

if ($#ff != 0) then
  Notify "uncompressing $#ff entries"
  foreach f ($ff)
    gunzip -f $f
  end
endif

#
# convert gbk/embl to fasta
#

# List the .gbk and .embl entries located directly in the download
# directory. The list ff is reused by the info, cds and intron steps.
# -mindepth 1 -maxdepth 1 replaces the BSD-only -depth 1 primary, which
# GNU find rejects; both select the direct children only.
set ff = `find $DB_BASE/download -mindepth 1 -maxdepth 1 \( -name \*.gbk -or -name \*.embl \) -print`

Notify "convert $#ff gbk/embl entries to fasta"

# The file extension (gbk or embl) selects the converter script
# <extension>.tofasta.awk; the output goes to fasta/<basename>.fst
foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.tofasta.awk $f > $DB_BASE/fasta/$nom.fst
end

#
# get gbk/embl info
#

Notify "get gbk/embl info for $#ff entries"

# db.info.txt is first (re)created by a HEADONLY=1 run fed with an empty
# line, then each entry appends its own rows.
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt  # just get header

# Per entry: <type>.oneliner.awk output is piped into <type>.info.awk,
# and lines starting with a hash sign are dropped before appending.
foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
  egrep -v '^#' >> db.info.txt
end

#
# get cds info
#

Notify "get gbk/embl cds for $#ff entries"

# db.cds.txt is first (re)created by a HEADONLY=1 run fed with an empty
# line, then each entry appends its own rows.
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt  # just get header

# Per entry: <type>.oneliner.awk output is piped into <type>.cds_long.awk,
# which also receives the fasta file produced by the conversion step
# through the FASTA variable. Lines starting with a hash sign are dropped.
foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
          -f $LIB_DIR/$typ.cds_long.awk |\
  egrep -v '^#' >> db.cds.txt
end

#
# get fasta for  prots
#

# Build the protein fasta file db.prot.fst from the cds table db.cds.txt;
# db.prot.fst is the input of the subDB construction further down.
Notify "get prots"
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst

#
# get introns
#

Notify "get gbk/embl introns for $#ff entries"

# db.intron.txt is first (re)created by a HEADONLY=1 run fed with an empty
# line, then each entry appends its own rows.
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt  # just get header

# Per entry: <type>.oneliner.awk output is piped into <type>.intron.awk,
# which also receives the entry fasta file through the FASTA variable.
# Lines starting with a hash sign are dropped before appending.
foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
          -f $LIB_DIR/$typ.intron.awk |\
  egrep -v '^#' >> db.intron.txt
end

#
# make models
#

Notify "Making models"

# Write the cutoffs sourced from parameters.sh as R assignments into
# db.models.params.txt (presumably read by make.models.r - TODO confirm).
# The first echo truncates the file.
echo -n ""                                              >  db.models.params.txt
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF"            >> db.models.params.txt
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF"  >> db.models.params.txt
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF"  >> db.models.params.txt
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF"  >> db.models.params.txt
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF"            >> db.models.params.txt
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF"        >> db.models.params.txt
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF"            >> db.models.params.txt

# stdout and stderr of the R script both go through Cat
$LIB_DIR/make.models.r |& Cat

# GetStatus, OnError and Error are helpers defined outside this file;
# presumably they test the exit status of the command above and abort
# with code 2 - TODO confirm.
GetStatus
OnError then 
  Error 2 "model parameter too stringent"
endif

#
# add matrices
#

# NOTE(review): the models directory is not created in this script;
# presumably make.models.r creates it - confirm.
cp -f $PROG_DIR/matrices/* models

#
# make subDBs
#

# One go_subdb.sh run per pattern file that exists in the info directory
# (the pattern files are not written by this script; presumably they are
# produced by make.models.r - TODO confirm).
# The five trailing arguments are, in order, the delta, covmin, pmax,
# idmin and sizmin thresholds taken from parameters.sh.
# go_subdb.sh writes its result into the directory db.<kind>.pat.db
if (-e db.core.pat.txt) then
  Notify "Making core DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
    $CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN
endif

if (-e db.shell.pat.txt) then
  Notify "Making shell DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
    $SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN
endif

if (-e db.dust.pat.txt) then
  Notify "Making dust DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
    $DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
endif

#
# recompress entries
#

# Gzip every regular file located directly in the download directory
# (undoes the temporary uncompression done at the start).
# -mindepth 1 -maxdepth 1 replaces the BSD-only -depth 1 primary, which
# GNU find rejects; both select the direct children only.
set ff = `find $DB_BASE/download -mindepth 1 -maxdepth 1 -type f -print`

if ($#ff != 0) then
  Notify "recompressing $#ff entries"
  foreach f ($ff)
    gzip -f $f
  end
endif

# compress fasta

set ff = `find $DB_BASE/fasta -mindepth 1 -maxdepth 1 -name \*.fst -print`

if ($#ff != 0) then
  Notify "compressing $#ff fasta entries"
  foreach f ($ff)
    gzip -f $f
  end
endif

# install everything in proper directory

# For each kind of subDB: any previously installed directory is removed
# first, then the freshly built db.<kind>.pat.db is moved in place, but
# only when it exists and holds an Annot.lst file. A kind that was not
# rebuilt therefore ends up removed, not kept.
# The leading backslash on rm and mv bypasses any alias of these commands.
foreach dir ("core" "shell" "dust")
  if (-e $DB_BASE/$dir) \rm -r $DB_BASE/$dir
  if ((-d db.$dir.pat.db) && (-e db.$dir.pat.db/Annot.lst)) then
    Notify "installing $DB_BASE/$dir"
    \mv -f db.$dir.pat.db $DB_BASE/$dir
  endif
end

# Same replacement policy for the models directory.
if (-e $DB_BASE/models) \rm -r $DB_BASE/models
if (-d models) \mv -f models $DB_BASE

Notify "Done"
exit 0

@ -0,0 +1,29 @@
#
# blosum62 substitution matrix
# with larger penalty for stops
#
  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A 4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -50
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -50
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -50
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -50
C 0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -50
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -50
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -50
G 0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -50
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -50
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -50
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -50
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -50
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -50
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -50
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -50
S 1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -50
T 0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -50
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -50
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -50
V 0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -50
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -50
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -50
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -50
* -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 1
@ -0,0 +1,195 @@
#!/bin/csh -f
#
# usage: go_subdb.sh prot.fst pat.txt [deltalen covmin pmax idmin sizmin]
# usage: prot.fst : proteins fasta file
# usage: pat.txt  : text file containing patterns and names for families to extract
# usage: output directory containing subdbs : basename <pat:r>.db
#

# Clear the ORG_SOURCED marker before sourcing the init script
# (presumably forces csh_init.sh to redo its full setup - TODO confirm).
unsetenv ORG_SOURCED

# ORG_HOME is located relative to this script, five directory levels up.
setenv ORG_HOME `dirname $0`/../../../../..
source $ORG_HOME/scripts/csh_init.sh

# NeedArg, NeedFile, Argv and Shift are helpers defined outside this file
# (presumably by csh_init.sh - TODO confirm).
NeedArg 2

# The two mandatory arguments: protein fasta file, then pattern file.
set ProtFile = $Argv[1]; Shift
set PatFile  = $Argv[1]; Shift

NeedFile $ProtFile
NeedFile $PatFile

#
# parameters
#

# Defaults, used for each optional argument that is not given.
set Delta  = 0.5
set Covmin = 30
set Pmax   = 1e-6
set Idmin  = 30
set Sizmin = 5

# The optional arguments are positional and consumed in this order:
# deltalen covmin pmax idmin sizmin
if ($#Argv > 0) then
  set Delta = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Covmin = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Pmax = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Idmin = $Argv[1]; Shift
endif

if ($#Argv > 0) then
  set Sizmin = $Argv[1]; Shift
endif

#
# output directory
#

# The output directory and the log file are created in the current
# directory and named after the pattern file: <pat basename without
# extension>.db and <pat basename without extension>.log
set OutDir = `basename $PatFile:r`.db

# Any previous output directory is discarded.
if (-d $OutDir) \rm -r $OutDir
mkdir $OutDir

set OutLog = `basename $PatFile:r`.log

# Truncate the log file.
echo -n '' > $OutLog

# Report <fasta file> <label>
# Appends one line to the log file made of: the label, the basename of
# the fasta file, and the number of lines of that file starting with the
# greater-than sign (the number of fasta entries).
# The quote juggling keeps the dollar-1 of the awk program and OutLog
# unexpanded until the alias is used.
alias Report 'egrep "^>" \!:1 | wc -l | awk -v P=`basename \!:1` -v H=\!:2 '"'{print H,P,"'$1}'"'"' >> $OutLog'

#
# remove entries with bad symbols
#

Notify "cleanup $ProtFile"

Report $ProtFile "init_size"

# Filtered copy of the protein file (entries with bad symbols removed);
# every later step reads this P_ temporary file.
$AwkCmd -f $LIB_DIR/db.filter.sym.awk $ProtFile > P_$$

# Log the number of entries left AFTER the cleanup: the count is taken
# on the filtered P_ file (counting ProtFile again would just repeat
# init_size). The Report alias labels its line with the basename of the
# file it counts, so the pipeline is spelled out here in order to keep
# the ProtFile basename as label.
egrep "^>" P_$$ | wc -l | awk -v P=`basename $ProtFile` -v H="cleanup_size" '{print H,P,$1}' >> $OutLog

#
# select by name pattern
#

Notify "select by patterns"

# Temporary directories (suffixed by the shell pid) holding one fasta
# file per family after each stage:
# D_ after the pattern filter, E_ after the length filter and F_ after
# the similarity filter.
mkdir D_$$
mkdir E_$$
mkdir F_$$

# First column of the pattern file : family names
set noms = `awk '{print $1}' $PatFile`

# Second column of the pattern file : pattern passed as PAT to
# db.filter.pat.awk, which is run on the cleaned protein file.
# A family with at most Sizmin entries is dropped.
foreach nom ($noms)
  set pat = `egrep "^$nom " $PatFile | awk '{print $2}'`
  $AwkCmd -f $LIB_DIR/db.filter.pat.awk -v PAT="$pat" P_$$ > D_$$/$nom.fst
  set n = `egrep '^>' D_$$/$nom.fst | wc -l`
  Notify "  pattern : $nom : $n"
  Report D_$$/$nom.fst "pattern_filter"
  if ($n <= $Sizmin) \rm -f D_$$/$nom.fst
end

# Nothing left : skip the remaining stages and go to the final cleanup.
set ok = `ls D_$$ | wc -l`
if ($ok == 0) goto fin

#
# select by length
#

Notify "select by length"

# For each family kept by the pattern stage:
#   L_ : table of entry lengths (db.getlen.awk)
#   M_ : second column of the rows flagged TRUE in their last column by
#        db.filter.len.r, called with the Delta threshold
# db.subdb.awk then extracts the entries listed in M_ into the E_
# directory. A family with at most Sizmin entries is dropped.
foreach f (D_$$/*.fst) 
  set nom = `basename $f:r`
  $AwkCmd -f $LIB_DIR/db.getlen.awk $f > L_$$
  $LIB_DIR/db.filter.len.r L_$$ $Delta |\
    $AwkCmd '($NF == "TRUE") {print $2}' > M_$$
  $AwkCmd -v FILE=M_$$ -f $LIB_DIR/db.subdb.awk $f > E_$$/$nom.fst
  Report E_$$/$nom.fst "length_filter"
  set n = `egrep '^>' E_$$/$nom.fst | wc -l`
  Notify "  length filter : $nom : $n"
  if ($n <= $Sizmin) \rm -f E_$$/$nom.fst
end

# Nothing left : skip the remaining stages and go to the final cleanup.
set ok = `ls E_$$ | wc -l`
if ($ok == 0) goto fin


#
# select by similarity
#

Notify "select by similarity"

# For each family kept by the length stage:
#   - all-against-all blastp of the family against itself
#   - db.blastlink.awk keeps the hits passing Covmin, Pmax and Idmin,
#     db.todl.awk turns them into a graph file (G_)
#   - db.cc.r writes the component of each entry into <file>.cc.txt
#   - db.selcc.awk lists the entries of the selected component (S_),
#     db.subdb.awk extracts them into the F_ directory
# A family with at most Sizmin entries is dropped.
foreach f (E_$$/*.fst)
  set nom = `basename $f:r`

  Notify "  blasting $nom"

  makeblastdb -dbtype 'prot' -in $f >>& db.log
  blastp -db $f -query $f -outfmt 7 > $f.blast.out
  # remove the blast index files created next to the fasta file
  \rm -f $f.p??

  $AwkCmd -v COVMIN=$Covmin -v PMAX=$Pmax -v IDMIN=$Idmin \
      -f $LIB_DIR/db.blastlink.awk $f.blast.out |\
      $AwkCmd -f $LIB_DIR/db.todl.awk > G_$$

  ($LIB_DIR/db.cc.r G_$$ > $f.cc.txt) >>& db.log

  # db.reportcc.awk calls asort, which is a gawk extension: run it with
  # AwkCmd like the other library scripts (presumably gawk - confirm)
  # instead of the plain system awk.
  $AwkCmd -v NAME=$nom -f $LIB_DIR/db.reportcc.awk $f.cc.txt >> $OutLog

  $AwkCmd -f $LIB_DIR/db.selcc.awk $f.cc.txt > S_$$
  $AwkCmd -v FILE=S_$$ -f $LIB_DIR/db.subdb.awk $f > F_$$/$nom.fst

  Report F_$$/$nom.fst "similarity_filter"

  set n = `egrep '^>' F_$$/$nom.fst | wc -l`
  Notify "  blast filter : $nom : $n"
  if ($n <= $Sizmin) \rm -f F_$$/$nom.fst

end

# The families surviving this stage are in the F_ directory (not D_):
# stop here when none is left, otherwise the glob on F_ used by the
# annotation step would not match anything.
set ok = `ls F_$$ | wc -l`
if ($ok == 0) goto fin

#
# annotations
#

# J_ : one summary line per selected family, produced by db.annot.awk
echo -n "" > J_$$

foreach f (F_$$/*.fst) 
  $AwkCmd -f $LIB_DIR/db.annot.awk $f >> J_$$
end

# A_ : first and last columns of the pattern file lines having at least
#      three columns, sorted
# B_ : the sorted summary lines, blank lines removed
# Annot.lst is the join of A_ and B_ on their first column.
awk '(NF >= 3) {print $1, $NF}' $PatFile | sort > A_$$
sort J_$$ | egrep -v '^ *$' > B_$$
join A_$$ B_$$ > F_$$/Annot.lst

#
# copy files
#

# Move the selected families and Annot.lst into the output directory.
set n = `ls F_$$/* | wc -l`
Notify "copy $n files to $OutDir"

\mv -f F_$$/* $OutDir

#
# end
#

# Also reached by goto when a stage leaves no family at all; the output
# directory then stays empty.
fin:
Notify "output directory : $OutDir"

# Remove every temporary file and directory of this run: they are all
# named with a one-character prefix, an underscore and the shell pid.
\rm -r ?_$$


exit 0
@ -0,0 +1,39 @@
#
# Summarize one family fasta file as a single line:
#
#   <family> <nb_entries> <histogram> <MONEX|POLYEX> <annotation>
#
# family     : basename of the input file, up to its first dot
# histogram  : "key:count" pairs joined by "_", where key is the
#              next-to-last "@"-separated part of the first header field
#              (presumably the number of exons - TODO confirm)
# MONEX      : printed when the largest key is 1, POLYEX otherwise
# annotation : most frequent last field among the header lines
#              ("none" when there is no header line)
#

/^>/ {
  N++
  nparts = split($1, parts, "@")
  key = parts[nparts-1]
  if (key > NEXMAX) NEXMAX = key
  NEX[key]++
  ANNOT[$NF]++
}

END {
  # family name from the file name
  npath = split(FILENAME, path, "/")
  split(path[npath], stem, "\\.")
  printf("%s %d ", stem[1], N)

  # histogram, in increasing key order, empty keys skipped
  hist = ""
  for (k = 1 ; k <= NEXMAX ; k++) {
    if (NEX[k] != 0)
      hist = ((hist == "") ? "" : hist "_") k ":" NEX[k]
  }
  printf("%s ", hist)

  printf("%s ", (NEXMAX == 1) ? "MONEX" : "POLYEX")

  # most frequent annotation
  best = "none"
  bestcount = 0
  for (annot in ANNOT) {
    if (ANNOT[annot] > bestcount) {
      bestcount = ANNOT[annot]
      best = annot
    }
  }
  print best

}




@ -0,0 +1,48 @@
#
# Select blast hits linking two different sequences.
#
# input  : blast tabular output with comment lines (-outfmt 7)
# output : query subject hitnum identity coverage evalue alilen len1 len2
#
# variables (-v):
#   COVMIN : coverage must be strictly greater than this (default 50)
#   PMAX   : e-value must be strictly lower than this, or 0 (default 1e-6)
#   IDMIN  : identity must be strictly greater than this (default 30)
#
# The sequence lengths are read from the last "@"-separated part of the
# query and subject identifiers; coverage is the alignment length in
# percent of the shortest of the two sequences.
#

function min(x, y) {
  if (x < y) return x
  return y
}

BEGIN {
  if (COVMIN == "") COVMIN = 50
  if (PMAX == "")   PMAX   = 1e-6
  if (IDMIN == "")  IDMIN  = 30
}

# a comment line starts a new block : restart the hit numbering
/^#/ {
  hitnum = 0
  next
}

# self hits are ignored and not numbered
$1 == $2 {
  next
}

{
  hitnum++

  nq = split($1, qpart, "@")
  if (nq < 2) {
    print "query file not properly formatted" > "/dev/stderr"
    exit(1)
  }
  len1 = qpart[nq]

  ns = split($2, spart, "@")
  if (ns < 2) {
    print "bank file not properly formatted" > "/dev/stderr"
    exit(1)
  }
  len2 = spart[ns]

  id     = $3 + 0.0
  ali    = $4
  covmin = ali * 100. / min(len1, len2)
  proba  = $11 + 0.0

  covered     = (covmin > COVMIN)
  significant = ((proba < PMAX) || (proba == 0))
  similar     = (id > IDMIN)

  if (covered && significant && similar)
    print $1, $2, hitnum, id, covmin, proba, ali, len1, len2
}


@ -0,0 +1,18 @@
#!/usr/bin/env Rscript
#
# Print, for every vertex of a graph, its name and the index of the
# connected component it belongs to (one vertex per line).
#
# usage: db.cc.r [graph.dl]
#   graph.dl : graph file in 'dl' format (default: graph.dl)
#

require(igraph, warn.conflicts=FALSE)

argv <- commandArgs(TRUE)

graph_file <- 'graph.dl'
if (length(argv) > 0) graph_file <- argv[1]

g <- read.graph(graph_file, format='dl')

# connected components of the graph
comp <- clusters(g)

# two columns : vertex name, component index
out <- cbind(V(g)$name, membership(comp))

write.table(out, quote=FALSE, row.names=FALSE, col.names=FALSE)

quit(save="no")


@ -0,0 +1,19 @@
#!/usr/bin/env Rscript
#
# Flag the sequences whose length is close to the median length.
#
# usage: db.filter.len.r [len.txt [delta]]
#   len.txt : table with a header line and a 'len' column (default: len.txt)
#   delta   : maximal relative deviation from the median (default: 0.5,
#             'Inf' accepts every length)
#
# Writes the input table to stdout with an additional logical column
# 'ok', TRUE when abs(len - median) / median <= delta.
#

args <- commandArgs(TRUE)
path <- if (length(args) > 0) args[1] else 'len.txt'

# Command line arguments are character strings. delta has to be converted
# to a number here: comparing a number to a string with <= makes R coerce
# the number to a string and compare the two as text (so that a ratio
# printed as "1e-04" would be found larger than "0.5").
delta <- if (length(args) > 1) as.numeric(args[2]) else 0.5
if (is.na(delta)) stop("delta should be a number")

tab <- read.table(path, header=TRUE)

lmed <- median(tab$len)

tab$ok <- (abs(tab$len - lmed) / lmed) <= delta

write.table(tab, quote=FALSE)

quit(save='no')


@ -0,0 +1,10 @@
#
# Extract the fasta entries selected by a pattern.
#
# An entry is kept when the third "@"-separated part of the first field
# of its header line matches the regular expression PAT.
#
# usage: awk -v PAT=regexp -f db.filter.pat.awk file.fst
#

# header line : decide for the whole entry
/^>/ {
  split($1, part, "@")
  keep = (part[3] ~ PAT)
}

# header and sequence lines of a kept entry are printed as they are
keep
@ -0,0 +1,30 @@
#
# Remove the fasta entries with bad symbols: an entry is printed only
# when its sequence is not empty and exclusively made of the uppercase
# letters ACDEFGHIKLMNPQRSTVWXY.
#

# true when seq (sequence lines, each one ended by a newline) is not
# empty and only holds accepted symbols
function Check(seq) {
  return (seq ~ /^[ACDEFGHIKLMNPQRSTVWXY\n]+$/)
}

# print the pending entry (header in Name, sequence lines in Seq)
# when its sequence is valid
function Flush() {
  if (Check(Seq)) {
    print Name
    printf("%s", Seq)
  }
}

# new header : output the previous entry, then start a new one
/^>/ {
  Flush()
  Name = $0
  Seq = ""
  next
}

# sequence line
{
  Seq = Seq $0 "\n"
}

# last entry
END {
  Flush()
}
@ -0,0 +1,10 @@
#
# Print a two-column table "id len" of the entries of a fasta file.
#
# id  : first field of the header line without the leading ">"
# len : last "@"-separated part of that first field
#

BEGIN {
  print "id len"
}

/^>/ {
  nparts = split($1, part, "@")
  id = substr($1, 2)
  print id, part[nparts]
}


@ -0,0 +1,15 @@
#
# Report the sizes of the connected components of one family, as a
# single line:
#
#   cc_size <NAME> <size> <size> ...      (sizes in decreasing order)
#
# input : one line per sequence, the last field being its component
# NAME  : family name, set with -v
# note  : asort is a gawk extension
#

# number of sequences per component
{
  size[$NF]++
}

END {
  n = asort(size, sorted)
  line = "cc_size " NAME
  for (k = n ; k >= 1 ; k--)
    line = line sprintf(" %d", sorted[k])
  print line
}


@ -0,0 +1,19 @@
#
# Print the members of the largest connected component.
#
# input  : one line per sequence, first field = sequence name,
#          last field = component
# output : the names of the sequences of the largest component, one per
#          line, in input order
#
# Component 1 is the initial candidate and is only replaced by a
# strictly larger component.
#

# N[cc]    : number of sequences of component cc
# E[cc, k] : name of the k-th sequence of component cc
{
  cc = $NF
  E[cc, ++N[cc]] = $1
}

END {
  cmax = 1
  nmax = N[1]
  for (cc in N) {
    if (N[cc] > nmax) {
      nmax = N[cc]
      cmax = cc
    }
  }
  k = 1
  while (k <= nmax) {
    print E[cmax, k]
    k++
  }
}
@ -0,0 +1,17 @@
#
# Extract from a fasta file the entries whose name is listed in FILE.
#
# FILE : file of names to keep, one per line, first field used
#        (set with -v, default: db.sel.txt)
#
# The name of an entry is the first field of its header line without
# the leading ">".
#

BEGIN {
  if (FILE == "") FILE = "db.sel.txt"
  # getline returns -1 when FILE cannot be read: the result has to be
  # compared to 0, since a bare (getline < FILE) condition would stay
  # true forever in that case
  while ((status = (getline < FILE)) > 0)
    INC[$1] = $1
  if (status < 0)
    print "cannot read " FILE > "/dev/stderr"
  close(FILE)
}

# header line : decide for the whole entry
/^>/ {
  name = substr($1, 2)
  ok = name in INC
}

ok {
  print $0
}
@ -0,0 +1,21 @@
#
# Convert a list of links into a graph file in DL edgelist format.
#
# input  : one link per line, the first two fields are the two nodes
# output : DL header with the number of distinct nodes, followed by the
#          links in input order
#

{
  # count each node the first time it is seen
  if (! ($1 in seen)) {
    seen[$1] = 1
    nnode++
  }
  if (! ($2 in seen)) {
    seen[$2] = 1
    nnode++
  }
  edge[++nedge] = $1 " " $2
}


END {
  print "DL n=" nnode
  print "format = edgelist1"
  print "labels embedded:"
  print "data:"
  for (k = 1 ; k <= nedge ; k++)
    print edge[k]
}