e4d6a8484da5a00bbbbbce0d27957f2fb9ba0ed9

@@ -0,0 +1,39 @@
#

# Header lines (those starting with ">"): count them and keep two tallies:
#   NEX[k]   - how many headers carry the integer k as the next-to-last
#              "@"-separated piece of their first word (presumably an exon
#              count -- TODO confirm against the header format);
#   ANNOT[w] - how many headers end with the word w.
/^>/ {
  N++
  na = split($1, a, "@")
  # Force a number. A non-numeric piece would otherwise be stored in NEXMAX
  # as a string, turning "i <= NEXMAX" in END into a string comparison that
  # can run far past the intended bound (or never stop). The coercion also
  # makes "03" and "3" share one bucket, matching the integer loop index.
  nex = a[na-1] + 0
  if (nex > NEXMAX) NEXMAX = nex
  NEX[nex]++
  ANNOT[$NF]++
}

# Summary line:
#   <file stem> <header count> <k:count_k:count...> <MONEX|POLYEX> <top word>
END {
  # Basename of the last input file, cut at its first ".". Separate arrays
  # are used so that split() never overwrites the array it is reading from.
  np = split(FILENAME, path, "/")
  split(path[np], stem, "\\.")
  printf("%s %d ", stem[1], N)

  # "k:count" for every non-empty bucket 1..NEXMAX, joined with "_".
  s = ""
  for (i = 1 ; i <= NEXMAX ; i++) {
    if ((i in NEX) && NEX[i] != 0)
      s = s "" i ":" NEX[i] "_"
  }
  gsub("_+$", "", s)
  printf("%s ", s)

  # MONEX only when the largest bucket seen is exactly 1.
  s = (NEXMAX == 1) ? "MONEX" : "POLYEX"
  printf("%s ", s)

  # Most frequent trailing word, or "none" when no header was read. Ties are
  # resolved by awk's (unspecified) for-in traversal order.
  nmax = 0
  amax = "none"
  for (e in ANNOT) {
    if (ANNOT[e] > nmax) {
      nmax = ANNOT[e]
      amax = e
    }
  }
  print amax

}


@@ -0,0 +1,48 @@
#

# Smaller of the two arguments; when neither is smaller, y is returned.
function min(x, y) {
  if (x < y)
    return x
  return y
}

# Selection thresholds. Each one keeps a value preset on the command line
# (awk -v NAME=value) and otherwise takes the default below.
BEGIN {
  if (COVMIN == "") COVMIN = 50     # coverage in percent, strict lower bound
  if (PMAX == "")   PMAX   = 1e-6   # field 11, strict upper bound (0 always passes)
  if (IDMIN == "")  IDMIN  = 30     # field 3, strict lower bound
}

# A line starting with "#" restarts the hit numbering and is otherwise ignored.
/^#/ {
  hitnum = 0;
  next;
}

# Every other line: fields 1 and 2 are two sequence names, each ending in
# "@<length>"; fields 3, 4 and 11 are read as identity, alignment length and
# probability (presumably BLAST tabular columns -- TODO confirm).
# Lines passing all three thresholds are printed as:
#   name1 name2 hitnum id coverage proba ali len1 len2
{
  if ($1 == $2) next      # self hit: skipped and not numbered

  hitnum++;

  na = split($1, a, "@");
  len1 = a[na];
  # A zero or non-numeric length would make the coverage division below
  # abort awk with "division by zero", so it is reported as a format error.
  if (na < 2 || len1 + 0 <= 0) {
    print "query file not properly formatted" > "/dev/stderr"
    exit(1);
  }

  na = split($2, a, "@");
  len2 = a[na];
  if (na < 2 || len2 + 0 <= 0) {
    print "bank file not properly formatted" > "/dev/stderr"
    exit(1);
  }

  id  = $3 + 0.0;
  ali = $4;

  # Alignment length as a percentage of the shorter sequence. The lengths
  # are forced to numbers so min() never falls back to string comparison.
  covmin = ali * 100. / min(len1 + 0, len2 + 0);

  proba = $11 + 0.0;

  if ((covmin > COVMIN) && ((proba < PMAX) || (proba == 0)) && (id > IDMIN)) {
    print $1, $2, hitnum, id, covmin, proba, ali, len1, len2;
  }
}

@@ -0,0 +1,18 @@
#!/usr/bin/env Rscript
#

# Print, for every vertex of a DL-format graph, its name and the id of the
# connected component it belongs to.

# library() stops right away when igraph is not installed; require() would
# only return FALSE and let the script fail later on the first igraph call.
library(igraph, warn.conflicts = FALSE)

# First trailing command-line argument: path of the graph file.
args <- commandArgs(trailingOnly = TRUE)
path <- if (length(args) > 0) args[1] else "graph.dl"

g <- read.graph(path, format = "dl")

# NOTE(review): newer igraph releases spell these read_graph() and
# components(); confirm the targeted igraph version before renaming.
cc <- clusters(g)

# One row per vertex: vertex name, component membership.
res <- cbind(V(g)$name, membership(cc))

# Written to stdout without quoting, row names or header.
write.table(res, quote = FALSE, row.names = FALSE, col.names = FALSE)

quit(save = "no")

@@ -0,0 +1,19 @@
#!/usr/bin/env Rscript
#

# Flag rows whose `len` lies within a relative tolerance of the median length.
#
# Arguments (trailing command-line arguments):
#   1. path  - table with a header line and a `len` column (default "len.txt")
#   2. delta - maximal relative deviation from the median (default 0.5)
# The input table is written to stdout with an added logical column `ok`.

args <- commandArgs(trailingOnly = TRUE)
path <- if (length(args) > 0) args[1] else "len.txt"

# Command-line arguments are character strings. Convert delta once, here:
# comparing a number with a string makes R compare both as strings
# (e.g. 1e-05 <= "0.5" is FALSE), which silently mis-flags rows.
delta <- if (length(args) > 1) as.numeric(args[2]) else 0.5
if (is.na(delta)) {
  stop("delta must be a number, got '", args[2], "'", call. = FALSE)
}

tab <- read.table(path, header = TRUE)

lmed <- median(tab$len)

# Relative deviation from the median, compared numerically.
tab$ok <- (abs(tab$len - lmed) / lmed) <= delta

write.table(tab, quote = FALSE)

quit(save = "no")

@@ -0,0 +1,10 @@
#

# On each header line (starting with ">"), decide whether the record is kept:
# the third "@"-separated piece of the first word must match the regular
# expression PAT (supplied with awk -v PAT=...).
/^>/ {
  split($1, part, "@")
  if (part[3] ~ PAT)
    ok = 1
  else
    ok = 0
}

# Print every line (header included) while the current record is selected.
ok
@@ -0,0 +1,30 @@
#
#
#

# Return 1 when seq is non-empty and made only of the uppercase letters
# ACDEFGHIKLMNPQRSTVWXY and newlines; return 0 otherwise.
# seq is a by-value copy, so stripping characters here leaves the caller's
# string untouched.
function Check(seq) {
  if (seq != "") {
    gsub("[ACDEFGHIKLMNPQRSTVWXY\n]+", "", seq)
    if (seq == "")
      return 1
  }
  return 0
}

# Write out the pending record (its header line, then its sequence lines
# exactly as accumulated) when the sequence passes Check().
function EmitPending() {
  if (!Check(Seq))
    return
  print Name
  printf("%s", Seq)
}

# A new header closes the previous record and opens the next one.
/^>/ {
  EmitPending()
  Name = $0
  Seq = ""
  next
}

# Any other line is appended, newline-terminated, to the current sequence.
{
  Seq = Seq $0 "\n"
}

# The last record has no following header to trigger its output.
END {
  EmitPending()
}
@@ -0,0 +1,10 @@
#
# Column titles of the two-column table.
BEGIN { print "id len" }

# For each header line: the first word without its leading ">" and the last
# "@"-separated piece of that word.
/^>/ {
  nparts = split($1, part, "@")
  id = substr($1, 2)
  print id, part[nparts]
}

@@ -0,0 +1,15 @@
#
#

# Count how many lines share each value of the last field.
{
  cnt[$NF]++
}

# Print "cc_size NAME" followed by the counts, largest first, on one line.
# NAME is expected from the command line (awk -v NAME=...); asort() is a
# gawk extension.
END {
  total = asort(cnt, bysize)
  printf("cc_size %s", NAME)
  rank = total
  while (rank >= 1) {
    printf(" %d", bysize[rank])
    rank--
  }
  print ""
}

@@ -0,0 +1,19 @@
#

# Group first fields by the value of the last field: N[c] is the number of
# lines seen for group c, and E[c, k] is the first field of its k-th line.
{
  E[$NF, ++N[$NF]] = $1
}

# Print the members of the largest group, in input order. Group 1 is the
# starting candidate and is only replaced by a strictly larger group.
END {
  cmax = 1
  nmax = N[1]
  for (cid in N) {
    if (N[cid] > nmax) {
      nmax = N[cid]
      cmax = cid
    }
  }
  k = 0
  while (++k <= nmax)
    print E[cmax, k]
}
@@ -0,0 +1,17 @@
#

# Load the selection list: the first word of every line of FILE (default
# "db.sel.txt", override with awk -v FILE=...) becomes a key of INC.
BEGIN {
  if (FILE == "") FILE = "db.sel.txt"
  # getline returns -1 when FILE cannot be read. A bare
  # "while (getline < FILE)" treats -1 as true and never terminates, hence
  # the explicit "> 0" test and the error report below.
  while ((status = (getline < FILE)) > 0)
    INC[$1] = $1
  if (status < 0) {
    print "cannot read " FILE > "/dev/stderr"
    exit(1)
  }
  close(FILE)
}

# Header line: the record is kept when its first word, without the leading
# ">", is one of the listed names.
/^>/ {
  name = substr($1, 2)
  ok = (name in INC)
}

# Print every line (header included) of a kept record.
ok {
  print $0
}
@@ -0,0 +1,21 @@
#

# Each input line is an edge: remember it in input order and count every
# endpoint the first time it is seen.
{
  if (!($1 in node)) N++
  node[$1]++
  if (!($2 in node)) N++
  node[$2]++
  link[++M] = $1 " " $2
}

# Emit the DL header (with the number of distinct endpoints) followed by the
# edge list.
END {
  print "DL n=" N
  print "format = edgelist1"
  print "labels embedded:"
  print "data:"
  e = 0
  while (++e <= M)
    print link[e]
}