cds/tools/chlorodb added

Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda
Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
This commit is contained in:
alain viari
2015-11-13 17:41:18 +01:00
parent 0d5f0c1f20
commit e4d6a8484d
585 changed files with 4750 additions and 50 deletions

View File

@@ -0,0 +1,39 @@
#
# For every line starting with ">": count it in N, take the next-to-last
# "@"-separated piece of the first field (presumably an exon count, judging
# by the MONEX/POLYEX labels printed at the end -- confirm), keep its maximum
# in NEXMAX and its frequency in NEX, and count the last field in ANNOT.
/^>/ {
    N += 1
    nparts = split($1, parts, "@")
    nex = parts[nparts - 1]
    if (nex > NEXMAX)
        NEXMAX = nex
    NEX[nex] += 1
    ANNOT[$NF] += 1
}
# Emit one summary line:
#   <label> <N> <i:count_i:count...> <MONEX|POLYEX> <most frequent last field>
END {
    # Label = last "/" component of FILENAME, cut at its first ".".
    npath = split(FILENAME, path, "/")
    split(path[npath], base, "\\.")
    printf("%s %d ", base[1], N)

    # "value:count" for every value in 1..NEXMAX that was seen, joined by "_".
    hist = ""
    for (k = 1; k <= NEXMAX; k++) {
        if (NEX[k] == 0)
            continue
        hist = hist "" k ":" NEX[k] "_"
    }
    gsub("_+$", "", hist)      # drop the trailing separator
    printf("%s ", hist)

    if (NEXMAX == 1)
        printf("%s ", "MONEX")
    else
        printf("%s ", "POLYEX")

    # Last-field value with the highest count; "none" when nothing was seen.
    best = "none"
    bestCount = 0
    for (label in ANNOT) {
        if (ANNOT[label] > bestCount) {
            bestCount = ANNOT[label]
            best = label
        }
    }
    print best
}

View File

@@ -0,0 +1,48 @@
#
# Return x when x < y, otherwise y.
function min(x, y) {
    if (x < y)
        return x
    return y
}
# Default thresholds, used only when the variable was not set by the caller:
#   COVMIN  lower bound for the coverage percentage computed per line
#   PMAX    upper bound for the value read from field 11
#   IDMIN   lower bound for the value read from field 3
BEGIN {
    if (COVMIN == "") {
        COVMIN = 50
    }
    if (PMAX == "") {
        PMAX = 1e-6
    }
    if (IDMIN == "") {
        IDMIN = 30
    }
}
# A line starting with "#" restarts the hit counter and is otherwise ignored.
/^#/ { hitnum = 0; next }
# Filter one hit line (presumably BLAST-style tabular output -- confirm).
#   $1, $2  identifiers whose last "@"-separated piece is a sequence length
#   $3      identity value, $4 alignment length, $11 probability / e-value
# A line is printed when the alignment covers more than COVMIN percent of the
# shorter sequence, $11 is below PMAX (or exactly 0) and $3 is above IDMIN.
# Output: $1 $2 hitnum id coverage proba ali len1 len2
{
    # Lines whose first two fields are identical are skipped and not counted.
    if ($1 == $2) next
    hitnum++

    na = split($1, a, "@")
    if (na < 2) {
        print "query file not properly formatted" > "/dev/stderr"
        exit(1)
    }
    len1 = a[na]

    na = split($2, a, "@")
    if (na < 2) {
        print "bank file not properly formatted" > "/dev/stderr"
        exit(1)
    }
    len2 = a[na]

    # A zero or non-numeric length would make the division below abort awk
    # with a "division by zero" fatal error; report it like the other
    # formatting problems instead.
    if ((len1 + 0 <= 0) || (len2 + 0 <= 0)) {
        print "sequence length is not a positive number" > "/dev/stderr"
        exit(1)
    }

    id = $3 + 0.0
    ali = $4
    covmin = ali * 100. / min(len1, len2)
    proba = $11 + 0.0

    if ((covmin > COVMIN) && ((proba < PMAX) || (proba == 0)) && (id > IDMIN)) {
        print $1, $2, hitnum, id, covmin, proba, ali, len1, len2
    }
}

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env Rscript
#
# Print, for every vertex of a graph, its name and the id of the cluster
# (as computed by igraph's clusters()) it belongs to.
#
# Usage: <script> [path]
#   path  graph file in "dl" format (default: "graph.dl")
#
# Output (stdout): one "<name> <membership>" line per vertex, without header,
# row names or quotes.

# library() stops immediately when igraph is not installed; require() would
# only warn and the script would fail later on the first igraph call.
library(igraph, warn.conflicts = FALSE)

args <- commandArgs(trailingOnly = TRUE)
path <- if (length(args) > 0) args[1] else "graph.dl"

g <- read.graph(path, format = "dl")
cc <- clusters(g)
res <- cbind(V(g)$name, membership(cc))
write.table(res, quote = FALSE, row.names = FALSE, col.names = FALSE)
quit(save = "no")

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env Rscript
#
# Flag rows whose `len` value lies within a relative distance `delta` of the
# median length.
#
# Usage: <script> [path [delta]]
#   path   table with a header line and a `len` column (default: "len.txt")
#   delta  maximal allowed |len - median| / median (default: 0.5)
#
# Output (stdout): the input table with an extra logical column `ok`.
args <- commandArgs(trailingOnly = TRUE)
path <- if (length(args) > 0) args[1] else "len.txt"

# Command-line arguments are character strings. Convert delta once, up front:
# comparing a number against a string with `<=` coerces the number to a string
# and compares as text (e.g. 1e-05 <= "0.5" is FALSE).
delta <- if (length(args) > 1) suppressWarnings(as.numeric(args[2])) else 0.5
if (is.na(delta)) {
  stop("delta must be a number, got '", args[2], "'", call. = FALSE)
}

tab <- read.table(path, header = TRUE)
lmed <- median(tab$len)
tab$ok <- (abs(tab$len - lmed) / lmed) <= delta
write.table(tab, quote = FALSE)
quit(save = "no")

View File

@@ -0,0 +1,10 @@
#
# On each line starting with ">", decide whether to keep what follows: the
# third "@"-separated piece of the first field must match the regex in PAT.
/^>/ {
    split($1, field, "@")
    ok = (field[3] ~ PAT)
}

# Print every line (the ">" line included) while the keep flag is set.
ok { print }

View File

@@ -0,0 +1,30 @@
#
#
#
# Return 1 when seq is non-empty and made only of the letters
# ACDEFGHIKLMNPQRSTVWXY and newlines; return 0 otherwise.
function Check(seq) {
    if (seq == "")
        return 0
    # Strip every allowed character; anything left over is invalid.
    gsub("[ACDEFGHIKLMNPQRSTVWXY\n]+", "", seq)
    return (seq == "")
}
# Print the buffered record (its ">" line, then its sequence lines exactly as
# read) when the buffered sequence passes Check(); otherwise print nothing.
function EmitIfValid() {
    if (!Check(Seq))
        return
    print Name
    printf("%s", Seq)
}

# A ">" line closes the previous record and opens a new, empty one.
/^>/ {
    EmitIfValid()
    Name = $0
    Seq = ""
    next
}

# Any other line is appended to the current record, newline included.
{ Seq = Seq "" $0 "\n" }

# The last record is still pending when the input ends.
END { EmitIfValid() }

View File

@@ -0,0 +1,10 @@
#
# Header line of the two-column table.
BEGIN { print "id len" }

# For each ">" line: the first field without its leading ">" and the last
# "@"-separated piece of that field.
/^>/ {
    nparts = split($1, parts, "@")
    id = substr($1, 2)
    print id, parts[nparts]
}

View File

@@ -0,0 +1,15 @@
#
#
# Count how many lines carry each value of the last field.
{ cnt[$NF] += 1 }

# Print "cc_size NAME" followed by the counts, largest first.
# asort() is a gawk extension: it sorts the values and re-indexes them 1..n.
END {
    total = asort(cnt)
    printf("cc_size %s", NAME)
    rank = total
    while (rank >= 1) {
        printf(" %d", cnt[rank])
        rank--
    }
    print ""
}

View File

@@ -0,0 +1,19 @@
#
# Group first fields by the value of the last field: N[] holds the group size,
# E[group, k] the k-th member in input order.
{
    count = ++N[$NF]
    E[$NF, count] = $1
}

# Print the members of the largest group. Group "1" is the starting candidate,
# so another group replaces it only when strictly larger.
END {
    best = 1
    bestSize = N[1]
    for (label in N) {
        if (N[label] <= bestSize)
            continue
        bestSize = N[label]
        best = label
    }
    for (k = 1; k <= bestSize; k++)
        print E[best, k]
}

View File

@@ -0,0 +1,17 @@
#
# Load the selection list: the first field of every line of FILE (default
# "db.sel.txt") becomes a key of INC.
BEGIN {
    if (FILE == "") FILE = "db.sel.txt"
    # getline returns 1 on success, 0 at end of file and -1 on error. The
    # loop must test "> 0": a bare "while (getline < FILE)" never ends when
    # FILE cannot be read, because -1 is true.
    while ((status = (getline < FILE)) > 0)
        INC[$1] = $1
    close(FILE)
    if (status < 0) {
        print "cannot read selection file: " FILE > "/dev/stderr"
        exit(1)
    }
}
# On each ">" line, keep what follows only when the first field, without its
# leading ">", is a key of INC.
/^>/ { ok = (substr($1, 2) in INC) }

# Print every line (the ">" line included) while the keep flag is set.
ok { print }

View File

@@ -0,0 +1,21 @@
#
# Each input line is an edge between its first two fields: remember both
# endpoints in node[] and keep the edges in input order in link[1..M].
{
    node[$1] += 1
    node[$2] += 1
    M += 1
    link[M] = $1 " " $2
}

# Write the header lines with the number of distinct endpoints, then the edges.
END {
    for (v in node)
        N += 1
    print "DL n=" N
    print "format = edgelist1"
    print "labels embedded:"
    print "data:"
    k = 0
    while (++k <= M)
        print link[k]
}