9108ce75f15de1a32febe09d5adaff76487e0490

@ -0,0 +1,98 @@
#
#
#

function Min(a, b) {
  return (a < b ? a : b)
}

function Max(a, b) {
  return (a > b ? a : b)
}

function Align(s1, s2, _local_, d, l) {
  if (s1 == s2) return 100
  d = AlignNWS(s1, s2, Identity)
  l = Max(length(s1), length(s2))
  return int((l - d) * 100 / l)
}

BEGIN {
  PROCINFO["sorted_in"] = "@ind_num_asc"
  IdentityMatrix("ABCDEFGHIJKLMNOPQRSTUVWXYZ*", Identity)
}

BEGINFILE {
  NFile++
  File[NFile] = FILENAME
}

/^#/ { next }

{
  strand = $5
  stop = (strand == "D" ? $4 : $3)
  Stop[stop]++
  i = ++NRec[NFile]
  Rec[NFile][i]["record"] = $0
  Rec[NFile][i]["genefam"] = $1
  Rec[NFile][i]["gene"] = $2
  Rec[NFile][i]["from"] = $3
  Rec[NFile][i]["to"] = $4
  Rec[NFile][i]["strand"] = $5
  Rec[NFile][i]["nexon"] = $6
  Rec[NFile][i]["length"] = $7
  Rec[NFile][i]["protseq"] = $9
  if (NFile == 1)
    Indx1[stop] = i
  else
    Indx2[stop] = i
}

END {
  for (st in Stop) {
    if (Indx1[st])
      print "FILE1 " Rec[1][Indx1[st]]["record"]
    else 
      print "FILE1 NONE"

    if (Indx2[st])
      print "FILE2 " Rec[2][Indx2[st]]["record"]
    else 
      print "FILE2 NONE"
      
    if (Indx1[st] && Indx2[st]) {
      fm = Rec[1][Indx1[st]]["genefam"]
      id = Align(Rec[1][Indx1[st]]["protseq"], Rec[2][Indx2[st]]["protseq"])
      printf("MATCH %s ID %d ", fm, id)
      if (id == 100)
        status = "CORRECT"
      else if (id >= 90)
        status = "ALMOST_CORRECT"
      else if (id >= 80)
        status = "ACCEPTABLE"
      else
        status = "WRONG"
      if (status != "CORRECT") {
        if (Rec[1][Indx1[st]]["nexon"] != Rec[2][Indx2[st]]["nexon"])
          status = status ".BAD_NBEXON"
        start1 = Rec[1][Indx1[st]]["strand"] == "D" ? Rec[1][Indx1[st]]["from"] : Rec[1][Indx1[st]]["to"]
        start2 = Rec[2][Indx2[st]]["strand"] == "D" ? Rec[2][Indx2[st]]["from"] : Rec[2][Indx2[st]]["to"]
        if (start1 != start2)
          status = status ".BAD_START"
        else
          status = status ".BAD_JUNCTION"
      }
      print status
    }
    else if (Indx1[st]) {
      fm = Rec[1][Indx1[st]]["genefam"]
      print "MATCH " fm " ID 0 MISSED.WRONG_STOP"
    }
    else if (Indx2[st]) {
      fm = Rec[2][Indx2[st]]["genefam"]
      print "MATCH " fm " ID 0 OVERPRED.WRONG_STOP"
    }
    print ""
  }
}
@ -0,0 +1,80 @@
#
# get cds features from embl (short version)
#

# @include lib.embl.awk


BEGIN {
  if (MAXSPAN == "") MAXSPAN = 10000
  print "#genefam gene from to strand nexon length status protseq product"
}

/^FT   CDS/ {
  revstrand = match($3, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  nexon = Nexons(s)
  SpanLocation(s, sloc)
  spanlen = sloc[2] - sloc[1] + 1
  len = LenLocation(s)
  ok = ok && (len < MAXSPAN)
  cdsseq = ok ? SeqLocation(seq, s, revstrand) : "XXX"
  cstart = substr(cdsseq, 1,3)
  cstop  = substr(cdsseq, length(cdsseq)-2)

  gene = "none"
  locustag = "none"
  product = "none"
  translation = "X"
  incds = 1
  next
}

(incds && /^FT   [^ ]/) {
  print GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
        nexon, len, (ok ? "Ok" : "Error"), translation, product
  incds = 0
  next
}

/^FT                   \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^FT                   \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^FT                   \/product=/ {
  split($0, a, "=")
  product = a[2]
  gsub("\"", "", product)
  gsub(" ", "_", product)
  next
}

/^FT                   \/translation=/ {
  split($0, a, "=")
  translation = a[2]
  gsub("\"", "", translation)
  gsub(" ", "", translation)
  next
}

END {
  if (incds) {
    print GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
          nexon, len, (ok ? "Ok" : "Error"), translation, product
  }
}
@ -0,0 +1,28 @@
#
# embl oneLiner
#

/^FT / {
  InFeat = 1
}

(InFeat == 0) && ($1 != "CC") && ($1 == pkey) && /^..   [^ ]/ {
  line = line " " substr($0, 6)
  next
}

(InFeat == 1) && /^FT                   [^\/]/ {
  line = line "" substr($0, 22)
  next
}

{
  if (line != "") print line
  line = $0
  pkey = $1
  next
}

END {
  if (line != "") print line
}
@ -0,0 +1,84 @@
#
# get cds features from genbank (short version)
#

# @include lib.gbk.awk


BEGIN {
  if (MAXSPAN == "") MAXSPAN = 10000
  print "#genefam gene from to strand nexon length status protseq product"
}

/^     CDS/ {
  revstrand = match($2, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  nexon = Nexons(s)
  SpanLocation(s, sloc)
  spanlen = sloc[2] - sloc[1] + 1
  len = LenLocation(s)
  ok = ok && (len < MAXSPAN)
  cdsseq = ok ? SeqLocation(seq, s, revstrand) : "XXX"
  cstart = substr(cdsseq, 1,3)
  cstop  = substr(cdsseq, length(cdsseq)-2)

  gene = "none"
  locustag = "none"
  product = "none"
  translation = "X"
  incds = 1
  next
}

(incds && /^     [^ ]/) {
  print GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
        nexon, len, (ok ? "Ok" : "Error"), translation, product
  incds = 0
  next
}

/^                     \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^                     \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^                     \/product=/ {
  split($0, a, "=")
  product = a[2]
  gsub("\"", "", product)
  gsub(" ", "_", product)
  next
}

/^                     \/translation=/ {
  split($0, a, "=")
  translation = a[2]
  gsub("\"", "", translation)
  gsub(" ", "", translation)
  next
}

/^\/\// {
  locus = "?"
}

END {
  if (incds) {
    print GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
          nexon, len, (ok ? "Ok" : "Error"), translation, product
  }
}
@ -0,0 +1,29 @@
#
# genbank oneLiner
#

/^            [^ ]/ {
  line = line "" substr($0, 12)
  next
}

/^                     [^\/]/ {
  line = line "" substr($0, 21)
  next
}

/^  ORGANISM/ {
  if (line != "") print line
  line = $0 ";"
  next
}

{
  if (line != "") print line
  line = $0
  next
}

END {
  print line
}
@ -0,0 +1,146 @@
#
# NWS alignment
#

# -----------------------
# make identity substitution matrix for given alphabet
#
function IdentityMatrix(alpha, mat, diag, ndiag, _local_, n, i, j, ci, cj) {
  if (diag == "")  diag  = 0
  if (ndiag == "") ndiag = 1
  alpha = toupper(alpha)
  delete mat
  n = length(alpha)
  for (i = 1 ; i <= n ; i++) {
    ci = substr(alpha, i, 1)
    for (j = 1 ; j <= n ; j++) {
      cj = substr(alpha, j, 1)
      mat[ci][cj] = (i == j) ? diag : ndiag
    }
  }
}

# -----------------------
# internal utility : reverse string
#
  function _Reverse(s, _local_, i, n, rs) {
    rs = "";
    n = length(s);
    for (i = n ; i >= 1 ; i--)
      rs = rs "" substr(s, i, 1)
    return rs;
  }

# -----------------------
# internal utility : alignment traceback for NWS
#
  function _Traceback(dir, s1, s2, n1, n2, align, _local_, i, c1, c2, c3) {
    delete align
    while (dir[n1][n2] != 0) {
      if (dir[n1][n2] == "s") {
        c1 = substr(s1, n1--, 1)
        c2 = substr(s2, n2--, 1)
        c3 = (c1 == c2) ? tolower(c1) : "*"
      }
      else if (dir[n1][n2] == "i") {
        c1 = "-"
        c2 = substr(s2, n2--, 1)
        c3 = "-"
      }
      else {
        c1 = substr(s1, n1--, 1)
        c2 = "-"
        c3 = "-"
      }
      align[1] = align[1] "" c1
      align[2] = align[2] "" c2
      align[3] = align[3] "" c3
    }
    for (i = 1 ; i <= 3 ; i++)
      align[i] = _Reverse(align[i])
  }

# -----------------------
# internal utility : min
#
  function _Min(a, b) {
    return (a < b ? a : b)
  }

# -----------------------
# sequence alignment NWS
#
# todo : check alphabet
#
#   --> i
#  |
#  v d
#
function AlignNWS(s1, s2, subst, indel, local, align,
                     _local_, rev, n1, n2, i, j, c1, c2, m2,
                           ws, wi, wd, w,
                           mat, dir) {
                           
   s1 = toupper(s1) ; s2 = toupper(s2)
   n1 = length(s1)  ; n2 = length(s2)
   if (local && (n2 < n1)) {
     rev = s1 ; s1 = s2 ; s2 = rev
     rev = n1 ; n1 = n2 ; n2 = rev
   }

   if (indel == "") indel = 1

   #
   # nws alignment
   #
   for (i = 0 ; i <= n1 ; i++) {
     c1 = substr(s1, i, 1)
     for (j = 0 ; j <= n2 ; j++) {
       c2 = substr(s2, j, 1)
       if (i && j) {
         ws = mat[i-1][j-1] + subst[c1][c2]
         wd = mat[i-1][j] + indel
         wi = mat[i][j-1] + indel
         w  = _Min(ws, _Min(wi, wd))
         mat[i][j] = w
         dir[i][j] = (w == ws) ? "s" : (w == wi) ? "i" : "d"
       } else if (i) {
         mat[i][j] = mat[i-1][j] + indel
         dir[i][j] = "d"
       } else if (j) {
         mat[i][j] = mat[i][j-1] + (local ? 0 : indel)
         dir[i][j] = "i"
       } else {
         mat[i][j] = 0
         dir[i][j] = 0
       }
     }
     # delete mat[i-1]
   }

   #
   # adjust last line in local mode
   #
   if (local) {
     m2 = n2
     for (j = m2 ; j >= 0 ; j--) {
       if (mat[n1][j] < mat[n1][m2])
         m2 = j
     }
     mat[n1][n2] = mat[n1][m2]
     for (j = m2 + 1 ; j <= n2 ; j++) {
       dir[n1][j] = "i"
     }
   }

   #
   # traceback
   #
   _Traceback(dir, s1, s2, n1, n2, align)
   
   if (rev) {
     rev = align[1] ; align[1] = align[2]; align[2] = rev
   }
   
   return mat[n1][n2]
}
@ -0,0 +1,199 @@
#
# utilities library
#

END {
  if (_ForceExit_) exit(_ForceExit_)
}

function Notify(key, msg) {
    print "# " key " " msg >> "/dev/stderr"
}

function Info(msg) {
    Notify("info", msg)
    return 1
}

function Warning(msg) {
    Notify("warning", msg)
    return 0
}

function Exit(status) {
    exit(_ForceExit_ = status)
}

function Error(msg, status) {
    Notify("error", msg)
    Exit(status ? status : 1)
}

function Assert(condition, msg, status) {
    if (! condition) {
      msg = FILENAME ":" FNR ": " msg
      return status ? Error(msg, status) : Warning(msg)
    }
}

function Min(a, b) {
  return (a < b ? a : b)
}

function Max(a, b) {
  return (a > b ? a : b)
}

function Trim(s, regexp) {
  if (regexp == 0) regexp = "[ \t]+"
  gsub("^" regexp "|" regexp "$", "", s)
  return s
}

function TestPath(path, test, _local_, stat) {
  if (test == 0) test = "-f"
  if (Trim(path) == "")
    return 0 # because of a bug in 'test'

  gsub(" ", "\\ ", path) # protect spaces
  stat = system("test " test " " path)
  return stat ? 0 : 1
}

# ---

function Strip(s) {
  gsub("complement|join|order|\\)|\\(|<|>| ", "", s)
  return s
}

function GetLoc(s, loc, _local_, a, tmp) {
  delete loc
  loc[1] = loc[2] = 0
  split(s, loc, "\\.\\.")
  if (loc[1] > loc[2]) {
    tmp = loc[1]
    loc[1] = loc[2]
    loc[2] = tmp
  }
}

function ParseLocation(s, locs, _local_, i, na, a, loc) {
  delete locs
  s = Strip(s)
  na = split(s, a, ",")
  for (i = 1 ; i <= na ; i++) {
    GetLoc(a[i], loc)
    locs[i][1] = loc[1]
    locs[i][2] = loc[2]
  }
  return na
}

function SpanLocation(s, sloc, _local_, i, na, locs) {
  delete sloc
  na = ParseLocation(s, locs)
  sloc[1] = (na > 0 ? locs[1][1] : 0)
  sloc[2] = (na > 0 ? locs[1][2] : 0)
  for (i = 2 ; i <= na ; i++) {
    sloc[1] = Min(sloc[1], locs[i][1])
    sloc[2] = Max(sloc[2], locs[i][2])
  }
}

function LenLocation(s, _local_, i, na, locs, len) {
  len = 0
  na = ParseLocation(s, locs)
  for (i = 1 ; i <= na ; i++) {
    len += locs[i][2] - locs[i][1] + 1
  }
  return len
}

function Nexons(s, _local_, a) {
  s = Strip(s)
  return split(s, a, ",")
}

function Reverse(s, _local_, i, n, rs) {
  rs = "";
  n = length(s);
  for (i = n ; i >= 1 ; i--)
    rs = rs "" substr(s, i, 1)
  return rs;
}

function RevComplement(seq, _local_, n, i, c, rs) {
  n = length(seq)
  rs = ""
  for (i = n ; i >= 1 ; i--) {
    c = substr(seq, i, 1)
    rs = rs "" (_DnaC[c] ? _DnaC[c] : "X")
  }
  return rs
}

function SubSeq(seq, from, to, revstrand) {
  seq = substr(seq, from, to-from+1)
  if (revstrand) seq = RevComplement(seq)
  return seq
}

function SeqLocation(seq, s, revstrand, _local_, sloc, i, na, locs) {
  sloc = ""
  na = ParseLocation(s, locs)
  for (i = 1 ; i <= na ; i++) {
    sloc = sloc "" SubSeq(seq, locs[i][1], locs[i][2], 0)
  }
  return (revstrand ? RevComplement(sloc) : sloc)
}

function GeneFamily(s) {
  s = tolower(s)
  gsub("(_|-)[0-9]+$", "", s)
  gsub("(_|-)(a|b|c|i)+$", "", s)
  gsub("'+$", "", s)
  gsub("/.+$", "", s)
  if (match(s, "[^a-z,0-9]")) s = "none"
  return s
}

# ---

function ReadFasta(file, _local_, seq, context) {
  context = $0
  seq = ""
  getline < file
  while(getline < file) seq = seq "" $0
  $0 = context
  return seq    
}

function PrintFasta(seq, name, CharPerLine, _local_, i, len, rest, chunk) {
  if (CharPerLine <= 0) CharPerLine = 60
  print ">" name
  len = length(seq)
  rest = len % CharPerLine
  chunk  = (len - rest) / CharPerLine
  for (i = 0 ; i < chunk ; i++) 
    print substr(seq, (i*CharPerLine) + 1, CharPerLine)
  if (rest != 0)
    print substr(seq, (chunk*CharPerLine) + 1, rest)
}


#
# constants
#

BEGIN {
  # complementary of _IupacDna
  _DnaC["A"] = "T"; _DnaC["C"] = "G"; _DnaC["G"] = "C"; _DnaC["T"] = "A"
  _DnaC["R"] = "Y"; _DnaC["Y"] = "R"; _DnaC["M"] = "K"; _DnaC["K"] = "M"
  _DnaC["W"] = "W"; _DnaC["S"] = "S"; _DnaC["B"] = "V"; _DnaC["V"] = "B"
  _DnaC["D"] = "H"; _DnaC["H"] = "D"; _DnaC["N"] = "N"; _DnaC["X"] = "X"
  _DnaC["a"] = "t"; _DnaC["c"] = "g"; _DnaC["g"] = "c"; _DnaC["t"] = "a"
  _DnaC["r"] = "y"; _DnaC["y"] = "r"; _DnaC["m"] = "k"; _DnaC["k"] = "m"
  _DnaC["w"] = "w"; _DnaC["s"] = "s"; _DnaC["b"] = "v"; _DnaC["v"] = "b"
  _DnaC["d"] = "h"; _DnaC["h"] = "d"; _DnaC["n"] = "n"; _DnaC["x"] = "x"
}