annotate/detectors/cds/tools/lib/embl.cds_long.awk

#
# get cds features from embl (long version)
#
# -v FASTA

# @include lib.embl.awk


BEGIN {
  print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"

  if (HEADONLY != "") exit(0)

  if (MAXSPAN == "") MAXSPAN = 10000

  if (FASTA == "") Error("No FASTA file specified", 1)
  
  if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)

  Seq = tolower(ReadFasta(FASTA))
}

/^ID / {
  locus = $2
  gsub(";", "", locus)
  incds = 0
  next
}

/^FT   CDS/ {
  revstrand = match($3, "^complement")
  s = substr($0, 22)
  gsub("^complement", "", s)
  ok = ! match(s, "complement|order")
  nexon = Nexons(s)
  SpanLocation(s, sloc)
  spanlen = sloc[2] - sloc[1] + 1
  len = LenLocation(s)
  ok = ok && (len < MAXSPAN)
  cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
  cstart = substr(cdsseq, 1,3)
  cstop  = substr(cdsseq, length(cdsseq)-2)

  gene = "none"
  locustag = "none"
  product = "none"
  translation = "X"
  incds = 1
  next
}

(incds && /^FT   [^ ]/) {
  print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
        nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
  incds = 0
  next
}

/^FT                   \/gene=/ {
  split($0, a, "=")
  gene = a[2]
  gsub("^[^a-z,A-Z]+", "", gene)
  gsub("\"", "", gene)
  gsub(" ", "_", gene)
  next
}

/^FT                   \/locus_tag=/ {
  split($0, a, "=")
  locustag = a[2]
  gsub("\"", "", locustag)
  gsub(" ", "_", locustag)
  next
}

/^FT                   \/product=/ {
  split($0, a, "=")
  product = a[2]
  gsub("\"", "", product)
  gsub(" ", "_", product)
  next
}

/^FT                   \/translation=/ {
  split($0, a, "=")
  translation = a[2]
  gsub("\"", "", translation)
  gsub(" ", "", translation)
  next
}

END {
  if (incds) {
    print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"), 
          nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
  }
}
cds/tools/chlorodb added Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880 2015-11-13 17:41:18 +01:00			`#`
			`# get cds features from embl (long version)`
			`#`
			`# -v FASTA`

			`# @include lib.embl.awk`


			`BEGIN {`
			`print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"`

			`if (HEADONLY != "") exit(0)`

			`if (MAXSPAN == "") MAXSPAN = 10000`

			`if (FASTA == "") Error("No FASTA file specified", 1)`

			`if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)`

			`Seq = tolower(ReadFasta(FASTA))`
			`}`

			`/^ID / {`
			`locus = $2`
			`gsub(";", "", locus)`
			`incds = 0`
			`next`
			`}`

			`/^FT CDS/ {`
			`revstrand = match($3, "^complement")`
			`s = substr($0, 22)`
			`gsub("^complement", "", s)`
			`ok = ! match(s, "complement\|order")`
			`nexon = Nexons(s)`
			`SpanLocation(s, sloc)`
			`spanlen = sloc[2] - sloc[1] + 1`
			`len = LenLocation(s)`
			`ok = ok && (len < MAXSPAN)`
			`cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"`
			`cstart = substr(cdsseq, 1,3)`
			`cstop = substr(cdsseq, length(cdsseq)-2)`

			`gene = "none"`
			`locustag = "none"`
			`product = "none"`
			`translation = "X"`
			`incds = 1`
			`next`
			`}`

			`(incds && /^FT [^ ]/) {`
			`print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),`
			`nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product`
			`incds = 0`
			`next`
			`}`

			`/^FT \/gene=/ {`
			`split($0, a, "=")`
			`gene = a[2]`
			`gsub("^[^a-z,A-Z]+", "", gene)`
			`gsub("\"", "", gene)`
			`gsub(" ", "_", gene)`
			`next`
			`}`

			`/^FT \/locus_tag=/ {`
			`split($0, a, "=")`
			`locustag = a[2]`
			`gsub("\"", "", locustag)`
			`gsub(" ", "_", locustag)`
			`next`
			`}`

			`/^FT \/product=/ {`
			`split($0, a, "=")`
			`product = a[2]`
			`gsub("\"", "", product)`
			`gsub(" ", "_", product)`
			`next`
			`}`

			`/^FT \/translation=/ {`
			`split($0, a, "=")`
			`translation = a[2]`
			`gsub("\"", "", translation)`
			`gsub(" ", "", translation)`
			`next`
			`}`

			`END {`
			`if (incds) {`
			`print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),`
			`nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product`
			`}`
			`}`