From 37e1ecc9fd356487344c9b998e6cdf33ef0095e4 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 7 Oct 2015 11:32:17 -0300 Subject: [PATCH] Add scripts to generate reference DBs for LSC and SSC Former-commit-id: 77fd69c9687bfc7cfd3305299c7c81d2a9eddada Former-commit-id: 4d09933673c30d6a9b2b9cc985b33ef23d7a99f4 --- .../normalize/{ => bin}/normalize_plastid.sh | 0 detectors/normalize/tools/buildSCDB.sh | 174 ++++++++++++++++++ detectors/normalize/tools/cc.py | 48 +++++ detectors/normalize/tools/coorienteSC.sh | 83 +++++++++ detectors/normalize/tools/extract_refLSC.sh | 50 +++++ detectors/normalize/tools/extract_refSSC.sh | 50 +++++ .../normalize/tools/selectViridiplantae.sh | 8 + scripts/bash_init.sh | 117 ++++++++++++ 8 files changed, 530 insertions(+) rename detectors/normalize/{ => bin}/normalize_plastid.sh (100%) create mode 100755 detectors/normalize/tools/buildSCDB.sh create mode 100755 detectors/normalize/tools/cc.py create mode 100755 detectors/normalize/tools/coorienteSC.sh create mode 100755 detectors/normalize/tools/extract_refLSC.sh create mode 100755 detectors/normalize/tools/extract_refSSC.sh create mode 100755 detectors/normalize/tools/selectViridiplantae.sh create mode 100644 scripts/bash_init.sh diff --git a/detectors/normalize/normalize_plastid.sh b/detectors/normalize/bin/normalize_plastid.sh similarity index 100% rename from detectors/normalize/normalize_plastid.sh rename to detectors/normalize/bin/normalize_plastid.sh diff --git a/detectors/normalize/tools/buildSCDB.sh b/detectors/normalize/tools/buildSCDB.sh new file mode 100755 index 0000000..0c66530 --- /dev/null +++ b/detectors/normalize/tools/buildSCDB.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# +# BUILD REFERENCE SINGLE COPY LIBRARIES +# +#======================================================================================== + +# -- CAUTION -- Works as long than the script +# is not called through a symlink +SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" +source "${SCRIPT_DIR}/../../../scripts/bash_init.sh" + +pushTmpDir ORG.buildSCDB + + openLogFile "${IR_DATA_DIR}/SingleCopyDB.log" + + loginfo "Selecting Viridiplantae genebank entries..." + VIRIDIPLANTAE=$(${PROG_DIR}/selectViridiplantae.sh $*) + loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected" + loginfo "Done" + + + # + # Deals with Long Single Copies (LSC) + # + + loginfo "Extracting Long Single Copies (LSC)..." + ${PROG_DIR}/extract_refLSC.sh ${VIRIDIPLANTAE} > LSC.fasta + loginfo " --> $(fastaCount LSC.fasta) retreived sequences" + loginfo "Done" + + + + loginfo "Building LSC coorientation graph..." + ${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf + ${PROG_DIR}/cc.py LSC.tgf > LSC.cc + loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants" + loginfo "Done" + + + loginfo "Indexing LCS..." + fastaindex -f LSC.fasta -i LSC.index + loginfo "Done" + + + + loginfo "Extracting main connected components for LCS..." + rm -f LSC.direct.fasta + touch LSC.direct.fasta + for id in `awk '($1==0) {print $2}' LSC.cc`; do + fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.direct.fasta + done + loginfo " --> $(fastaCount LSC.direct.fasta) sequences" + loginfo "Done" + + + + loginfo "Extracting second connected components for LCS..." + rm -f LSC.reverse.fasta + touch LSC.reverse.fasta + for id in `awk '($1==1) {print $2}' LSC.cc`; do + fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.reverse.fasta + done + loginfo " --> $(fastaCount LSC.reverse.fasta) sequences" + loginfo "Done" + + + + loginfo "merging both connected components for LCS..." + fastarevcomp LSC.reverse.fasta >> LSC.direct.fasta + loginfo " --> $(fastaCount LSC.direct.fasta) sequences in total" + loginfo "Done" + + + + loginfo "Checking LCS homogeneity..." + ${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf + ${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc + NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l) + if (( $NCC == 1 )); then + loginfo " --> $NCC connected componants" + else + logwarning " --> $NCC connected componants" + fi + loginfo "Done" + + + + loginfo "Installing LCS reference databases..." + cp LSC.direct.fasta "${IR_DATA_DIR}/LSC_RefDB.fasta" + cp LSC_RefDB.tgf "${IR_DATA_DIR}/LSC_RefDB.tgf" + + makeblastdb -in "${IR_DATA_DIR}/LSC_RefDB.fasta" \ + -dbtype nucl \ + -out "${IR_DATA_DIR}/LSC_RefDB" >& /dev/null + loginfo "Done" + + # + # Deals with Short Single Copies (SSC) + # + + loginfo "Extracting Short Single Copies (SSC)..." + ${PROG_DIR}/extract_refSSC.sh ${VIRIDIPLANTAE} > SSC.fasta + loginfo " --> $(fastaCount SSC.fasta) retreived sequences" + loginfo "Done" + + + + loginfo "Building SSC coorientation graph..." + ${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf + ${PROG_DIR}/cc.py SSC.tgf > SSC.cc + loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants" + loginfo "Done" + + + + loginfo "Indexing SSC..." + fastaindex -f SSC.fasta -i SSC.index + loginfo "Done" + + + + loginfo "Extracting main connected components for SSC..." + rm -f SSC.direct.fasta + touch SSC.direct.fasta + for id in `awk '($1==0) {print $2}' SSC.cc`; do + fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.direct.fasta + done + loginfo " --> $(fastaCount SSC.direct.fasta) sequences" + loginfo "Done" + + + + loginfo "Extracting second connected components for SSC..." + rm -f SSC.reverse.fasta + touch SSC.reverse.fasta + for id in `awk '($1==1) {print $2}' SSC.cc`; do + fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.reverse.fasta + done + loginfo " --> $(fastaCount SSC.reverse.fasta) sequences" + loginfo "Done" + + + + loginfo "merging both connected components for SSC..." + fastarevcomp SSC.reverse.fasta >> SSC.direct.fasta + loginfo " --> $(fastaCount SSC.direct.fasta) sequences in total" + loginfo "Done" + + + + loginfo "Checking SSC homogeneity..." + ${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf + ${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc + NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l) + if (( $NCC == 1 )); then + loginfo " --> $NCC connected componants" + else + logwarning " --> $NCC connected componants" + fi + loginfo "Done" + + + + loginfo "Installing SSC reference databases..." + cp SSC.direct.fasta "${IR_DATA_DIR}/SSC_RefDB.fasta" + cp SSC_RefDB.tgf "${IR_DATA_DIR}/SSC_RefDB.tgf" + + makeblastdb -in "${IR_DATA_DIR}/SSC_RefDB.fasta" \ + -dbtype nucl \ + -out "${IR_DATA_DIR}/SSC_RefDB" >& /dev/null + loginfo "Done" + +popTmpDir + diff --git a/detectors/normalize/tools/cc.py b/detectors/normalize/tools/cc.py new file mode 100755 index 0000000..e7f6c84 --- /dev/null +++ b/detectors/normalize/tools/cc.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +import sys + +data = open(sys.argv[1]) + +ccs = [] +for line in data: + + parts = line.strip().split() + if len(parts) >= 2: + a = parts[0] + b = parts[1] + else: + continue + + newcc=set([a,b]) + + keep=set() + found=set() + for i in range(len(ccs)): + if len(found) < 2: + cc=ccs[i] + if a not in found and a in cc: + found.add(a) + keep.add(i) + if b not in found and b in cc: + found.add(b) + keep.add(i) + + for i in keep: + newcc |= ccs[i] + + newccs=[newcc] + + for i in range(len(ccs)): + if i not in keep: + newccs.append(ccs[i]) + + ccs=newccs + +ccs.sort(key=len, reverse=True) + +for i in range(len(ccs)): + cc=ccs[i] + for l in cc: + sys.stdout.write("%d %s\n" % (i,l)) + \ No newline at end of file diff --git a/detectors/normalize/tools/coorienteSC.sh b/detectors/normalize/tools/coorienteSC.sh new file mode 100755 index 0000000..4ad3a0c --- /dev/null +++ b/detectors/normalize/tools/coorienteSC.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# +# BUILD THE COORIENTATION GRAPH +# +# From a set of sequences, this commande build a graph where: +# - vertices are sequence id +# - relation is : s oriented in the same way +# +# The same ortientation is defined from the similiarity measured by blast +# The result graph is formated following the tgf format (trivial graph format) +# NODE1 NODE2 EDGE_LABEL +# +# Run the command +# +# coorienteSC.sh [LOGFILE] +# +# : the fasta file containing sequences to analyse +# : the cumulative minimum length strand have to share to conclude +# : an optional logfile name +# +#======================================================================================== + + +# -- CAUTION -- Works as long than the script +# is not called through a symlink +SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" +source "${SCRIPT_DIR}/../../../scripts/bash_init.sh" + + +pushTmpDir ORG.coorienteSC + + if [[ ! -z $3 ]]; then + openLogFile $3 + fi + + DATA="${CALL_DIR}/${1}" + MINLENGTH=$2 + BLASTDB=${1/.fasta$/} + + loginfo "Build temporary blast DB..." + makeblastdb -in "${DATA}" -dbtype nucl -out "${BLASTDB}" >& /dev/null + loginfo "Done" + + loginfo "Running Blast..." + blastn -db "${BLASTDB}" -query "${DATA}" -outfmt 6 | \ + awk ' \ + ($4 > 1000) && ($3 > 70) \ + ($1==QUERY) && \ + ($2==SUBJECT) && \ + (($7<$8) && ($9<$10)) || (($7>$8) && ($9>$10)) {LSAME+= ($3/100.*$4)} \ + ($4 > 1000) && ($3 > 70) \ + ($1==QUERY) && \ + ($2==SUBJECT) && \ + (($7<$8) && ($9>$10)) || (($7>$8) && ($9<$10)) {LDIFF+= ($3/100.*$4)} \ + (QUERY!="") && \ + (($1!=QUERY) || ($2!=SUBJECT)) {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)} \ + (($1!=QUERY) || ($2!=SUBJECT)) {QUERY=$1;SUBJECT=$2; \ + LSAME=0; \ + LDIFF=0; \ + if (($4 > 1000) && ($3 > 70)) { \ + if ((($7<$8) && ($9<$10)) || \ + (($7>$8) && ($9>$10))) {\ + LSAME= ($3/100.*$4) } \ + else { \ + LDIFF= ($3/100.*$4) }} \ + } \ + END {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)}' | \ + awk -v minlength="${MINLENGTH}" \ + ' (($3>minlength) || \ + ($4 > minlength)) && \ + ($3/($4+1) > 2) && \ + ($1!=$2) { if ($1 > $2) \ + { print $2,$1,$5 } \ + else \ + {print $1,$2,$5}}' | \ + sort | \ + uniq -c | \ + awk '($1==2) {print $2,$3,$4}' + loginfo "Done" + +popTmpDir + + \ No newline at end of file diff --git a/detectors/normalize/tools/extract_refLSC.sh b/detectors/normalize/tools/extract_refLSC.sh new file mode 100755 index 0000000..55bbcab --- /dev/null +++ b/detectors/normalize/tools/extract_refLSC.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# + + + awk 'function printfasta(seq) { \ + seqlen=length(seq); \ + for (i=1; i <= seqlen; i+=60) \ + print substr(seq,i,60); \ + } \ + function comp(seq) { \ + "echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \ + close("echo "seq" | tr acgtACGT tgcaTGCA "); \ + return res; \ + } \ + function rev(seq) { \ + "echo "seq" | rev " | getline res; \ + close("echo "seq" | rev "); \ + return res; \ + } \ + function revcomp(seq) { \ + res=rev(comp(seq)); \ + return res; \ + } \ + \ + /^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \ + /^ misc_feature/ {LOCUS=$2; STRAND=1} \ + /^ misc_feature/ && /complement/ {STRAND=0; \ + sub("complement\\(","",LOCUS); \ + sub("\\)","",LOCUS); \ + } \ + /large single copy/ && /LSC/ {split(LOCUS,POS,"."); \ + FROM=POS[1]; \ + TO=POS[3]; \ + LENGTH=TO-FROM+1 \ + } \ + /^ORIGIN/ {seqon=1} \ + /^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \ + gsub("[^acgt]","n",seq);\ + sequence=sequence seq \ + } \ + /^\/\// && FROM && (LENGTH > 60000) && (LENGTH < 100000) \ + {print ">LSC_"AC" Strand="STRAND";", \ + "cut="FROM".."TO";", \ + "seq_length="LENGTH";"; \ + SS=substr(sequence,FROM,LENGTH); \ + if (! STRAND) \ + SS=revcomp(SS); \ + printfasta(SS); \ + } \ + ' $* diff --git a/detectors/normalize/tools/extract_refSSC.sh b/detectors/normalize/tools/extract_refSSC.sh new file mode 100755 index 0000000..36660c6 --- /dev/null +++ b/detectors/normalize/tools/extract_refSSC.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# + + + awk 'function printfasta(seq) { \ + seqlen=length(seq); \ + for (i=1; i <= seqlen; i+=60) \ + print substr(seq,i,60); \ + } \ + function comp(seq) { \ + "echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \ + close("echo "seq" | tr acgtACGT tgcaTGCA "); \ + return res; \ + } \ + function rev(seq) { \ + "echo "seq" | rev " | getline res; \ + close("echo "seq" | rev "); \ + return res; \ + } \ + function revcomp(seq) { \ + res=rev(comp(seq)); \ + return res; \ + } \ + \ + /^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \ + /^ misc_feature/ {LOCUS=$2; STRAND=1} \ + /^ misc_feature/ && /complement/ {STRAND=0; \ + sub("complement\\(","",LOCUS); \ + sub("\\)","",LOCUS); \ + } \ + /small single copy/ && /SSC/ {split(LOCUS,POS,"."); \ + FROM=POS[1]; \ + TO=POS[3]; \ + LENGTH=TO-FROM+1 \ + } \ + /^ORIGIN/ {seqon=1} \ + /^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \ + gsub("[^acgt]","n",seq);\ + sequence=sequence seq \ + } \ + /^\/\// && FROM && (LENGTH > 15000) && (LENGTH < 20000) \ + {print ">SSC_"AC" Strand="STRAND";", \ + "cut="FROM".."TO";", \ + "seq_length="LENGTH";"; \ + SS=substr(sequence,FROM,LENGTH); \ + if (! STRAND) \ + SS=revcomp(SS); \ + printfasta(SS); \ + } \ + ' $* diff --git a/detectors/normalize/tools/selectViridiplantae.sh b/detectors/normalize/tools/selectViridiplantae.sh new file mode 100755 index 0000000..25e644e --- /dev/null +++ b/detectors/normalize/tools/selectViridiplantae.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +grep -A 1 ' ORGANISM' $* | \ + grep -B 1 Viridiplantae | \ + awk '{print $1}' | \ + grep '\.gbk' | \ + sed -E 's/(^.*\.gbk).$/\1/' | \ + uniq \ No newline at end of file diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh new file mode 100644 index 0000000..39d2bc6 --- /dev/null +++ b/scripts/bash_init.sh @@ -0,0 +1,117 @@ +# +# Bash file to be sourced at the begining of each bash script +# for setting up basic variables and functions +# + +######################## +# +# General usage functions +# +# + +function getAbsolutePath { + [[ -d $1 ]] && { cd "$1"; echo "$(pwd -P)"; } || + { cd "$(dirname "$1")"; echo "$(pwd -P)/$(basename "$1")"; } +} + +# Manage temp directory + +function pushTmpDir { + TMP_DIR=$(mktemp -d -t "$1_proc_$$_") + pushd $TMP_DIR >& /dev/null +} + +function popTmpDir { + popd >& /dev/null + rm -rf $TMP_DIR >& /dev/null +} + +# Logging functions + + +function errcho { + >&2 echo $* +} + +function openLogFile { + LOGFILE=$1 + touch ${LOGFILE} +} + + +function loginfo { + errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" + if [[ ! -z ${LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE} + fi +} + +function logerror { + errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" + if [[ ! -z ${LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE} + fi +} + +function logwarning { + errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" + if [[ ! -z ${LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE} + fi +} + +# Sequence related functions + +function fastaCount { + grep '^>' $1 | wc -l +} + +# +# +######################## + + + +######################## +# +# Local variable definitions +# +# + +# The absolute path to the ORG.Annote home direcotory +ORG_HOME=`getAbsolutePath $(dirname ${BASH_SOURCE[0]})/..` + + +ORG_PORTNAME=`${ORG_HOME}/config/guess_port` # The architecture running the ORG.Annnote instance + +PROG_DIR="$(getAbsolutePath $(dirname $0))" # Directory containing the main script file + +DATA_DIR="${ORG_HOME}/data" # Directory containing reference data for the annotation + +CALL_DIR="$(getAbsolutePath $(pwd))" # Directory from where the main script is called + + +IR_DATA_DIR="${DATA_DIR}/ir" # Directory containing data related to the + # Inverted repeat strucuture + + +# +# +######################## + + + +######################## +# +# Altering the environment +# +# + +# We alter the path to include the bin dir corresponding to the port +PATH="${ORG_HOME}/ports/${ORG_PORTNAME}/bin:${PATH}" +export PATH + +# Force to basic international setting for a correction behaviour of AWK on mac with float +export LANG=C +export LC_ALL=C +