From 37e1ecc9fd356487344c9b998e6cdf33ef0095e4 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric.coissac@metabarcoding.org>
Date: Wed, 7 Oct 2015 11:32:17 -0300
Subject: [PATCH] Add scripts to generate reference DBs for LSC and SSC

Former-commit-id: 77fd69c9687bfc7cfd3305299c7c81d2a9eddada
Former-commit-id: 4d09933673c30d6a9b2b9cc985b33ef23d7a99f4
---
 .../normalize/{ => bin}/normalize_plastid.sh  |   0
 detectors/normalize/tools/buildSCDB.sh        | 174 ++++++++++++++++++
 detectors/normalize/tools/cc.py               |  48 +++++
 detectors/normalize/tools/coorienteSC.sh      |  83 +++++++++
 detectors/normalize/tools/extract_refLSC.sh   |  50 +++++
 detectors/normalize/tools/extract_refSSC.sh   |  50 +++++
 .../normalize/tools/selectViridiplantae.sh    |   8 +
 scripts/bash_init.sh                          | 117 ++++++++++++
 8 files changed, 530 insertions(+)
 rename detectors/normalize/{ => bin}/normalize_plastid.sh (100%)
 create mode 100755 detectors/normalize/tools/buildSCDB.sh
 create mode 100755 detectors/normalize/tools/cc.py
 create mode 100755 detectors/normalize/tools/coorienteSC.sh
 create mode 100755 detectors/normalize/tools/extract_refLSC.sh
 create mode 100755 detectors/normalize/tools/extract_refSSC.sh
 create mode 100755 detectors/normalize/tools/selectViridiplantae.sh
 create mode 100644 scripts/bash_init.sh

diff --git a/detectors/normalize/normalize_plastid.sh b/detectors/normalize/bin/normalize_plastid.sh
similarity index 100%
rename from detectors/normalize/normalize_plastid.sh
rename to detectors/normalize/bin/normalize_plastid.sh
diff --git a/detectors/normalize/tools/buildSCDB.sh b/detectors/normalize/tools/buildSCDB.sh
new file mode 100755
index 0000000..0c66530
--- /dev/null
+++ b/detectors/normalize/tools/buildSCDB.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+#
+#                           BUILD REFERENCE SINGLE COPY LIBRARIES
+#
+#========================================================================================
+
+# -- CAUTION -- Works as long than the script 
+#               is not called through a symlink
+SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
+source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
+
+pushTmpDir ORG.buildSCDB
+	
+	openLogFile "${IR_DATA_DIR}/SingleCopyDB.log"
+	
+	loginfo "Selecting Viridiplantae genebank entries..."
+		VIRIDIPLANTAE=$(${PROG_DIR}/selectViridiplantae.sh $*) 
+		loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
+	loginfo "Done"
+	
+	
+	#
+	# Deals with Long Single Copies (LSC)
+	#
+	
+	loginfo "Extracting Long Single Copies (LSC)..."
+		${PROG_DIR}/extract_refLSC.sh ${VIRIDIPLANTAE} > LSC.fasta
+		loginfo " --> $(fastaCount LSC.fasta) retreived sequences"
+	loginfo "Done"
+	
+	
+	
+	loginfo "Building LSC coorientation graph..."
+		${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf
+		${PROG_DIR}/cc.py LSC.tgf > LSC.cc
+		loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
+	loginfo "Done"
+
+
+	loginfo "Indexing LCS..."
+		fastaindex -f LSC.fasta -i LSC.index
+	loginfo "Done"
+	
+	
+	
+	loginfo "Extracting main connected components for LCS..."
+		rm -f LSC.direct.fasta
+		touch LSC.direct.fasta
+		for id in `awk '($1==0) {print $2}' LSC.cc`; do
+			fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.direct.fasta
+		done
+		loginfo " --> $(fastaCount LSC.direct.fasta) sequences"
+	loginfo "Done"
+	
+	
+	
+	loginfo "Extracting second connected components for LCS..."
+		rm -f LSC.reverse.fasta
+		touch LSC.reverse.fasta
+		for id in `awk '($1==1) {print $2}' LSC.cc`; do
+			fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.reverse.fasta
+		done		
+		loginfo " --> $(fastaCount LSC.reverse.fasta) sequences"
+	loginfo "Done"
+	
+	
+	
+	loginfo "merging both connected components for LCS..."
+		fastarevcomp LSC.reverse.fasta >> LSC.direct.fasta
+		loginfo " --> $(fastaCount LSC.direct.fasta) sequences in total"
+	loginfo "Done"
+
+
+
+	loginfo "Checking LCS homogeneity..."
+		${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf		
+		${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
+		NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
+		if (( $NCC == 1 )); then
+			loginfo " --> $NCC connected componants"
+		else
+			logwarning " --> $NCC connected componants"
+		fi
+	loginfo "Done"
+	
+	
+	
+	loginfo "Installing LCS reference databases..."
+		cp LSC.direct.fasta "${IR_DATA_DIR}/LSC_RefDB.fasta"
+		cp LSC_RefDB.tgf    "${IR_DATA_DIR}/LSC_RefDB.tgf"
+		
+		makeblastdb -in "${IR_DATA_DIR}/LSC_RefDB.fasta" \
+		            -dbtype nucl \
+		            -out "${IR_DATA_DIR}/LSC_RefDB" >& /dev/null
+	loginfo "Done"
+		
+	#
+	# Deals with Short Single Copies (SSC)
+	#
+	
+	loginfo "Extracting Short Single Copies (SSC)..."
+		${PROG_DIR}/extract_refSSC.sh ${VIRIDIPLANTAE} > SSC.fasta
+		loginfo " --> $(fastaCount SSC.fasta) retreived sequences"
+	loginfo "Done"
+
+
+	
+	loginfo "Building SSC coorientation graph..."
+		${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf
+		${PROG_DIR}/cc.py SSC.tgf > SSC.cc
+		loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
+	loginfo "Done"
+
+
+
+	loginfo "Indexing SSC..."
+		fastaindex -f SSC.fasta -i SSC.index
+	loginfo "Done"
+	
+	
+	
+	loginfo "Extracting main connected components for SSC..."
+		rm -f SSC.direct.fasta
+		touch SSC.direct.fasta
+		for id in `awk '($1==0) {print $2}' SSC.cc`; do
+			fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.direct.fasta
+		done
+		loginfo " --> $(fastaCount SSC.direct.fasta) sequences"
+	loginfo "Done"
+	
+	
+	
+	loginfo "Extracting second connected components for SSC..."
+		rm -f SSC.reverse.fasta
+		touch SSC.reverse.fasta
+		for id in `awk '($1==1) {print $2}' SSC.cc`; do
+			fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.reverse.fasta
+		done		
+		loginfo " --> $(fastaCount SSC.reverse.fasta) sequences"
+	loginfo "Done"
+	
+	
+	
+	loginfo "merging both connected components for SSC..."
+		fastarevcomp SSC.reverse.fasta >> SSC.direct.fasta
+		loginfo " --> $(fastaCount SSC.direct.fasta) sequences in total"
+	loginfo "Done"
+
+
+
+	loginfo "Checking SSC homogeneity..."
+		${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf		
+		${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
+		NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
+		if (( $NCC == 1 )); then
+			loginfo " --> $NCC connected componants"
+		else
+			logwarning " --> $NCC connected componants"
+		fi
+	loginfo "Done"
+	
+	
+	
+	loginfo "Installing SSC reference databases..."
+		cp SSC.direct.fasta "${IR_DATA_DIR}/SSC_RefDB.fasta"
+		cp SSC_RefDB.tgf    "${IR_DATA_DIR}/SSC_RefDB.tgf"
+		
+		makeblastdb -in "${IR_DATA_DIR}/SSC_RefDB.fasta" \
+		            -dbtype nucl \
+		            -out "${IR_DATA_DIR}/SSC_RefDB" >& /dev/null
+	loginfo "Done"
+		
+popTmpDir
+
diff --git a/detectors/normalize/tools/cc.py b/detectors/normalize/tools/cc.py
new file mode 100755
index 0000000..e7f6c84
--- /dev/null
+++ b/detectors/normalize/tools/cc.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python 
+
+import sys
+
+data = open(sys.argv[1])
+
+ccs = []
+for line in data:
+    
+    parts = line.strip().split()
+    if len(parts) >= 2:
+        a = parts[0]
+        b = parts[1]
+    else:
+        continue
+    
+    newcc=set([a,b])
+    
+    keep=set()
+    found=set()
+    for i in range(len(ccs)):
+        if len(found) < 2:
+            cc=ccs[i]
+            if a not in found and a in cc:
+                found.add(a)
+                keep.add(i)
+            if b not in found and b in cc:
+                found.add(b)
+                keep.add(i)
+                
+    for i in keep:
+        newcc |= ccs[i]
+        
+    newccs=[newcc]
+                    
+    for i in range(len(ccs)):
+        if i not in keep:
+            newccs.append(ccs[i])
+            
+    ccs=newccs
+    
+ccs.sort(key=len, reverse=True)
+
+for i in range(len(ccs)):
+    cc=ccs[i]
+    for l in cc:
+        sys.stdout.write("%d %s\n" % (i,l))
+    
\ No newline at end of file
diff --git a/detectors/normalize/tools/coorienteSC.sh b/detectors/normalize/tools/coorienteSC.sh
new file mode 100755
index 0000000..4ad3a0c
--- /dev/null
+++ b/detectors/normalize/tools/coorienteSC.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#
+#                           BUILD THE COORIENTATION GRAPH
+#
+# From a set of sequences, this commande build a graph where:
+#             - vertices are sequence id
+#             - relation is : s oriented in the same way
+#
+# The same ortientation is defined from the similiarity measured by blast
+# The result graph is formated following the tgf format (trivial graph format)
+#                 NODE1 NODE2 EDGE_LABEL
+#
+# Run the command
+#
+#   coorienteSC.sh <FASTA_FILE> <MINLENGTH> [LOGFILE]
+#
+#   <FASTA_FILE> : the fasta file containing sequences to analyse
+#   <MINLENGTH>  : the cumulative minimum length strand have to share to conclude
+#   <LOGFILE>    : an optional logfile name
+#
+#========================================================================================
+
+
+# -- CAUTION -- Works as long than the script 
+#               is not called through a symlink
+SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
+source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
+
+
+pushTmpDir  ORG.coorienteSC
+
+	if [[ ! -z $3 ]]; then
+		openLogFile $3
+	fi
+
+	DATA="${CALL_DIR}/${1}"
+	MINLENGTH=$2
+	BLASTDB=${1/.fasta$/}
+
+	loginfo "Build temporary blast DB..."
+	makeblastdb -in "${DATA}" -dbtype nucl -out "${BLASTDB}" >& /dev/null
+	loginfo "Done"
+	
+	loginfo "Running Blast..."
+		blastn -db "${BLASTDB}" -query "${DATA}" -outfmt 6 | \
+	 	awk ' \
+	 	      ($4 > 1000) && ($3 > 70) \
+	 	      ($1==QUERY) && \
+	 	      ($2==SUBJECT) && \
+	 	      (($7<$8) && ($9<$10)) || (($7>$8) && ($9>$10)) {LSAME+= ($3/100.*$4)} \
+	 	      ($4 > 1000) &&  ($3 > 70) \
+	 	      ($1==QUERY) && \
+	 	      ($2==SUBJECT) && \
+	 	      (($7<$8) && ($9>$10)) || (($7>$8) && ($9<$10)) {LDIFF+= ($3/100.*$4)} \
+	 	      (QUERY!="") && \
+	 	      (($1!=QUERY) || ($2!=SUBJECT)) {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)} \
+	 	      (($1!=QUERY) || ($2!=SUBJECT)) {QUERY=$1;SUBJECT=$2; \
+	 	                                      LSAME=0; \
+	 	                                      LDIFF=0; \
+	 	                                      if (($4 > 1000) && ($3 > 70)) { \
+	 	                                        if ((($7<$8) && ($9<$10)) || \
+	 	                                            (($7>$8) && ($9>$10))) {\
+	 	                                         LSAME= ($3/100.*$4) } \
+	 	                                      else { \
+	 	                                         LDIFF= ($3/100.*$4) }} \
+	 	                                     } \
+	          END {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)}' | \
+     	awk -v minlength="${MINLENGTH}" \
+     		  ' (($3>minlength) || \
+       			($4 > minlength)) && \
+       			($3/($4+1) > 2) && \
+       			($1!=$2) { if ($1 > $2) \
+       						 { print $2,$1,$5 } \
+                           else \
+                             {print $1,$2,$5}}' | \
+        sort | \
+        uniq -c | \
+        awk '($1==2) {print $2,$3,$4}'
+	loginfo "Done"
+	          
+popTmpDir
+
+ 
\ No newline at end of file
diff --git a/detectors/normalize/tools/extract_refLSC.sh b/detectors/normalize/tools/extract_refLSC.sh
new file mode 100755
index 0000000..55bbcab
--- /dev/null
+++ b/detectors/normalize/tools/extract_refLSC.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+
+
+  awk 'function printfasta(seq) {                                            \
+             seqlen=length(seq);                                             \
+             for (i=1; i <= seqlen; i+=60)                                   \
+                 print substr(seq,i,60);                                     \
+             }                                                               \
+       function comp(seq) {                   						                   \
+              "echo "seq" | tr acgtACGT tgcaTGCA " | getline res; 	         \
+              close("echo "seq" | tr acgtACGT tgcaTGCA ");                   \
+              return res;                        						                 \
+             }                                      						             \
+       function rev(seq) {                    						                   \
+              "echo "seq" | rev " | getline res; 						                 \
+              close("echo "seq" | rev ");                                    \
+              return res;                        						                 \
+             }                                      						             \
+       function revcomp(seq) {                						                   \
+              res=rev(comp(seq));                						                 \
+              return res;                        						                 \
+             }                                      						             \
+                                                                             \
+       /^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""}                \
+       /^     misc_feature/ {LOCUS=$2; STRAND=1}                             \
+       /^     misc_feature/ && /complement/ {STRAND=0;                       \
+                                             sub("complement\\(","",LOCUS);  \
+                                             sub("\\)","",LOCUS);    \
+                                            }                      \
+       /large single copy/ && /LSC/ {split(LOCUS,POS,".");         \
+                                     FROM=POS[1];                  \
+                                     TO=POS[3];                    \
+                                     LENGTH=TO-FROM+1              \
+                                    }                              \
+       /^ORIGIN/ {seqon=1}                                         \
+       /^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7;  \
+                                           gsub("[^acgt]","n",seq);\
+                                           sequence=sequence seq   \
+                                          }                        \
+       /^\/\// && FROM && (LENGTH > 60000) && (LENGTH < 100000)    \
+                        {print ">LSC_"AC" Strand="STRAND";",       \
+                               "cut="FROM".."TO";",                \
+                               "seq_length="LENGTH";";             \
+                         SS=substr(sequence,FROM,LENGTH);          \
+                         if (! STRAND)                             \
+                           SS=revcomp(SS);                         \
+                         printfasta(SS);                           \
+       }             \
+      ' $*
diff --git a/detectors/normalize/tools/extract_refSSC.sh b/detectors/normalize/tools/extract_refSSC.sh
new file mode 100755
index 0000000..36660c6
--- /dev/null
+++ b/detectors/normalize/tools/extract_refSSC.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+
+
+  awk 'function printfasta(seq) {                                            \
+             seqlen=length(seq);                                             \
+             for (i=1; i <= seqlen; i+=60)                                   \
+                 print substr(seq,i,60);                                     \
+             }                                                               \
+       function comp(seq) {                   						                   \
+              "echo "seq" | tr acgtACGT tgcaTGCA " | getline res; 	         \
+              close("echo "seq" | tr acgtACGT tgcaTGCA ");                   \
+              return res;                        						                 \
+             }                                      						             \
+       function rev(seq) {                    						                   \
+              "echo "seq" | rev " | getline res; 						                 \
+              close("echo "seq" | rev ");                                    \
+              return res;                        						                 \
+             }                                      						             \
+       function revcomp(seq) {                						                   \
+              res=rev(comp(seq));                						                 \
+              return res;                        						                 \
+             }                                      						             \
+                                                                             \
+       /^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""}                \
+       /^     misc_feature/ {LOCUS=$2; STRAND=1}                             \
+       /^     misc_feature/ && /complement/ {STRAND=0;                       \
+                                             sub("complement\\(","",LOCUS);  \
+                                             sub("\\)","",LOCUS);    \
+                                            }                      \
+       /small single copy/ && /SSC/ {split(LOCUS,POS,".");         \
+                                     FROM=POS[1];                  \
+                                     TO=POS[3];                    \
+                                     LENGTH=TO-FROM+1              \
+                                    }                              \
+       /^ORIGIN/ {seqon=1}                                         \
+       /^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7;  \
+                                           gsub("[^acgt]","n",seq);\
+                                           sequence=sequence seq   \
+                                          }                        \
+       /^\/\// && FROM && (LENGTH > 15000) && (LENGTH < 20000)     \
+                        {print ">SSC_"AC" Strand="STRAND";",       \
+                               "cut="FROM".."TO";",                \
+                               "seq_length="LENGTH";";             \
+                         SS=substr(sequence,FROM,LENGTH);          \
+                         if (! STRAND)                             \
+                           SS=revcomp(SS);                         \
+                         printfasta(SS);                           \
+       }             \
+      ' $*
diff --git a/detectors/normalize/tools/selectViridiplantae.sh b/detectors/normalize/tools/selectViridiplantae.sh
new file mode 100755
index 0000000..25e644e
--- /dev/null
+++ b/detectors/normalize/tools/selectViridiplantae.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+grep -A 1 '  ORGANISM' $* | \
+  grep -B 1 Viridiplantae | \
+  awk '{print $1}' | \
+  grep '\.gbk' | \
+  sed -E 's/(^.*\.gbk).$/\1/' | \
+  uniq
\ No newline at end of file
diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh
new file mode 100644
index 0000000..39d2bc6
--- /dev/null
+++ b/scripts/bash_init.sh
@@ -0,0 +1,117 @@
+#
+# Bash file to be sourced at the begining of each bash script 
+# for setting up basic variables and functions
+#
+
+########################
+#
+# General usage functions
+#
+#
+
+function getAbsolutePath {
+    [[ -d $1 ]] && { cd "$1"; echo "$(pwd -P)"; } || 
+    { cd "$(dirname "$1")"; echo "$(pwd -P)/$(basename "$1")"; }
+}
+
+# Manage temp directory
+
+function pushTmpDir {
+	TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
+	pushd $TMP_DIR >& /dev/null
+}
+
+function popTmpDir {
+	popd  >& /dev/null
+	rm -rf $TMP_DIR >& /dev/null
+}
+
+# Logging functions
+ 
+
+function errcho {
+	>&2 echo $*
+}
+
+function openLogFile {
+	LOGFILE=$1
+	touch ${LOGFILE}
+}
+
+
+function loginfo {
+    errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO   ] $$ -- $1"
+    if [[ ! -z ${LOGFILE} ]]; then
+          echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO   ] $$ -- $1" >> ${LOGFILE}
+    fi
+}
+
+function logerror {
+    errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR  ] $$ -- $1"
+    if [[ ! -z ${LOGFILE} ]]; then
+          echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR   ] $$ -- $1" >> ${LOGFILE}
+    fi
+}
+
+function logwarning {
+    errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1"
+    if [[ ! -z ${LOGFILE} ]]; then
+          echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE}
+    fi
+}
+
+# Sequence related functions	
+	
+function fastaCount {
+	grep '^>' $1 | wc -l
+}
+
+#
+#
+########################
+
+
+
+########################
+#
+# Local variable definitions
+#
+#
+
+# The absolute path to the ORG.Annote home direcotory
+ORG_HOME=`getAbsolutePath $(dirname ${BASH_SOURCE[0]})/..`
+
+ 
+ORG_PORTNAME=`${ORG_HOME}/config/guess_port`	# The architecture running the ORG.Annnote instance
+
+PROG_DIR="$(getAbsolutePath $(dirname $0))"		# Directory containing the main script file
+
+DATA_DIR="${ORG_HOME}/data"                     # Directory containing reference data for the annotation
+
+CALL_DIR="$(getAbsolutePath $(pwd))"            # Directory from where the main script is called
+
+
+IR_DATA_DIR="${DATA_DIR}/ir"  				  	# Directory containing data related to the 
+												# Inverted repeat strucuture
+								
+
+#
+#
+########################
+
+
+
+########################
+#
+# Altering the environment
+#
+#
+
+# We alter the path to include the bin dir corresponding to the port
+PATH="${ORG_HOME}/ports/${ORG_PORTNAME}/bin:${PATH}"
+export PATH
+
+# Force to basic international setting for a correction behaviour of AWK on mac with float
+export LANG=C
+export LC_ALL=C
+