Add scripts to generate reference DBs for LSC and SSC

Former-commit-id: 77fd69c9687bfc7cfd3305299c7c81d2a9eddada
Former-commit-id: 4d09933673c30d6a9b2b9cc985b33ef23d7a99f4
This commit is contained in:
2015-10-07 11:32:17 -03:00
parent fb1158bcd4
commit 37e1ecc9fd
8 changed files with 530 additions and 0 deletions

View File

@ -0,0 +1,174 @@
#!/bin/bash
#
# BUILD REFERENCE SINGLE COPY LIBRARIES
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
pushTmpDir ORG.buildSCDB
openLogFile "${IR_DATA_DIR}/SingleCopyDB.log"
loginfo "Selecting Viridiplantae genebank entries..."
VIRIDIPLANTAE=$(${PROG_DIR}/selectViridiplantae.sh $*)
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
loginfo "Done"
#
# Deals with Long Single Copies (LSC)
#
loginfo "Extracting Long Single Copies (LSC)..."
${PROG_DIR}/extract_refLSC.sh ${VIRIDIPLANTAE} > LSC.fasta
loginfo " --> $(fastaCount LSC.fasta) retreived sequences"
loginfo "Done"
loginfo "Building LSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf
${PROG_DIR}/cc.py LSC.tgf > LSC.cc
loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
loginfo "Done"
loginfo "Indexing LCS..."
fastaindex -f LSC.fasta -i LSC.index
loginfo "Done"
loginfo "Extracting main connected components for LCS..."
rm -f LSC.direct.fasta
touch LSC.direct.fasta
for id in `awk '($1==0) {print $2}' LSC.cc`; do
fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.direct.fasta
done
loginfo " --> $(fastaCount LSC.direct.fasta) sequences"
loginfo "Done"
loginfo "Extracting second connected components for LCS..."
rm -f LSC.reverse.fasta
touch LSC.reverse.fasta
for id in `awk '($1==1) {print $2}' LSC.cc`; do
fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.reverse.fasta
done
loginfo " --> $(fastaCount LSC.reverse.fasta) sequences"
loginfo "Done"
loginfo "merging both connected components for LCS..."
fastarevcomp LSC.reverse.fasta >> LSC.direct.fasta
loginfo " --> $(fastaCount LSC.direct.fasta) sequences in total"
loginfo "Done"
loginfo "Checking LCS homogeneity..."
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf
${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then
loginfo " --> $NCC connected componants"
else
logwarning " --> $NCC connected componants"
fi
loginfo "Done"
loginfo "Installing LCS reference databases..."
cp LSC.direct.fasta "${IR_DATA_DIR}/LSC_RefDB.fasta"
cp LSC_RefDB.tgf "${IR_DATA_DIR}/LSC_RefDB.tgf"
makeblastdb -in "${IR_DATA_DIR}/LSC_RefDB.fasta" \
-dbtype nucl \
-out "${IR_DATA_DIR}/LSC_RefDB" >& /dev/null
loginfo "Done"
#
# Deals with Short Single Copies (SSC)
#
loginfo "Extracting Short Single Copies (SSC)..."
${PROG_DIR}/extract_refSSC.sh ${VIRIDIPLANTAE} > SSC.fasta
loginfo " --> $(fastaCount SSC.fasta) retreived sequences"
loginfo "Done"
loginfo "Building SSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf
${PROG_DIR}/cc.py SSC.tgf > SSC.cc
loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
loginfo "Done"
loginfo "Indexing SSC..."
fastaindex -f SSC.fasta -i SSC.index
loginfo "Done"
loginfo "Extracting main connected components for SSC..."
rm -f SSC.direct.fasta
touch SSC.direct.fasta
for id in `awk '($1==0) {print $2}' SSC.cc`; do
fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.direct.fasta
done
loginfo " --> $(fastaCount SSC.direct.fasta) sequences"
loginfo "Done"
loginfo "Extracting second connected components for SSC..."
rm -f SSC.reverse.fasta
touch SSC.reverse.fasta
for id in `awk '($1==1) {print $2}' SSC.cc`; do
fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.reverse.fasta
done
loginfo " --> $(fastaCount SSC.reverse.fasta) sequences"
loginfo "Done"
loginfo "merging both connected components for SSC..."
fastarevcomp SSC.reverse.fasta >> SSC.direct.fasta
loginfo " --> $(fastaCount SSC.direct.fasta) sequences in total"
loginfo "Done"
loginfo "Checking SSC homogeneity..."
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf
${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then
loginfo " --> $NCC connected componants"
else
logwarning " --> $NCC connected componants"
fi
loginfo "Done"
loginfo "Installing SSC reference databases..."
cp SSC.direct.fasta "${IR_DATA_DIR}/SSC_RefDB.fasta"
cp SSC_RefDB.tgf "${IR_DATA_DIR}/SSC_RefDB.tgf"
makeblastdb -in "${IR_DATA_DIR}/SSC_RefDB.fasta" \
-dbtype nucl \
-out "${IR_DATA_DIR}/SSC_RefDB" >& /dev/null
loginfo "Done"
popTmpDir

48
detectors/normalize/tools/cc.py Executable file
View File

@ -0,0 +1,48 @@
#!/usr/bin/env python
import sys
data = open(sys.argv[1])
ccs = []
for line in data:
parts = line.strip().split()
if len(parts) >= 2:
a = parts[0]
b = parts[1]
else:
continue
newcc=set([a,b])
keep=set()
found=set()
for i in range(len(ccs)):
if len(found) < 2:
cc=ccs[i]
if a not in found and a in cc:
found.add(a)
keep.add(i)
if b not in found and b in cc:
found.add(b)
keep.add(i)
for i in keep:
newcc |= ccs[i]
newccs=[newcc]
for i in range(len(ccs)):
if i not in keep:
newccs.append(ccs[i])
ccs=newccs
ccs.sort(key=len, reverse=True)
for i in range(len(ccs)):
cc=ccs[i]
for l in cc:
sys.stdout.write("%d %s\n" % (i,l))

View File

@ -0,0 +1,83 @@
#!/bin/bash
#
# BUILD THE COORIENTATION GRAPH
#
# From a set of sequences, this commande build a graph where:
# - vertices are sequence id
# - relation is : s oriented in the same way
#
# The same ortientation is defined from the similiarity measured by blast
# The result graph is formated following the tgf format (trivial graph format)
# NODE1 NODE2 EDGE_LABEL
#
# Run the command
#
# coorienteSC.sh <FASTA_FILE> <MINLENGTH> [LOGFILE]
#
# <FASTA_FILE> : the fasta file containing sequences to analyse
# <MINLENGTH> : the cumulative minimum length strand have to share to conclude
# <LOGFILE> : an optional logfile name
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
pushTmpDir ORG.coorienteSC
if [[ ! -z $3 ]]; then
openLogFile $3
fi
DATA="${CALL_DIR}/${1}"
MINLENGTH=$2
BLASTDB=${1/.fasta$/}
loginfo "Build temporary blast DB..."
makeblastdb -in "${DATA}" -dbtype nucl -out "${BLASTDB}" >& /dev/null
loginfo "Done"
loginfo "Running Blast..."
blastn -db "${BLASTDB}" -query "${DATA}" -outfmt 6 | \
awk ' \
($4 > 1000) && ($3 > 70) \
($1==QUERY) && \
($2==SUBJECT) && \
(($7<$8) && ($9<$10)) || (($7>$8) && ($9>$10)) {LSAME+= ($3/100.*$4)} \
($4 > 1000) && ($3 > 70) \
($1==QUERY) && \
($2==SUBJECT) && \
(($7<$8) && ($9>$10)) || (($7>$8) && ($9<$10)) {LDIFF+= ($3/100.*$4)} \
(QUERY!="") && \
(($1!=QUERY) || ($2!=SUBJECT)) {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)} \
(($1!=QUERY) || ($2!=SUBJECT)) {QUERY=$1;SUBJECT=$2; \
LSAME=0; \
LDIFF=0; \
if (($4 > 1000) && ($3 > 70)) { \
if ((($7<$8) && ($9<$10)) || \
(($7>$8) && ($9>$10))) {\
LSAME= ($3/100.*$4) } \
else { \
LDIFF= ($3/100.*$4) }} \
} \
END {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)}' | \
awk -v minlength="${MINLENGTH}" \
' (($3>minlength) || \
($4 > minlength)) && \
($3/($4+1) > 2) && \
($1!=$2) { if ($1 > $2) \
{ print $2,$1,$5 } \
else \
{print $1,$2,$5}}' | \
sort | \
uniq -c | \
awk '($1==2) {print $2,$3,$4}'
loginfo "Done"
popTmpDir

View File

@ -0,0 +1,50 @@
#!/bin/bash
#
awk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
} \
function comp(seq) { \
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
return res; \
} \
function rev(seq) { \
"echo "seq" | rev " | getline res; \
close("echo "seq" | rev "); \
return res; \
} \
function revcomp(seq) { \
res=rev(comp(seq)); \
return res; \
} \
\
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
/^ misc_feature/ {LOCUS=$2; STRAND=1} \
/^ misc_feature/ && /complement/ {STRAND=0; \
sub("complement\\(","",LOCUS); \
sub("\\)","",LOCUS); \
} \
/large single copy/ && /LSC/ {split(LOCUS,POS,"."); \
FROM=POS[1]; \
TO=POS[3]; \
LENGTH=TO-FROM+1 \
} \
/^ORIGIN/ {seqon=1} \
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
gsub("[^acgt]","n",seq);\
sequence=sequence seq \
} \
/^\/\// && FROM && (LENGTH > 60000) && (LENGTH < 100000) \
{print ">LSC_"AC" Strand="STRAND";", \
"cut="FROM".."TO";", \
"seq_length="LENGTH";"; \
SS=substr(sequence,FROM,LENGTH); \
if (! STRAND) \
SS=revcomp(SS); \
printfasta(SS); \
} \
' $*

View File

@ -0,0 +1,50 @@
#!/bin/bash
#
awk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
} \
function comp(seq) { \
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
return res; \
} \
function rev(seq) { \
"echo "seq" | rev " | getline res; \
close("echo "seq" | rev "); \
return res; \
} \
function revcomp(seq) { \
res=rev(comp(seq)); \
return res; \
} \
\
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
/^ misc_feature/ {LOCUS=$2; STRAND=1} \
/^ misc_feature/ && /complement/ {STRAND=0; \
sub("complement\\(","",LOCUS); \
sub("\\)","",LOCUS); \
} \
/small single copy/ && /SSC/ {split(LOCUS,POS,"."); \
FROM=POS[1]; \
TO=POS[3]; \
LENGTH=TO-FROM+1 \
} \
/^ORIGIN/ {seqon=1} \
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
gsub("[^acgt]","n",seq);\
sequence=sequence seq \
} \
/^\/\// && FROM && (LENGTH > 15000) && (LENGTH < 20000) \
{print ">SSC_"AC" Strand="STRAND";", \
"cut="FROM".."TO";", \
"seq_length="LENGTH";"; \
SS=substr(sequence,FROM,LENGTH); \
if (! STRAND) \
SS=revcomp(SS); \
printfasta(SS); \
} \
' $*

View File

@ -0,0 +1,8 @@
#!/bin/bash
grep -A 1 ' ORGANISM' $* | \
grep -B 1 Viridiplantae | \
awk '{print $1}' | \
grep '\.gbk' | \
sed -E 's/(^.*\.gbk).$/\1/' | \
uniq

117
scripts/bash_init.sh Normal file
View File

@ -0,0 +1,117 @@
#
# Bash file to be sourced at the begining of each bash script
# for setting up basic variables and functions
#
########################
#
# General usage functions
#
#
function getAbsolutePath {
[[ -d $1 ]] && { cd "$1"; echo "$(pwd -P)"; } ||
{ cd "$(dirname "$1")"; echo "$(pwd -P)/$(basename "$1")"; }
}
# Manage temp directory
function pushTmpDir {
TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
pushd $TMP_DIR >& /dev/null
}
function popTmpDir {
popd >& /dev/null
rm -rf $TMP_DIR >& /dev/null
}
# Logging functions
function errcho {
>&2 echo $*
}
function openLogFile {
LOGFILE=$1
touch ${LOGFILE}
}
function loginfo {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE}
fi
}
function logerror {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE}
fi
}
function logwarning {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE}
fi
}
# Sequence related functions
function fastaCount {
grep '^>' $1 | wc -l
}
#
#
########################
########################
#
# Local variable definitions
#
#
# The absolute path to the ORG.Annote home direcotory
ORG_HOME=`getAbsolutePath $(dirname ${BASH_SOURCE[0]})/..`
ORG_PORTNAME=`${ORG_HOME}/config/guess_port` # The architecture running the ORG.Annnote instance
PROG_DIR="$(getAbsolutePath $(dirname $0))" # Directory containing the main script file
DATA_DIR="${ORG_HOME}/data" # Directory containing reference data for the annotation
CALL_DIR="$(getAbsolutePath $(pwd))" # Directory from where the main script is called
IR_DATA_DIR="${DATA_DIR}/ir" # Directory containing data related to the
# Inverted repeat strucuture
#
#
########################
########################
#
# Altering the environment
#
#
# We alter the path to include the bin dir corresponding to the port
PATH="${ORG_HOME}/ports/${ORG_PORTNAME}/bin:${PATH}"
export PATH
# Force to basic international setting for a correction behaviour of AWK on mac with float
export LANG=C
export LC_ALL=C