Add scripts to generate reference DBs for LSC and SSC
Former-commit-id: 77fd69c9687bfc7cfd3305299c7c81d2a9eddada Former-commit-id: 4d09933673c30d6a9b2b9cc985b33ef23d7a99f4
This commit is contained in:
174
detectors/normalize/tools/buildSCDB.sh
Executable file
174
detectors/normalize/tools/buildSCDB.sh
Executable file
@ -0,0 +1,174 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BUILD REFERENCE SINGLE COPY LIBRARIES
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
pushTmpDir ORG.buildSCDB
|
||||
|
||||
openLogFile "${IR_DATA_DIR}/SingleCopyDB.log"
|
||||
|
||||
loginfo "Selecting Viridiplantae genebank entries..."
|
||||
VIRIDIPLANTAE=$(${PROG_DIR}/selectViridiplantae.sh $*)
|
||||
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
#
|
||||
# Deals with Long Single Copies (LSC)
|
||||
#
|
||||
|
||||
loginfo "Extracting Long Single Copies (LSC)..."
|
||||
${PROG_DIR}/extract_refLSC.sh ${VIRIDIPLANTAE} > LSC.fasta
|
||||
loginfo " --> $(fastaCount LSC.fasta) retreived sequences"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Building LSC coorientation graph..."
|
||||
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf
|
||||
${PROG_DIR}/cc.py LSC.tgf > LSC.cc
|
||||
loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
loginfo "Indexing LCS..."
|
||||
fastaindex -f LSC.fasta -i LSC.index
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Extracting main connected components for LCS..."
|
||||
rm -f LSC.direct.fasta
|
||||
touch LSC.direct.fasta
|
||||
for id in `awk '($1==0) {print $2}' LSC.cc`; do
|
||||
fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.direct.fasta
|
||||
done
|
||||
loginfo " --> $(fastaCount LSC.direct.fasta) sequences"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Extracting second connected components for LCS..."
|
||||
rm -f LSC.reverse.fasta
|
||||
touch LSC.reverse.fasta
|
||||
for id in `awk '($1==1) {print $2}' LSC.cc`; do
|
||||
fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.reverse.fasta
|
||||
done
|
||||
loginfo " --> $(fastaCount LSC.reverse.fasta) sequences"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "merging both connected components for LCS..."
|
||||
fastarevcomp LSC.reverse.fasta >> LSC.direct.fasta
|
||||
loginfo " --> $(fastaCount LSC.direct.fasta) sequences in total"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Checking LCS homogeneity..."
|
||||
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf
|
||||
${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
|
||||
NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
|
||||
if (( $NCC == 1 )); then
|
||||
loginfo " --> $NCC connected componants"
|
||||
else
|
||||
logwarning " --> $NCC connected componants"
|
||||
fi
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Installing LCS reference databases..."
|
||||
cp LSC.direct.fasta "${IR_DATA_DIR}/LSC_RefDB.fasta"
|
||||
cp LSC_RefDB.tgf "${IR_DATA_DIR}/LSC_RefDB.tgf"
|
||||
|
||||
makeblastdb -in "${IR_DATA_DIR}/LSC_RefDB.fasta" \
|
||||
-dbtype nucl \
|
||||
-out "${IR_DATA_DIR}/LSC_RefDB" >& /dev/null
|
||||
loginfo "Done"
|
||||
|
||||
#
|
||||
# Deals with Short Single Copies (SSC)
|
||||
#
|
||||
|
||||
loginfo "Extracting Short Single Copies (SSC)..."
|
||||
${PROG_DIR}/extract_refSSC.sh ${VIRIDIPLANTAE} > SSC.fasta
|
||||
loginfo " --> $(fastaCount SSC.fasta) retreived sequences"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Building SSC coorientation graph..."
|
||||
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf
|
||||
${PROG_DIR}/cc.py SSC.tgf > SSC.cc
|
||||
loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Indexing SSC..."
|
||||
fastaindex -f SSC.fasta -i SSC.index
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Extracting main connected components for SSC..."
|
||||
rm -f SSC.direct.fasta
|
||||
touch SSC.direct.fasta
|
||||
for id in `awk '($1==0) {print $2}' SSC.cc`; do
|
||||
fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.direct.fasta
|
||||
done
|
||||
loginfo " --> $(fastaCount SSC.direct.fasta) sequences"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Extracting second connected components for SSC..."
|
||||
rm -f SSC.reverse.fasta
|
||||
touch SSC.reverse.fasta
|
||||
for id in `awk '($1==1) {print $2}' SSC.cc`; do
|
||||
fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.reverse.fasta
|
||||
done
|
||||
loginfo " --> $(fastaCount SSC.reverse.fasta) sequences"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "merging both connected components for SSC..."
|
||||
fastarevcomp SSC.reverse.fasta >> SSC.direct.fasta
|
||||
loginfo " --> $(fastaCount SSC.direct.fasta) sequences in total"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Checking SSC homogeneity..."
|
||||
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf
|
||||
${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
|
||||
NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
|
||||
if (( $NCC == 1 )); then
|
||||
loginfo " --> $NCC connected componants"
|
||||
else
|
||||
logwarning " --> $NCC connected componants"
|
||||
fi
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
loginfo "Installing SSC reference databases..."
|
||||
cp SSC.direct.fasta "${IR_DATA_DIR}/SSC_RefDB.fasta"
|
||||
cp SSC_RefDB.tgf "${IR_DATA_DIR}/SSC_RefDB.tgf"
|
||||
|
||||
makeblastdb -in "${IR_DATA_DIR}/SSC_RefDB.fasta" \
|
||||
-dbtype nucl \
|
||||
-out "${IR_DATA_DIR}/SSC_RefDB" >& /dev/null
|
||||
loginfo "Done"
|
||||
|
||||
popTmpDir
|
||||
|
48
detectors/normalize/tools/cc.py
Executable file
48
detectors/normalize/tools/cc.py
Executable file
@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
data = open(sys.argv[1])
|
||||
|
||||
ccs = []
|
||||
for line in data:
|
||||
|
||||
parts = line.strip().split()
|
||||
if len(parts) >= 2:
|
||||
a = parts[0]
|
||||
b = parts[1]
|
||||
else:
|
||||
continue
|
||||
|
||||
newcc=set([a,b])
|
||||
|
||||
keep=set()
|
||||
found=set()
|
||||
for i in range(len(ccs)):
|
||||
if len(found) < 2:
|
||||
cc=ccs[i]
|
||||
if a not in found and a in cc:
|
||||
found.add(a)
|
||||
keep.add(i)
|
||||
if b not in found and b in cc:
|
||||
found.add(b)
|
||||
keep.add(i)
|
||||
|
||||
for i in keep:
|
||||
newcc |= ccs[i]
|
||||
|
||||
newccs=[newcc]
|
||||
|
||||
for i in range(len(ccs)):
|
||||
if i not in keep:
|
||||
newccs.append(ccs[i])
|
||||
|
||||
ccs=newccs
|
||||
|
||||
ccs.sort(key=len, reverse=True)
|
||||
|
||||
for i in range(len(ccs)):
|
||||
cc=ccs[i]
|
||||
for l in cc:
|
||||
sys.stdout.write("%d %s\n" % (i,l))
|
||||
|
83
detectors/normalize/tools/coorienteSC.sh
Executable file
83
detectors/normalize/tools/coorienteSC.sh
Executable file
@ -0,0 +1,83 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BUILD THE COORIENTATION GRAPH
|
||||
#
|
||||
# From a set of sequences, this commande build a graph where:
|
||||
# - vertices are sequence id
|
||||
# - relation is : s oriented in the same way
|
||||
#
|
||||
# The same ortientation is defined from the similiarity measured by blast
|
||||
# The result graph is formated following the tgf format (trivial graph format)
|
||||
# NODE1 NODE2 EDGE_LABEL
|
||||
#
|
||||
# Run the command
|
||||
#
|
||||
# coorienteSC.sh <FASTA_FILE> <MINLENGTH> [LOGFILE]
|
||||
#
|
||||
# <FASTA_FILE> : the fasta file containing sequences to analyse
|
||||
# <MINLENGTH> : the cumulative minimum length strand have to share to conclude
|
||||
# <LOGFILE> : an optional logfile name
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
|
||||
pushTmpDir ORG.coorienteSC
|
||||
|
||||
if [[ ! -z $3 ]]; then
|
||||
openLogFile $3
|
||||
fi
|
||||
|
||||
DATA="${CALL_DIR}/${1}"
|
||||
MINLENGTH=$2
|
||||
BLASTDB=${1/.fasta$/}
|
||||
|
||||
loginfo "Build temporary blast DB..."
|
||||
makeblastdb -in "${DATA}" -dbtype nucl -out "${BLASTDB}" >& /dev/null
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Running Blast..."
|
||||
blastn -db "${BLASTDB}" -query "${DATA}" -outfmt 6 | \
|
||||
awk ' \
|
||||
($4 > 1000) && ($3 > 70) \
|
||||
($1==QUERY) && \
|
||||
($2==SUBJECT) && \
|
||||
(($7<$8) && ($9<$10)) || (($7>$8) && ($9>$10)) {LSAME+= ($3/100.*$4)} \
|
||||
($4 > 1000) && ($3 > 70) \
|
||||
($1==QUERY) && \
|
||||
($2==SUBJECT) && \
|
||||
(($7<$8) && ($9>$10)) || (($7>$8) && ($9<$10)) {LDIFF+= ($3/100.*$4)} \
|
||||
(QUERY!="") && \
|
||||
(($1!=QUERY) || ($2!=SUBJECT)) {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)} \
|
||||
(($1!=QUERY) || ($2!=SUBJECT)) {QUERY=$1;SUBJECT=$2; \
|
||||
LSAME=0; \
|
||||
LDIFF=0; \
|
||||
if (($4 > 1000) && ($3 > 70)) { \
|
||||
if ((($7<$8) && ($9<$10)) || \
|
||||
(($7>$8) && ($9>$10))) {\
|
||||
LSAME= ($3/100.*$4) } \
|
||||
else { \
|
||||
LDIFF= ($3/100.*$4) }} \
|
||||
} \
|
||||
END {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)}' | \
|
||||
awk -v minlength="${MINLENGTH}" \
|
||||
' (($3>minlength) || \
|
||||
($4 > minlength)) && \
|
||||
($3/($4+1) > 2) && \
|
||||
($1!=$2) { if ($1 > $2) \
|
||||
{ print $2,$1,$5 } \
|
||||
else \
|
||||
{print $1,$2,$5}}' | \
|
||||
sort | \
|
||||
uniq -c | \
|
||||
awk '($1==2) {print $2,$3,$4}'
|
||||
loginfo "Done"
|
||||
|
||||
popTmpDir
|
||||
|
||||
|
50
detectors/normalize/tools/extract_refLSC.sh
Executable file
50
detectors/normalize/tools/extract_refLSC.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
|
||||
awk 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
function comp(seq) { \
|
||||
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
|
||||
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
|
||||
return res; \
|
||||
} \
|
||||
function rev(seq) { \
|
||||
"echo "seq" | rev " | getline res; \
|
||||
close("echo "seq" | rev "); \
|
||||
return res; \
|
||||
} \
|
||||
function revcomp(seq) { \
|
||||
res=rev(comp(seq)); \
|
||||
return res; \
|
||||
} \
|
||||
\
|
||||
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
|
||||
/^ misc_feature/ {LOCUS=$2; STRAND=1} \
|
||||
/^ misc_feature/ && /complement/ {STRAND=0; \
|
||||
sub("complement\\(","",LOCUS); \
|
||||
sub("\\)","",LOCUS); \
|
||||
} \
|
||||
/large single copy/ && /LSC/ {split(LOCUS,POS,"."); \
|
||||
FROM=POS[1]; \
|
||||
TO=POS[3]; \
|
||||
LENGTH=TO-FROM+1 \
|
||||
} \
|
||||
/^ORIGIN/ {seqon=1} \
|
||||
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
|
||||
gsub("[^acgt]","n",seq);\
|
||||
sequence=sequence seq \
|
||||
} \
|
||||
/^\/\// && FROM && (LENGTH > 60000) && (LENGTH < 100000) \
|
||||
{print ">LSC_"AC" Strand="STRAND";", \
|
||||
"cut="FROM".."TO";", \
|
||||
"seq_length="LENGTH";"; \
|
||||
SS=substr(sequence,FROM,LENGTH); \
|
||||
if (! STRAND) \
|
||||
SS=revcomp(SS); \
|
||||
printfasta(SS); \
|
||||
} \
|
||||
' $*
|
50
detectors/normalize/tools/extract_refSSC.sh
Executable file
50
detectors/normalize/tools/extract_refSSC.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
|
||||
awk 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
function comp(seq) { \
|
||||
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
|
||||
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
|
||||
return res; \
|
||||
} \
|
||||
function rev(seq) { \
|
||||
"echo "seq" | rev " | getline res; \
|
||||
close("echo "seq" | rev "); \
|
||||
return res; \
|
||||
} \
|
||||
function revcomp(seq) { \
|
||||
res=rev(comp(seq)); \
|
||||
return res; \
|
||||
} \
|
||||
\
|
||||
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
|
||||
/^ misc_feature/ {LOCUS=$2; STRAND=1} \
|
||||
/^ misc_feature/ && /complement/ {STRAND=0; \
|
||||
sub("complement\\(","",LOCUS); \
|
||||
sub("\\)","",LOCUS); \
|
||||
} \
|
||||
/small single copy/ && /SSC/ {split(LOCUS,POS,"."); \
|
||||
FROM=POS[1]; \
|
||||
TO=POS[3]; \
|
||||
LENGTH=TO-FROM+1 \
|
||||
} \
|
||||
/^ORIGIN/ {seqon=1} \
|
||||
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
|
||||
gsub("[^acgt]","n",seq);\
|
||||
sequence=sequence seq \
|
||||
} \
|
||||
/^\/\// && FROM && (LENGTH > 15000) && (LENGTH < 20000) \
|
||||
{print ">SSC_"AC" Strand="STRAND";", \
|
||||
"cut="FROM".."TO";", \
|
||||
"seq_length="LENGTH";"; \
|
||||
SS=substr(sequence,FROM,LENGTH); \
|
||||
if (! STRAND) \
|
||||
SS=revcomp(SS); \
|
||||
printfasta(SS); \
|
||||
} \
|
||||
' $*
|
8
detectors/normalize/tools/selectViridiplantae.sh
Executable file
8
detectors/normalize/tools/selectViridiplantae.sh
Executable file
@ -0,0 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
grep -A 1 ' ORGANISM' $* | \
|
||||
grep -B 1 Viridiplantae | \
|
||||
awk '{print $1}' | \
|
||||
grep '\.gbk' | \
|
||||
sed -E 's/(^.*\.gbk).$/\1/' | \
|
||||
uniq
|
117
scripts/bash_init.sh
Normal file
117
scripts/bash_init.sh
Normal file
@ -0,0 +1,117 @@
|
||||
#
|
||||
# Bash file to be sourced at the begining of each bash script
|
||||
# for setting up basic variables and functions
|
||||
#
|
||||
|
||||
########################
|
||||
#
|
||||
# General usage functions
|
||||
#
|
||||
#
|
||||
|
||||
function getAbsolutePath {
|
||||
[[ -d $1 ]] && { cd "$1"; echo "$(pwd -P)"; } ||
|
||||
{ cd "$(dirname "$1")"; echo "$(pwd -P)/$(basename "$1")"; }
|
||||
}
|
||||
|
||||
# Manage temp directory
|
||||
|
||||
function pushTmpDir {
|
||||
TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
|
||||
pushd $TMP_DIR >& /dev/null
|
||||
}
|
||||
|
||||
function popTmpDir {
|
||||
popd >& /dev/null
|
||||
rm -rf $TMP_DIR >& /dev/null
|
||||
}
|
||||
|
||||
# Logging functions
|
||||
|
||||
|
||||
function errcho {
|
||||
>&2 echo $*
|
||||
}
|
||||
|
||||
function openLogFile {
|
||||
LOGFILE=$1
|
||||
touch ${LOGFILE}
|
||||
}
|
||||
|
||||
|
||||
function loginfo {
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1"
|
||||
if [[ ! -z ${LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
function logerror {
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1"
|
||||
if [[ ! -z ${LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
function logwarning {
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1"
|
||||
if [[ ! -z ${LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
# Sequence related functions
|
||||
|
||||
function fastaCount {
|
||||
grep '^>' $1 | wc -l
|
||||
}
|
||||
|
||||
#
|
||||
#
|
||||
########################
|
||||
|
||||
|
||||
|
||||
########################
|
||||
#
|
||||
# Local variable definitions
|
||||
#
|
||||
#
|
||||
|
||||
# The absolute path to the ORG.Annote home direcotory
|
||||
ORG_HOME=`getAbsolutePath $(dirname ${BASH_SOURCE[0]})/..`
|
||||
|
||||
|
||||
ORG_PORTNAME=`${ORG_HOME}/config/guess_port` # The architecture running the ORG.Annnote instance
|
||||
|
||||
PROG_DIR="$(getAbsolutePath $(dirname $0))" # Directory containing the main script file
|
||||
|
||||
DATA_DIR="${ORG_HOME}/data" # Directory containing reference data for the annotation
|
||||
|
||||
CALL_DIR="$(getAbsolutePath $(pwd))" # Directory from where the main script is called
|
||||
|
||||
|
||||
IR_DATA_DIR="${DATA_DIR}/ir" # Directory containing data related to the
|
||||
# Inverted repeat strucuture
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
########################
|
||||
|
||||
|
||||
|
||||
########################
|
||||
#
|
||||
# Altering the environment
|
||||
#
|
||||
#
|
||||
|
||||
# We alter the path to include the bin dir corresponding to the port
|
||||
PATH="${ORG_HOME}/ports/${ORG_PORTNAME}/bin:${PATH}"
|
||||
export PATH
|
||||
|
||||
# Force to basic international setting for a correction behaviour of AWK on mac with float
|
||||
export LANG=C
|
||||
export LC_ALL=C
|
||||
|
Reference in New Issue
Block a user