# Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f
# Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
# (367 lines, 9.1 KiB, Bash)
#
# Bash file to be sourced at the beginning of each bash script
# for setting up basic variables and functions
#

# Name of the awk command run by the helper functions of this file.
export AwkCmd="gawk"

# Setup the LANG and LC_ALL environment variables (for Linux mostly)
# so the GNU tools (like sort) will work properly
export LANG=C
export LC_ALL=C
|
|
|
|
|
|
########################
|
|
#
|
|
# General usage functions
|
|
#
|
|
#
|
|
|
|
# Prints the absolute, symlink-free path of a file or a directory.
#  - $1 : the path to resolve. When it is a directory, the directory itself
#         is resolved; otherwise its parent directory is resolved and the
#         last path component is appended unchanged.
# Output : the resolved path on stdout.
# Returns: non-zero, printing nothing, when the directory cannot be entered.
# The `cd` runs inside a command substitution, so the working directory of
# the caller is never modified.
function getAbsolutePath {
  local target=${1-}
  local resolved

  if [[ -d "$target" ]]; then
    resolved=$(CDPATH= cd -- "$target" && pwd -P) || return 1
    printf '%s\n' "$resolved"
  else
    resolved=$(CDPATH= cd -- "$(dirname -- "$target")" && pwd -P) || return 1
    printf '%s/%s\n' "$resolved" "$(basename -- "$target")"
  fi
}
|
|
|
|
# Manage temp directory
|
|
|
|
# Creates a new temporary directory and prints its path.
#  - $1 : prefix of the directory name; the template handed to mktemp is
#         "<prefix>_proc_<pid of the shell>_XXXXXX"
# Output : the path of the created directory on stdout.
# Returns: non-zero, printing nothing, when mktemp fails.
function mktempdir {
  local TMP_DIR

  TMP_DIR=$(mktemp -d -t "${1}_proc_$$_XXXXXX") || {
    logerror "Cannot create a temp directory with prefix $1"
    return 1
  }
  logdebug "Creating temp directory $TMP_DIR"
  # printf with a quoted value keeps the path intact (spaces, globs).
  printf '%s\n' "$TMP_DIR"
}
|
|
|
|
# Creates a temporary directory with mktempdir, enters it with pushd and
# records it at the front of the space separated list TMP_DIR_STACK.
#  - $1 : prefix forwarded to mktempdir
# Globals: TMP_DIR_STACK (read and written)
# Returns: non-zero, leaving TMP_DIR_STACK untouched, when the directory
#          cannot be created or entered.
function pushTmpDir {
  local TMP_DIR

  TMP_DIR=$(mktempdir "$1") || return 1
  [[ -n "$TMP_DIR" ]] || return 1

  if ! pushd "$TMP_DIR" >& /dev/null; then
    # Do not leave an unused directory behind.
    rmdir -- "$TMP_DIR" 2> /dev/null
    return 1
  fi

  TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK"
  logdebug "Pushing temp directory $TMP_DIR"
  logdebug "Stack : ${TMP_DIR_STACK}"
}
|
|
|
|
# Leaves the most recently pushed temporary directory: removes the first
# entry of TMP_DIR_STACK, runs popd and deletes that directory recursively.
# Globals: TMP_DIR_STACK (read and written)
# Returns: 1, without touching the directory stack or the file system,
#          when TMP_DIR_STACK holds no entry.
function popTmpDir {
  local TMP_DIR
  local remaining

  # First word is the directory to drop, the rest is the new stack.
  read -r TMP_DIR remaining <<< "${TMP_DIR_STACK-}"

  if [[ -z "$TMP_DIR" ]]; then
    logwarning "popTmpDir called with an empty temp directory stack"
    return 1
  fi

  TMP_DIR_STACK=$remaining
  popd >& /dev/null
  rm -rf -- "$TMP_DIR" >& /dev/null
  logdebug "Poping temp directory $TMP_DIR"
  logdebug "Stack : ${TMP_DIR_STACK}"
}
|
|
|
|
# Logging functions
|
|
|
|
|
|
# Prints its arguments, joined by single spaces, on stderr followed by a
# newline. The arguments are quoted, so inner whitespace is kept and glob
# characters are printed literally.
function errcho {
  printf '%s\n' "$*" >&2
}
|
|
|
|
# Declares the file receiving a copy of the log messages.
#  - $1 : path of the log file
# Globals: ORG_LOGFILE (set and exported)
# The file is created by touch when it does not exist yet; the status of
# touch is the status of the function.
function openLogFile {
  ORG_LOGFILE=$1
  export ORG_LOGFILE
  touch -- "$ORG_LOGFILE"
}
|
|
|
|
|
|
# Logs an information message.
#  - $1 : the message
# The line "<date> <time> [OA INFO ] <pid> -- <message>" is sent to stderr
# through errcho and, when ORG_LOGFILE is not empty, appended to that file.
# The line is built once so both destinations carry the same timestamp.
function loginfo {
  local line

  line="$(date +'%Y-%m-%d %H:%M:%S') [OA INFO ] $$ -- ${1-}"
  errcho "$line"
  if [[ -n "${ORG_LOGFILE:-}" ]]; then
    printf '%s\n' "$line" >> "$ORG_LOGFILE"
  fi
}
|
|
|
|
# Logs an error message.
#  - $1 : the message
# The line "<date> <time> [OA ERROR ] <pid> -- <message>" is sent to stderr
# through errcho and, when ORG_LOGFILE is not empty, appended to that file.
# The line is built once so both destinations carry the same timestamp.
function logerror {
  local line

  line="$(date +'%Y-%m-%d %H:%M:%S') [OA ERROR ] $$ -- ${1-}"
  errcho "$line"
  if [[ -n "${ORG_LOGFILE:-}" ]]; then
    printf '%s\n' "$line" >> "$ORG_LOGFILE"
  fi
}
|
|
|
|
# Logs a warning message.
#  - $1 : the message
# The line "<date> <time> [OA WARNING] <pid> -- <message>" is sent to stderr
# through errcho and, when ORG_LOGFILE is not empty, appended to that file.
# The line is built once so both destinations carry the same timestamp.
function logwarning {
  local line

  line="$(date +'%Y-%m-%d %H:%M:%S') [OA WARNING] $$ -- ${1-}"
  errcho "$line"
  if [[ -n "${ORG_LOGFILE:-}" ]]; then
    printf '%s\n' "$line" >> "$ORG_LOGFILE"
  fi
}
|
|
|
|
# Logs a debug message, only when ORG_DEBUG is not empty.
#  - $1 : the message
# The line "<date> <time> [OA DEBUG ] <pid> -- <message>" is sent to stderr
# through errcho and, when ORG_LOGFILE is not empty, appended to that file.
# The line is built once so both destinations carry the same timestamp.
function logdebug {
  local line

  # Debug output is opt-in.
  [[ -n "${ORG_DEBUG:-}" ]] || return 0

  line="$(date +'%Y-%m-%d %H:%M:%S') [OA DEBUG ] $$ -- ${1-}"
  errcho "$line"
  if [[ -n "${ORG_LOGFILE:-}" ]]; then
    printf '%s\n' "$line" >> "$ORG_LOGFILE"
  fi
}
|
|
|
|
#
# Asserts that the number of arguments passed to the script
# is at least equal to the first argument of the function.
#
#     needarg 3
#
# requires that the script is called at least with 3 arguments.
# The argument count is the value of $# at the time this file is sourced.
# The shell exits with status 1 when the requirement is not met, and also
# when the requested count is not a non-negative integer (an empty or
# non-numeric value would otherwise make the check pass silently).
#
__ORG_ANNOT_ARGS_SCRIPT_COUNT__=$#
function needarg {
  local required=${1-}

  if [[ ! "$required" =~ ^[0-9]+$ ]]; then
    logerror "needarg: invalid argument count '$required'"
    exit 1
  fi

  # 10# forces base 10, so a value such as 08 is not read as octal.
  if (( __ORG_ANNOT_ARGS_SCRIPT_COUNT__ < 10#$required )); then
    logerror "not enougth arguments provided"
    exit 1
  fi
}
|
|
|
|
# Asserts that a path exists (test -e, so any kind of file is accepted).
#  - $1 : the path to check
# Logs an error and exits the shell with status 1 when the path is missing.
function needfile {
  local path=${1-}

  if [[ ! -e "$path" ]]; then
    logerror "File $path doesn't exist"
    exit 1
  fi
}
|
|
|
|
# Asserts that a path is an existing directory.
#  - $1 : the path to check
# Logs an error and exits the shell with status 1 otherwise.
function needdir {
  local path=${1-}

  if [[ ! -d "$path" ]]; then
    logerror "Directory $path doesn't exist"
    exit 1
  fi
}
|
|
# Gives a default value to a variable, only when that variable is unset
# (a variable set to the empty string is left untouched).
#  - $1 : name of the variable
#  - $2 : value to assign
# The value is stored verbatim with printf -v: it is neither word-split nor
# evaluated. No local variable is declared here, so any variable name can
# be targeted.
# Returns: 1 when $1 is not a plain variable name, 0 otherwise.
function assignundef {
  if [[ ! "${1-}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    return 1
  fi

  # ${!1+x} expands to "x" when the variable named by $1 is set.
  if [[ -z "${!1+x}" ]]; then
    printf -v "$1" '%s' "${2-}"
  fi
}
|
|
|
|
# Sequence related functions
|
|
|
|
# Counts how many sequences are stored in a fasta file,
# i.e. the number of lines starting with '>'.
#  - $1 : The fasta file to count (stdin is read when $1 is empty)
# Output : the count on stdout.
# Returns: 0 on success, including a count of 0; the status of grep when
#          grep itself fails (unreadable file), in which case nothing is
#          printed.
function fastaCount {
  local count
  local rc=0

  if [[ -n "${1-}" ]]; then
    count=$(grep -c '^>' -- "$1") || rc=$?
  else
    count=$(grep -c '^>') || rc=$?
  fi

  # grep exits with 1 when no line matched: that is a valid count of 0.
  if (( rc > 1 )); then
    return "$rc"
  fi
  printf '%s\n' "$count"
}
|
|
|
|
|
|
# compute the sequence length from a fasta sequence
#  - $1 : The fasta file to measure
# Output : the number of bytes found after the first line of the file, line
#          terminators excluded. For a file holding a single sequence this
#          is the sequence length. An empty file gives 0.
# LC_ALL=C makes awk count bytes. The awk command is $AwkCmd, or awk when
# AwkCmd is not set.
function seqlength {
  LC_ALL=C ${AwkCmd:-awk} '
    NR > 1 { total += length($0) }
    END    { print total + 0 }
  ' "$1"
}
|
|
|
|
# Prints, on a single line, the first sequence of a fasta input.
#  - $@ : the fasta file(s) to read (stdin when no file is given)
# Lines located before the first '>' header are ignored. For each sequence
# line only the first whitespace separated field is kept. Reading stops at
# the second header.
function readfirstfastaseq {
  awk '
    /^>/  { if (first) exit   # second header: the END rule still runs
            first = 1
            next }
    first { seq = seq $1 }
    END   { print seq }
  ' "$@"
}
|
|
|
|
# extract a subseq from a fasta sequence
#  - $1 : The fasta file to cut
#  - $2 : First position of the subsequence (first position is numbered 1),
#  - $3 : End of the subsequence (included in the subsequence)
# Output : every header line of the file, printed as it is met, then the
#          requested slice of the concatenated sequence lines, wrapped at
#          60 characters per line.
# The awk command is $AwkCmd, or awk when AwkCmd is not set.
function cutseq {
  ${AwkCmd:-awk} -v from="$2" -v end="$3" '
    function printfasta(seq,    seqlen, i) {
      seqlen = length(seq)
      for (i = 1; i <= seqlen; i += 60)
        print substr(seq, i, 60)
    }

    /^>/   { print $0 }
    ! /^>/ { seq = seq $0 }
    END    { printfasta(substr(seq, from, end - from + 1)) }
  ' "$1"
}
|
|
|
|
# Joins a set of sequences stored in a fasta file into
# a single sequence
#  - $1 : The fasta file containing the sequences to join
#         (stdin is read when $1 is empty)
# Only the header found on the very first line is kept; every non-header
# line is forwarded to formatfasta, which produces the final layout.
# The awk command is $AwkCmd, or awk when AwkCmd is not set.
function joinfasta {
  # ${1:+"$1"} passes the file name only when one was given, so awk never
  # receives an empty file name.
  ${AwkCmd:-awk} '
    (NR == 1 && /^>/) { print $0 }
    ! /^>/            { print $0 }
  ' ${1:+"$1"} \
    | formatfasta
}
|
|
|
|
# Rewrites a fasta input so that each sequence fits on a single line.
#  - $1 : the fasta file to convert (stdin is read when $1 is empty)
# Output : each header line followed by the concatenation of its sequence
#          lines. A record without sequence prints its header only, and an
#          empty input prints nothing.
# The awk command is $AwkCmd, or awk when AwkCmd is not set.
function fasta1line {
  ${AwkCmd:-awk} '
    (/^>/ && seq != "") { print seq }
    /^>/                { print $0; seq = "" }
    ! /^>/              { seq = seq $0 }
    END                 { if (seq != "") print seq }
  ' ${1:+"$1"}
}
|
|
|
|
# Reformats a fasta input with sequence lines of 60 characters.
#  - $1 : the fasta file to format (stdin is read when $1 is empty)
# Output : each header line unchanged, followed by the concatenation of its
#          sequence lines cut into chunks of 60 characters.
# The awk command is $AwkCmd, or awk when AwkCmd is not set.
function formatfasta {
  # ${1:+"$1"} passes the file name only when one was given, so awk never
  # receives an empty file name.
  ${AwkCmd:-awk} '
    function printfasta(seq,    seqlen, i) {
      seqlen = length(seq)
      for (i = 1; i <= seqlen; i += 60)
        print substr(seq, i, 60)
    }

    (seq != "" && /^>/) { printfasta(seq); seq = "" }
    /^>/                { print $0 }
    ! /^>/              { seq = seq $0 }
    END                 { printfasta(seq) }
  ' ${1:+"$1"}
}
|
|
|
|
# Reverse complement a DNA string
#  - $* : The DNA string to reverse complement (several arguments are
#         joined with single spaces)
# Output : the reverse complement on stdout, case preserved.
# The translation covers A/T, C/G and the IUPAC ambiguity pairs M/K, R/Y,
# B/V and D/H. W (A or T) and S (C or G) are their own complement, so they
# are left unchanged, like every other character.
function reversecomp {
  printf '%s\n' "$*" \
    | tr 'ACGTMKRYBVDHacgtmkrybvdh' 'TGCAKMYRVBHDtgcakmyrvbhd' \
    | rev
}
|
|
|
|
|
|
# Lists the locations covered by the CDS, rRNA and tRNA features of an
# annotation file whose feature lines start with "FT".
#  - $1 : the annotation file
# Output : one "begin-end" line per "N..M" range, with begin <= end, sorted
#          numerically. Ranges are searched on each matching feature line
#          and on the 3 lines following it; qualifier lines ("FT   /...")
#          are skipped.
# NOTE(review): the feature pattern accepts any number of spaces after
# "FT"; the single space seen in the previous version looks like collapsed
# whitespace -- confirm against real annotation files.
function annotatedParts() {
  grep -E -A 3 '^FT +(CDS|rRNA|tRNA)' -- "$1" \
    | grep -E -v -- '^--' \
    | grep -E -v '^FT +/' \
    | grep -E -o '[0-9]+\.\.[0-9]+' \
    | awk -F'.' '($1 > $3) {x=$3;$3=$1;$1=x} {print $1"-"$3}' \
    | sort -n
}
|
|
|
|
|
|
# Helper of notAnnoted: prints the region from..to of a genome with cutseq
# and rewrites each header as "<first word> from=<from>; to=<to>;".
#  - $1 : the genome fasta file
#  - $2 : first position of the region
#  - $3 : last position of the region
function __notAnnotedEmit() {
  local genome=$1
  local from=$2
  local to=$3

  cutseq "$genome" "$from" "$to" \
    | ${AwkCmd:-awk} -v from="$from" -v to="$to" '
        /^>/ { $0 = $1 " from=" from "; to=" to ";" }
             { print $0 }
      '
}

# Prints, in fasta format, the regions of a genome not covered by the
# locations reported by annotatedParts.
#  - $1 : the annotation file, passed to annotatedParts
#  - $2 : the genome fasta file, passed to seqlength and cutseq
#  - $3 : minimum length of a region to be printed
# The locations are read in the order given by annotatedParts. The start of
# the next candidate region only ever moves forward, so a feature nested in
# or overlapping a previous one cannot expose an already covered region.
function notAnnoted() {
  local annotations=$1
  local genome=$2
  local minimum=$3
  local genomesize
  local from=1
  local begin
  local end

  genomesize=$(seqlength "$genome")

  # File descriptor 3 keeps stdin free for the commands run in the loop.
  while IFS='-' read -r -u 3 begin end; do
    [[ -n "$begin" && -n "$end" ]] || continue

    if (( begin - from >= minimum )); then
      __notAnnotedEmit "$genome" "$from" "$(( begin - 1 ))"
    fi

    if (( end + 1 > from )); then
      from=$(( end + 1 ))
    fi
  done 3< <(annotatedParts "$annotations")

  if (( genomesize - from + 1 >= minimum )); then
    __notAnnotedEmit "$genome" "$from" "$genomesize"
  fi
}
|
|
|
|
#
|
|
# Process management related function
|
|
#
|
|
|
|
# Runs a command with a time limit.
#  - $1  : the time limit, handed to sleep (seconds)
#  - $2… : the command and its arguments, run in background exactly as
#          given (no extra word splitting)
# Returns: 0 when the command ends before the limit, whatever its own exit
#          status; 1 after killing it (SIGKILL) because the limit was
#          reached, in which case a warning is logged.
function timeoutcmd() {
  local seconde=$1
  shift

  "$@" &
  local mainpid=$!

  sleep "$seconde" &
  local sleeppid=$!

  # Poll until the command or the timer is gone. Fall back to a one second
  # pause where sleep refuses fractional values.
  while kill -0 "$mainpid" 2> /dev/null && kill -0 "$sleeppid" 2> /dev/null; do
    sleep 0.2 2> /dev/null || sleep 1
  done

  if kill -0 "$sleeppid" 2> /dev/null; then
    # The command ended first: stop the timer and collect both children.
    kill -9 "$sleeppid" 2> /dev/null
    wait "$sleeppid" 2> /dev/null
    wait "$mainpid" 2> /dev/null
    return 0
  fi

  if kill -0 "$mainpid" 2> /dev/null; then
    # The timer ended first: the command is still running.
    kill -9 "$mainpid" 2> /dev/null
    wait "$mainpid" 2> /dev/null
    logwarning "Timeout after ${seconde}s on command : $*"
    return 1
  fi

  return 0
}
|
|
|
|
|
|
#
|
|
#
|
|
########################
|
|
|
|
|
|
|
|
########################
|
|
#
|
|
# Local variable definitions
|
|
#
|
|
#
|
|
|
|
# The absolute path to the ORG.Annot home directory: the parent of the
# directory containing this file
ORG_HOME="$(getAbsolutePath "$(dirname "${BASH_SOURCE[0]}")/..")"

# The architecture running the ORG.Annot instance, as printed by the
# config/guess_port program
ORG_PORTNAME="$("${ORG_HOME}/config/guess_port")"

BIN_DIR="${ORG_HOME}/ports/${ORG_PORTNAME}/bin"   # Directory containing binaries for this port

SCRIPT_DIR="${ORG_HOME}/scripts"                  # Directory containing scripts utilities

PROG_DIR="$(getAbsolutePath "$(dirname "$0")")"   # Directory containing the main script file

LIB_DIR="$(getAbsolutePath "${PROG_DIR}/../lib")" # Directory containing the main script libraries

CALL_DIR="$(getAbsolutePath "$(pwd)")"            # Directory from where the main script is called


DATA_DIR="${ORG_HOME}/data"                       # Directory containing reference data for the annotation

IR_DATA_DIR="${DATA_DIR}/ir"                      # Directory containing data related to
                                                  # IRs detection

TRNA_DATA_DIR="${DATA_DIR}/trna"                  # Directory containing data related to
                                                  # tRNAs detection

CDS_DATA_DIR="${DATA_DIR}/cds"                    # Directory containing data related to
                                                  # CDSs detection

RRNA_DATA_DIR="${DATA_DIR}/rrna"                  # Directory containing data related to
                                                  # rRNAs detection

NUCRRNA_DATA_DIR="${DATA_DIR}/nucrrna"            # Directory containing data related to
                                                  # rRNAs detection (presumably the nuclear
                                                  # rRNAs, from the directory name -- TODO confirm)

ITS_DATA_DIR="${DATA_DIR}/its"                    # Directory containing data related to
                                                  # rRNAs detection (presumably the ITS regions,
                                                  # from the directory name -- TODO confirm)
|
|
|
|
|
|
#
|
|
#
|
|
########################
|
|
|
|
|
|
|
|
########################
|
|
#
|
|
# Altering the environment
|
|
#
|
|
#
|
|
|
|
# We alter the path to include the scripts directory and the bin directory
# corresponding to the port, ahead of the existing entries
PATH="${SCRIPT_DIR}:${BIN_DIR}:${PATH}"
export PATH

# Force the basic international setting for a correct behaviour of AWK on mac with float
export LANG=C
export LC_ALL=C
|
|
|