616fd2bb4454a8d4d55daa0e88e5ec38d6e33dd3

@@ -0,0 +1,73 @@


# cd-hit tuning values read by clusterize().
# CDHIT_ID is handed to cd-hit's -s option.
CDHIT_ID="0.7"
# CDHIT_DELTA is handed to cd-hit's -c option.
CDHIT_DELTA="0.8"


#######################################
# Print the records of a one-line-per-sequence FASTA file whose header
# contains one of the given IDs (header line plus the line that follows it).
# IDs are matched as fixed strings, not regular expressions, so characters
# such as '|', '.' or '+' inside an ID are taken literally.
# NOTE(review): matching is still by substring, so an ID that is a prefix of
# another header (">seq1" vs ">seq10") selects both — confirm IDs are
# prefix-free or tighten the match.
# Arguments: $1 - file with one ID per line
#            $2 - FASTA file, one header line followed by one sequence line
# Outputs:   matching records on stdout
#######################################
function _clusterize_extract() {
  # grep -A prints "--" between non-adjacent match groups; drop those lines.
  grep -F -f "$1" -A 1 -- "$2" | grep -v -- '^--$'
}

#######################################
# Cluster the sequences of "<prot>.fst" with cd-hit and write one FASTA
# file per cluster.
#
# The directory "<prot>/" is deleted and recreated, then filled with:
#   <prot>.cdhit.clstr                    cluster listing written by cd-hit
#   <prot>.cluster.<size>.<cluster>.fst   one file per cluster; <size> is the
#                                         member count padded to 5 digits,
#                                         <cluster> the cluster number padded
#                                         to 3 digits. Clusters with more than
#                                         one member are piped through
#                                         clustalo, singletons are copied.
# Members of clusters holding at least 5 sequences are additionally piped
# through formatfasta and appended to "<prot>.clean.fst" in the directory
# the function was called from; any previous version of that file is removed
# first.
#
# Globals:   CDHIT_DELTA (read), CDHIT_ID (read), AwkCmd (read, awk command)
# Commands:  cd-hit, fasta1line, clustalo, formatfasta
# Arguments: $1 - protein name; used as a file-name prefix relative to the
#                 current directory, so it must not contain '/'
# Outputs:   pushd/popd print the directory stack on stdout
# Returns:   non-zero if $1 is empty, the work directory cannot be created
#            or entered, or cd-hit fails; otherwise the status of popd
#######################################
function clusterize() {
  local prot="${1}"
  local fastain="${prot}.fst"
  local cdhitout="${prot}.cdhit"
  local fasta1="${prot}.1l.fst"
  local clean_out="${PWD}/${prot}.clean.fst"
  local ids cluster size fsize alignment

  # An empty name would turn the paths below into "/..." and ".fst".
  if [[ -z "${prot}" ]]; then
    printf 'clusterize: missing protein name\n' >&2
    return 1
  fi

  # Start from a clean slate so results of an earlier run cannot leak in.
  rm -f -- "${clean_out}"
  rm -rf -- "${prot}"
  mkdir -p -- "${prot}" || return

  # NOTE(review): per the cd-hit user guide -c is the sequence-identity
  # threshold and -s the length-difference cutoff, so the ID/DELTA names look
  # swapped relative to the options they feed. The values are passed exactly
  # as before — confirm the intent before changing either side.
  cd-hit -i "${fastain}" \
    -o "${prot}/${cdhitout}" \
    -c "${CDHIT_DELTA}" \
    -G 1 \
    -g 1 \
    -aL 0.95 \
    -s "${CDHIT_ID}" \
    -b 350 -p 1 \
    -d 0 -n 3 \
    || return

  fasta1line "${fastain}" > "${prot}/${fasta1}"

  # Everything below uses paths relative to the work directory; never go on
  # if it could not be entered, or the rm calls would hit the wrong place.
  pushd "${prot}" || return

  # The awk step appends to the ID lists, so stale ones must not survive.
  rm -f -- ./*.cluster.*.ids

  # Split the .clstr listing into one ID file per cluster: ">Cluster N"
  # lines set the current cluster, member lines carry ">id..." in field 3.
  # $AwkCmd is deliberately unquoted so it may carry arguments.
  $AwkCmd -v prot="${prot}" '
    /^>/ { cluster = $2; next }
    NF >= 3 {
      sub("\\.\\.*$", "", $3)
      filename = prot ".cluster." cluster ".ids"
      print $3 >> filename
      close(filename)
    }' "${cdhitout}.clstr"

  for ids in *.cluster.*.ids; do
    # With no match the glob stays literal; skip instead of processing it.
    [[ -e "${ids}" ]] || continue

    # The cluster number is the last dot-separated field before ".ids";
    # taking it from the end keeps this correct when prot contains dots.
    cluster="${ids%.ids}"
    cluster="${cluster##*.}"
    printf -v cluster '%03d' "${cluster}"

    # Arithmetic expansion strips the padding some wc implementations add.
    size=$(( $(wc -l < "${ids}") ))
    printf -v fsize '%05d' "${size}"

    alignment="${prot}.cluster.${fsize}.${cluster}.fst"

    if (( size > 1 )); then
      _clusterize_extract "${ids}" "${fasta1}" \
        | clustalo -i - > "${alignment}"
    else
      _clusterize_extract "${ids}" "${fasta1}" > "${alignment}"
    fi

    if (( size >= 5 )); then
      _clusterize_extract "${ids}" "${fasta1}" \
        | formatfasta >> "${clean_out}"
    fi

    rm -f -- "${ids}"
  done

  # Drop the intermediates; the .clstr listing and cluster files remain.
  rm -f -- "${fasta1}"
  rm -f -- "${cdhitout}"

  popd
}