From f9410e5aeee8ce55da41ba63e3ba679d7177ebfe Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 13 Oct 2008 10:39:14 +0000 Subject: [PATCH] Initial commit for this soft git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@175 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- .cproject | 151 +++++ .project | 77 +++ Documents/CalculTM.xls | Bin 0 -> 35840 bytes Licence_CeCILL_V2-en.txt | 506 ++++++++++++++++ Licence_CeCILL_V2-fr.txt | 512 ++++++++++++++++ src/Makefile | 63 ++ src/ecoprimer.c | 0 src/global.mk | 18 + src/libKMRK/KMRK.c | 1009 ++++++++++++++++++++++++++++++++ src/libKMRK/KMRK.h | 309 ++++++++++ src/libKMRK/KMRK_Seeds.c | 908 ++++++++++++++++++++++++++++ src/libKMRK/KMRK_Seeds.h | 126 ++++ src/libKMRK/KMRK_mask.c | 259 ++++++++ src/libKMRK/KMRK_mask.h | 37 ++ src/libKMRK/KMRK_merge_seeds.c | 123 ++++ src/libKMRK/KMRK_merge_seeds.h | 11 + src/libKMRK/Makefile | 25 + src/libKMRK/memory.c | 224 +++++++ src/libKMRK/memory.h | 105 ++++ src/libKMRK/repseek_types.h | 197 +++++++ src/libKMRK/sequence.h | 25 + src/libapat/CODES/dft_code.h | 14 + src/libapat/CODES/dna_code.h | 71 +++ src/libapat/CODES/prot_code.h | 51 ++ src/libapat/Gmach.h | 97 +++ src/libapat/Gtypes.h | 104 ++++ src/libapat/Makefile | 24 + src/libapat/apat.h | 173 ++++++ src/libapat/apat_parse.c | 369 ++++++++++++ src/libapat/apat_search.c | 339 +++++++++++ src/libapat/libstki.c | 379 ++++++++++++ src/libapat/libstki.h | 87 +++ src/libecoPCR/Makefile | 31 + src/libecoPCR/ecoError.c | 26 + src/libecoPCR/ecoIOUtils.c | 122 ++++ src/libecoPCR/ecoMalloc.c | 79 +++ src/libecoPCR/ecoPCR.h | 269 +++++++++ src/libecoPCR/ecoapat.c | 199 +++++++ src/libecoPCR/ecodna.c | 153 +++++ src/libecoPCR/ecofilter.c | 19 + src/libecoPCR/econame.c | 61 ++ src/libecoPCR/ecorank.c | 52 ++ src/libecoPCR/ecoseq.c | 223 +++++++ src/libecoPCR/ecotax.c | 329 +++++++++++ src/libtm/tm.c | 31 + 45 files changed, 7987 insertions(+) create mode 100644 .cproject create mode 100644 .project create mode 100644 Documents/CalculTM.xls create mode 100644 Licence_CeCILL_V2-en.txt create mode 100644 Licence_CeCILL_V2-fr.txt create mode 100644 src/Makefile create mode 100644 src/ecoprimer.c create mode 100644 src/global.mk create mode 100644 src/libKMRK/KMRK.c create mode 100644 src/libKMRK/KMRK.h create mode 100644 src/libKMRK/KMRK_Seeds.c create mode 100644 src/libKMRK/KMRK_Seeds.h create mode 100644 src/libKMRK/KMRK_mask.c create mode 100644 src/libKMRK/KMRK_mask.h create mode 100644 src/libKMRK/KMRK_merge_seeds.c create mode 100644 src/libKMRK/KMRK_merge_seeds.h create mode 100644 src/libKMRK/Makefile create mode 100644 src/libKMRK/memory.c create mode 100644 src/libKMRK/memory.h create mode 100644 src/libKMRK/repseek_types.h create mode 100644 src/libKMRK/sequence.h create mode 100644 src/libapat/CODES/dft_code.h create mode 100644 src/libapat/CODES/dna_code.h create mode 100644 src/libapat/CODES/prot_code.h create mode 100644 src/libapat/Gmach.h create mode 100644 src/libapat/Gtypes.h create mode 100644 src/libapat/Makefile create mode 100644 src/libapat/apat.h create mode 100644 src/libapat/apat_parse.c create mode 100644 src/libapat/apat_search.c create mode 100644 src/libapat/libstki.c create mode 100644 src/libapat/libstki.h create mode 100644 src/libecoPCR/Makefile create mode 100644 src/libecoPCR/ecoError.c create mode 100644 src/libecoPCR/ecoIOUtils.c create mode 100644 src/libecoPCR/ecoMalloc.c create mode 100644 src/libecoPCR/ecoPCR.h create mode 100644 src/libecoPCR/ecoapat.c create mode 100644 src/libecoPCR/ecodna.c create mode 100644 src/libecoPCR/ecofilter.c create mode 100644 src/libecoPCR/econame.c create mode 100644 src/libecoPCR/ecorank.c create mode 100644 src/libecoPCR/ecoseq.c create mode 100644 src/libecoPCR/ecotax.c create mode 100644 src/libtm/tm.c diff --git a/.cproject b/.cproject new file mode 100644 index 0000000..9f894fd --- /dev/null +++ b/.cproject @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.project b/.project new file mode 100644 index 0000000..ffa37e5 --- /dev/null +++ b/.project @@ -0,0 +1,77 @@ + + + ecoPrimers + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + ?name? + + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + + diff --git a/Documents/CalculTM.xls b/Documents/CalculTM.xls new file mode 100644 index 0000000000000000000000000000000000000000..094eb31189a8b9004c77d195be2a80a1abbb183c GIT binary patch literal 35840 zcmeHQ3v`s#oxk7AWby!#KzM`%VHiLGAy1wfA;Sa$mPeGReNZJNLx_YVO$LQ_&l+p& zRslEKmX)oo*tJ`#Zl$%TJqHhC*X_|3Tguk0+p|4QUAw1eyY8{B?dfij+28Mf?>EnH z!q8{n%$NMX``!QdzxV$B_kF+nm^0s-IP%z6D}Jg{u2ssd&PNNCH-T^AnIu=KQrF^v z_VdwbR5O*rWA3_yV&FYs4I=T}AP*=H#5B(b6@UssMWAt@@t_HyVo(VPSwKw$O#+pH zCWEGcrh=w{%0bgX6`)Gc4A2#znV?x9A80nn51Ip-3z`SI5_A=4K4<}GA?Rw*BG6(` z6{s3i1F8kpf$Bj^KubZ(K+8cZKr2DlfS5-~*Wq~;Xf>!26acLOHGyoIODOq%Z>Z`< zDE@m~F60;+gbV)o3V2s4B6qb;8~Q%D?Chfp7r$aPz}m=6gzFa7tNPV$wF71^%!|Dy zDVN23Yh%d9JrRcbv}B&zs`jcq_-g}>{c3~iQ9A__Ibqd>=blkKosaNY2eSMS!ziLj zuUc4`aOENA^{RHk5JE`di^Q`6;oG`r8$QcRIYHwpEI=vt=BZ!SoX%H+mhwW8pDLF+ z?s*Y+rQQH-Q++J^Yx}#}eZk(Yfq}Mm-yi#0BN*B&Vz|!oqMdOejvv9j7kCoz&8J6OO*_8QFd?z#4#U)Vo%r$Q_cqA9oA>@cx{*`H}EzQER;n`RHB4I(7p2 z>gG^?M_W&uer4g+NrsAgBjuGVvnzLISEGdGL6j2){!_#9sO!>S&Gb|wGfh#}9MqF@ z@K>Y!%BP<4`O!{qL%o^KpXphNy1v5M$%jxggj9$6a7M@1qLHXfbX7h`9YiO79=d80Z@;m9(*ES3*f{eSW#BK&z%NYWf6C=irGo`gniGc$ z#hf(k7PD;FE9T^3kC;=2^TkxdO3bN)dC`Z^%;~0rE%=WMylU2S?9Bc^o%^3W^-jT% zQcwOT9N_9;hj&;Ff8-kUZN8|~;Gl{`A}|J3`xEEYj~Cae=&uf{&2QYIIwwZe`7fFyol_EjjS}7Ogt(mu0iUj2rfU2Y*s2S7> zx)Eg|*w^2+C)AIwXmE4e`_PFBZt8@io=v#z*@V&4*0%1jZ|%;Vq4qFr8nTbfsPcOX z)CUkr#dICU8xT@`#l(dgzGbJJ^o*iKT3C>dFzWU6o^mi;f}Ramf7l47@)Bo4ee%{NSnRL7(<3?mz8NMs42yZ-<}R z^*Y??t9jCTwSLd*D%e;f@Acr2gSIUo4iqb;C+m(pEAQ9ebqx9(R5QLm2)ou#bclzG z_;>~M?O6CaBwXRUQPBs3T^)z_9ao$0d|s_T@cvJK^J(Hi{CH;K zxzW6j(BGj$>M6B(?{>T&Q2|DQzF6U;xPKdO&KUoGa4LE~Fnr~>+IZ%1#ux4WH(h;0 zhg{v-|NHMp`fP)nV!Rs!HCpOx>xaJZqZVMG`$h>5_~0kPzrTfcxSwk&|09T}LE7Or za4U;NT&*AMV0f_4rJv=#zc1BsE&h*y|IpB}lW(f#iAPUG4~RZP55I7dc<@e1cyy2S zp;huzEdB2vVfBH$(2Z9<_&-8@RP!Xnopvu+jC=4-Lr(GwOkNgv--Vg4^Q)b1cK$p7 zKMx$0{2Saii02k@bpdxqMnQfM&u+=T!n~?f(i>@q@LcA-7a05QLgc|dOtGOa{EOsI z*pH{;L(nfmq=5Rtoy5t|v42PTWO+I8HqD`9pEvIDp7{fDtKI%KtMTFF4Sevp z57qYsloAWIiVNNN5_z7e%MB{$b6@`D*-blIR^KN4g~GpD?jIM@9pU7>AcQ=Q|r3L zEBvch6*bIm+7fL2K+9TRUw79)*w=E?nvENRKL2907kz!*q3Y_U)+S%e#tmCreek2Y zdhKSv&%Y}i?z^VC`p!G=tZJiFReSFq3Jz4a^!N6K`osG-!n4J2P}LFc@FN)QdlDRQ zJG$D#tBML6c8B(_f)~a!)ZRVes+P9S(7N9KJ#Bd0w7$2$>!ZCr;kND@Ljw)fl!r6p z+}VA&t9^GUT-DwmY76)F+inUQ2Ey>ZGaXg&ZE_5bm896(-qwx0eEB3p zibAoHfcPJCc~c%l65ZqDk+0vvE&#dWny6Oj#k~$_O+qq zzpblhXYXxxd+O9qw`^)@X=-oVl0N(luCHfhOgs*?J) z7hu=dEvu_qS`)vPCUyvH$945}E30ak#jfQ^LJ6hhPhSwTj5}0{mwE0;)i6W$SiIaZ zg)^}0m(9!-Nudn0st)DX~mrzIlxkoptjf)va1zI*>?99*J@m{tWkJZ#v)uG>+ z-DlqL{=S5vsd1AUrnJs%sr4W@!SC1rZM3m$I#zq=m;if3|T*RBFJ-I6ID<%==X9v)ME@H)gX7n+@mo-+&iMz~HC*T`y1-Gnzq znENkzMmcNm5oP^Q5qcI+QHat97C%;EI%!nuObhuotq22cPDXXf9a-87N zyslPdh1Zfty)^IbSQwmo(!A}1%CZC>rmzhU4^~Kc<72!o@Y;mOWd+(T&|41=G7QF* z=FI|cmdJ5AvF4o_RMy-^$}wi*y!pc81eWHFK(6I_%5fo}<}v$h_mIbBftuF~UX_F~ zImTn$|K{VAn-b$;rX0%*$(tJE^&t+lzSCm7##p?R$9R;}X~d<8I$S7Z(!4UpL+WF- z1jCpS~a5#P8}#06fmcpRSFz*CK0EG~fG8f@VKz$<{w6U|c=*(W^}xKBZ4qRI^7 zjhO6CMjQ~ta0Orf_6V<8fZ)-H;fo_;w;LI(3qm9AQqclv z#CH7aSBe&SX z_eY?`psj`1Xfau|02+))dlj)`9@;*z71#p7D!11r}tXf5(Z3pOUyB42AU_=yN~8F|!b zQDC%~AzHAsQ;WW`ZPTemL7EoLQo6y1oQ_*E z+^s|{m^MV{i$IHVJ4cN(TFezKfJS777x+bsacNq#O6dk4a`7BBUTXp8j#>aU#N+M7 zc&$aUcwCEdq6NEs%u(Y+3$Dm1@!6uq_%tno zQZB$3IXFbxa($8HC^&c20;nM_T0FyYF+ppQEFRZld=l>Qj<~s^iMZzrZiFwk$hi+m zd7hAlTbBgzAr~(RSO&|I0OyXlffM5Kl2EL1CyU2%PY~SP@5cC^pmC?vrrc{n+zSOa zm*LVja`%S?cX1kSo%g|qoX!T!`z0DToIBzMPVp8#OWsGurkcs(aooj%n|twyyIA8+ zsd-r266qqrjqt_R@}kC#@}E*>b&Je;!H#eC=SnqhICsR&0%*jgMK0C2lf~n>*)uAU zK^SqDXxvBdiy(i;dJ-Hu5OMR?Qv`OI zfjvdT*4@a*kD^$5v7rnh!pqOfaBl~^!~i)uLCv!;SoB0Zk?Ey#U(#X>xsDn7nf(x( z3!p|UFOml_b%qV})T9unriI94pC%z%rLRUx)K91XERrWp`!sPm-C&)jT_#k0UNsw= zt_h;^&bOYt+4V7ifLHI0Q%#74ND}}>+=Ngr9&sfv11lGg68meE;b(yofYmb7bzli2 zHZOLnay(t4K6%_%{GAicjojhw#tv%)=84SwxjQc*)TY_WiZuCgV-8WFS6>Zy@hQL? z05JCy?<`5LNgrwSc-1C)18?`uc<-vPcnh7P%T*I1MSDAq%4F-|4^@LYfb^Z=TBsgS zkE%D-w^II|SE+x`VK4AC*b6)ti^uu7W;P&WoQVNc{8Ju38ERhqQ{J2y&xc&1KjqDh z@rLCy5#yHrl_($XZhSg~Ca^u+GwGhlx1+21o_?2z6i4@nHQ1EuYSiyohUhD846#Jy zDLKbTZV<^iijwnUl8n?VVe?{t-<*U(^<@+)d)%v;R@y@IMIRPb8hGTO*s>4e1*y6#_=5WUJJ_QG*m<+k) z@v@cVPqtkoWsP&JB#e=zz)33}?d8jt)qWiZrDBONHmf5WaZ1*?^sA%{ip7#+jFgGo z^7zz6{!F3V^7!=qj%M}B#cxE&Essy_JH;|sd=ZS}CeJ_Uj*xg*4*OR#uK#Cg$Ssf0 zGVU&`WtR4GEG!|6=Z;hjGh~kp>)kNBssnfdugqCw`Zg`4= z&n=IS9SFcw`yIAC<(9{f`4XC#$+NAvbIaos_R?O=_pmY2r?acL-17LuMduEQV|Rj0 zgrzPzkMC3-EfeIH$Hyi~?1zh49&ay&x835jWlrSC>`S9CGfm`Z7nlK(E8B^jvcB9z zPHrN{F8K)q9LM9EVjZKg`Q_AM&g0VmYU-dAlOZ>elbgtqbKD%tuKl4sH<6PNL*^rb z(~&K)!jy7ii@nVb#VvMTON{5(338lI%T44Y9Nm}nMdy#37`cg@+(Zr!luX#+i<0bg zaAj^HCwI)<$7m9f(kF5h!BauYep z=Px^aD*SWI&D-B`TQ((r#ZNh$kF|I#E5nkT$jO~^H2Q%?xpR(q8hp~QY>c+rb_I}| z$jMFQjQMkp{*V(nW=kQd1gI`d1X6@``h<{c+$@>h#PAk0Zjxw4q{#_#ZEz7wvOu-_AAycYN9JTUPaHt zPS7M?e8cOC6DMAK4F_sf{8}zmX-o_IA1(Y*&WEl0mij)Z;+Jx%N@Mz zq(bjs{D}EuDsx)RT!r_noRyZQjC5;l!A?loqbL;4H zO6>X(D>`RcDA{nI632doYWrWo7;I{~;f7mpjUi*L!n&jq*cPc$kD9P?Z+llqo3FXQ zt*0ZT+^Ac*<$MBSu}Ugh{JdAubc-4lAvap6DJs;$3057e<09;uw$S_u2dyDF%W+?$ zIcOWnX~KPr=Afk{=LX!jY0fguxfPr)%~`HFyTQ3bbI_*J6%J6u4}eKnD>dg{aFEY{ z=>eQQSc|rTau4DDIgxYKYTS>4^Mv8liOf^DKW8}jVO_ZT2JSBz&Jy8#3-?zH2lXOk z?D}{#ZQ}@3We4--wI|H~Ny_6hz%lwWaxuzbDCGx14Tm@akWmr0q&|-fC3~?sC^HLcGsc_39m@(Sq@= zR-pQ*XYukyb>KFFTSsod=B@;H6S(!lg%2^=4}!ZS&4*5Kmx^pqWLGR&0w2QQE+aSC ztQHL9Ev~Iugmj#*?goE3`OP}K6@uaca95=H@mX+JiXS$L74YGh`h}$9f-+aD9U?V3 zL4Gabx6Bo^{aye92I3;Kr|`7G#y za?rKddgy|3tXnFxkYi0UKMOh5FH5tKW396$3pv(BTe6U24Ye%`Io4ZUS;(HS;(>8d@c())}}9IA;+HR_cO?))}%fbQdILdqx|uD z8HCZq#a*L~KU}Z9`r0E^m8GBh8;m~|{qI*e{`fhFV^Xw>ss+SzQQi+i4^Oe1&x47> zAdVIO0>tsh5C}a`br8fc$EQIYpS%Eaf$Z@JtDOy-Z`=^5zUlgP2u6=5#)+SJmC>Au zz$nXck^TYY_+wrCv0jabAOqy!hrmR*J2c?y2>I}{ZtZ)!#UFbd1BnY>7DhQPR-?Fl z1@s)JtH@og^TRYj#6$2UkDx3S#LxWhQGMtZ>`=FBmltv8$zO@1%WRyYSE06NBbcSk q#fG1yw^@r_%Hem*D0#)Y4QqIks*I_tw(bhyS4`2<%Uzd!4E#UwyNh-J literal 0 HcmV?d00001 diff --git a/Licence_CeCILL_V2-en.txt b/Licence_CeCILL_V2-en.txt new file mode 100644 index 0000000..fcc8df2 --- /dev/null +++ b/Licence_CeCILL_V2-en.txt @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. + +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. + +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. + +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. + + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. + +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. + + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. + + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. + + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. + +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. + +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. + + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. + +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. + + +Version 2.0 dated 2006-09-05. diff --git a/Licence_CeCILL_V2-fr.txt b/Licence_CeCILL_V2-fr.txt new file mode 100644 index 0000000..1613fca --- /dev/null +++ b/Licence_CeCILL_V2-fr.txt @@ -0,0 +1,512 @@ + +CONTRAT DE LICENCE DE LOGICIEL LIBRE CeCILL + + + Avertissement + +Ce contrat est une licence de logiciel libre issue d'une concertation +entre ses auteurs afin que le respect de deux grands principes préside à +sa rédaction: + + * d'une part, le respect des principes de diffusion des logiciels + libres: accès au code source, droits étendus conférés aux + utilisateurs, + * d'autre part, la désignation d'un droit applicable, le droit + français, auquel elle est conforme, tant au regard du droit de la + responsabilité civile que du droit de la propriété intellectuelle + et de la protection qu'il offre aux auteurs et titulaires des + droits patrimoniaux sur un logiciel. + +Les auteurs de la licence CeCILL (pour Ce[a] C[nrs] I[nria] L[ogiciel] +L[ibre]) sont: + +Commissariat à l'Energie Atomique - CEA, établissement public de +recherche à caractère scientifique, technique et industriel, dont le +siège est situé 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris. + +Centre National de la Recherche Scientifique - CNRS, établissement +public à caractère scientifique et technologique, dont le siège est +situé 3 rue Michel-Ange, 75794 Paris cedex 16. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, établissement public à caractère scientifique et technologique, +dont le siège est situé Domaine de Voluceau, Rocquencourt, BP 105, 78153 +Le Chesnay cedex. + + + Préambule + +Ce contrat est une licence de logiciel libre dont l'objectif est de +conférer aux utilisateurs la liberté de modification et de +redistribution du logiciel régi par cette licence dans le cadre d'un +modèle de diffusion en logiciel libre. + +L'exercice de ces libertés est assorti de certains devoirs à la charge +des utilisateurs afin de préserver ce statut au cours des +redistributions ultérieures. + +L'accessibilité au code source et les droits de copie, de modification +et de redistribution qui en découlent ont pour contrepartie de n'offrir +aux utilisateurs qu'une garantie limitée et de ne faire peser sur +l'auteur du logiciel, le titulaire des droits patrimoniaux et les +concédants successifs qu'une responsabilité restreinte. + +A cet égard l'attention de l'utilisateur est attirée sur les risques +associés au chargement, à l'utilisation, à la modification et/ou au +développement et à la reproduction du logiciel par l'utilisateur étant +donné sa spécificité de logiciel libre, qui peut le rendre complexe à +manipuler et qui le réserve donc à des développeurs ou des +professionnels avertis possédant des connaissances informatiques +approfondies. Les utilisateurs sont donc invités à charger et tester +l'adéquation du logiciel à leurs besoins dans des conditions permettant +d'assurer la sécurité de leurs systèmes et/ou de leurs données et, plus +généralement, à l'utiliser et l'exploiter dans les mêmes conditions de +sécurité. Ce contrat peut être reproduit et diffusé librement, sous +réserve de le conserver en l'état, sans ajout ni suppression de clauses. + +Ce contrat est susceptible de s'appliquer à tout logiciel dont le +titulaire des droits patrimoniaux décide de soumettre l'exploitation aux +dispositions qu'il contient. + + + Article 1 - DEFINITIONS + +Dans ce contrat, les termes suivants, lorsqu'ils seront écrits avec une +lettre capitale, auront la signification suivante: + +Contrat: désigne le présent contrat de licence, ses éventuelles versions +postérieures et annexes. + +Logiciel: désigne le logiciel sous sa forme de Code Objet et/ou de Code +Source et le cas échéant sa documentation, dans leur état au moment de +l'acceptation du Contrat par le Licencié. + +Logiciel Initial: désigne le Logiciel sous sa forme de Code Source et +éventuellement de Code Objet et le cas échéant sa documentation, dans +leur état au moment de leur première diffusion sous les termes du Contrat. + +Logiciel Modifié: désigne le Logiciel modifié par au moins une +Contribution. + +Code Source: désigne l'ensemble des instructions et des lignes de +programme du Logiciel et auquel l'accès est nécessaire en vue de +modifier le Logiciel. + +Code Objet: désigne les fichiers binaires issus de la compilation du +Code Source. + +Titulaire: désigne le ou les détenteurs des droits patrimoniaux d'auteur +sur le Logiciel Initial. + +Licencié: désigne le ou les utilisateurs du Logiciel ayant accepté le +Contrat. + +Contributeur: désigne le Licencié auteur d'au moins une Contribution. + +Concédant: désigne le Titulaire ou toute personne physique ou morale +distribuant le Logiciel sous le Contrat. + +Contribution: désigne l'ensemble des modifications, corrections, +traductions, adaptations et/ou nouvelles fonctionnalités intégrées dans +le Logiciel par tout Contributeur, ainsi que tout Module Interne. + +Module: désigne un ensemble de fichiers sources y compris leur +documentation qui permet de réaliser des fonctionnalités ou services +supplémentaires à ceux fournis par le Logiciel. + +Module Externe: désigne tout Module, non dérivé du Logiciel, tel que ce +Module et le Logiciel s'exécutent dans des espaces d'adressage +différents, l'un appelant l'autre au moment de leur exécution. + +Module Interne: désigne tout Module lié au Logiciel de telle sorte +qu'ils s'exécutent dans le même espace d'adressage. + +GNU GPL: désigne la GNU General Public License dans sa version 2 ou +toute version ultérieure, telle que publiée par Free Software Foundation +Inc. + +Parties: désigne collectivement le Licencié et le Concédant. + +Ces termes s'entendent au singulier comme au pluriel. + + + Article 2 - OBJET + +Le Contrat a pour objet la concession par le Concédant au Licencié d'une +licence non exclusive, cessible et mondiale du Logiciel telle que +définie ci-après à l'article 5 pour toute la durée de protection des droits +portant sur ce Logiciel. + + + Article 3 - ACCEPTATION + +3.1 L'acceptation par le Licencié des termes du Contrat est réputée +acquise du fait du premier des faits suivants: + + * (i) le chargement du Logiciel par tout moyen notamment par + téléchargement à partir d'un serveur distant ou par chargement à + partir d'un support physique; + * (ii) le premier exercice par le Licencié de l'un quelconque des + droits concédés par le Contrat. + +3.2 Un exemplaire du Contrat, contenant notamment un avertissement +relatif aux spécificités du Logiciel, à la restriction de garantie et à +la limitation à un usage par des utilisateurs expérimentés a été mis à +disposition du Licencié préalablement à son acceptation telle que +définie à l'article 3.1 ci dessus et le Licencié reconnaît en avoir pris +connaissance. + + + Article 4 - ENTREE EN VIGUEUR ET DUREE + + + 4.1 ENTREE EN VIGUEUR + +Le Contrat entre en vigueur à la date de son acceptation par le Licencié +telle que définie en 3.1. + + + 4.2 DUREE + +Le Contrat produira ses effets pendant toute la durée légale de +protection des droits patrimoniaux portant sur le Logiciel. + + + Article 5 - ETENDUE DES DROITS CONCEDES + +Le Concédant concède au Licencié, qui accepte, les droits suivants sur +le Logiciel pour toutes destinations et pour la durée du Contrat dans +les conditions ci-après détaillées. + +Par ailleurs, si le Concédant détient ou venait à détenir un ou +plusieurs brevets d'invention protégeant tout ou partie des +fonctionnalités du Logiciel ou de ses composants, il s'engage à ne pas +opposer les éventuels droits conférés par ces brevets aux Licenciés +successifs qui utiliseraient, exploiteraient ou modifieraient le +Logiciel. En cas de cession de ces brevets, le Concédant s'engage à +faire reprendre les obligations du présent alinéa aux cessionnaires. + + + 5.1 DROIT D'UTILISATION + +Le Licencié est autorisé à utiliser le Logiciel, sans restriction quant +aux domaines d'application, étant ci-après précisé que cela comporte: + + 1. la reproduction permanente ou provisoire du Logiciel en tout ou + partie par tout moyen et sous toute forme. + + 2. le chargement, l'affichage, l'exécution, ou le stockage du + Logiciel sur tout support. + + 3. la possibilité d'en observer, d'en étudier, ou d'en tester le + fonctionnement afin de déterminer les idées et principes qui sont + à la base de n'importe quel élément de ce Logiciel; et ceci, + lorsque le Licencié effectue toute opération de chargement, + d'affichage, d'exécution, de transmission ou de stockage du + Logiciel qu'il est en droit d'effectuer en vertu du Contrat. + + + 5.2 DROIT D'APPORTER DES CONTRIBUTIONS + +Le droit d'apporter des Contributions comporte le droit de traduire, +d'adapter, d'arranger ou d'apporter toute autre modification au Logiciel +et le droit de reproduire le logiciel en résultant. + +Le Licencié est autorisé à apporter toute Contribution au Logiciel sous +réserve de mentionner, de façon explicite, son nom en tant qu'auteur de +cette Contribution et la date de création de celle-ci. + + + 5.3 DROIT DE DISTRIBUTION + +Le droit de distribution comporte notamment le droit de diffuser, de +transmettre et de communiquer le Logiciel au public sur tout support et +par tout moyen ainsi que le droit de mettre sur le marché à titre +onéreux ou gratuit, un ou des exemplaires du Logiciel par tout procédé. + +Le Licencié est autorisé à distribuer des copies du Logiciel, modifié ou +non, à des tiers dans les conditions ci-après détaillées. + + + 5.3.1 DISTRIBUTION DU LOGICIEL SANS MODIFICATION + +Le Licencié est autorisé à distribuer des copies conformes du Logiciel, +sous forme de Code Source ou de Code Objet, à condition que cette +distribution respecte les dispositions du Contrat dans leur totalité et +soit accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le Code Objet du Logiciel est redistribué, +le Licencié permette aux futurs Licenciés d'accéder facilement au Code +Source complet du Logiciel en indiquant les modalités d'accès, étant +entendu que le coût additionnel d'acquisition du Code Source ne devra +pas excéder le simple coût de transfert des données. + + + 5.3.2 DISTRIBUTION DU LOGICIEL MODIFIE + +Lorsque le Licencié apporte une Contribution au Logiciel, les conditions +de distribution du Logiciel Modifié en résultant sont alors soumises à +l'intégralité des dispositions du Contrat. + +Le Licencié est autorisé à distribuer le Logiciel Modifié, sous forme de +code source ou de code objet, à condition que cette distribution +respecte les dispositions du Contrat dans leur totalité et soit +accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le code objet du Logiciel Modifié est +redistribué, le Licencié permette aux futurs Licenciés d'accéder +facilement au code source complet du Logiciel Modifié en indiquant les +modalités d'accès, étant entendu que le coût additionnel d'acquisition +du code source ne devra pas excéder le simple coût de transfert des données. + + + 5.3.3 DISTRIBUTION DES MODULES EXTERNES + +Lorsque le Licencié a développé un Module Externe les conditions du +Contrat ne s'appliquent pas à ce Module Externe, qui peut être distribué +sous un contrat de licence différent. + + + 5.3.4 COMPATIBILITE AVEC LA LICENCE GNU GPL + +Le Licencié peut inclure un code soumis aux dispositions d'une des +versions de la licence GNU GPL dans le Logiciel modifié ou non et +distribuer l'ensemble sous les conditions de la même version de la +licence GNU GPL. + +Le Licencié peut inclure le Logiciel modifié ou non dans un code soumis +aux dispositions d'une des versions de la licence GNU GPL et distribuer +l'ensemble sous les conditions de la même version de la licence GNU GPL. + + + Article 6 - PROPRIETE INTELLECTUELLE + + + 6.1 SUR LE LOGICIEL INITIAL + +Le Titulaire est détenteur des droits patrimoniaux sur le Logiciel +Initial. Toute utilisation du Logiciel Initial est soumise au respect +des conditions dans lesquelles le Titulaire a choisi de diffuser son +oeuvre et nul autre n'a la faculté de modifier les conditions de +diffusion de ce Logiciel Initial. + +Le Titulaire s'engage à ce que le Logiciel Initial reste au moins régi +par le Contrat et ce, pour la durée visée à l'article 4.2. + + + 6.2 SUR LES CONTRIBUTIONS + +Le Licencié qui a développé une Contribution est titulaire sur celle-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable. + + + 6.3 SUR LES MODULES EXTERNES + +Le Licencié qui a développé un Module Externe est titulaire sur celui-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable et reste libre du choix du contrat régissant +sa diffusion. + + + 6.4 DISPOSITIONS COMMUNES + +Le Licencié s'engage expressément: + + 1. à ne pas supprimer ou modifier de quelque manière que ce soit les + mentions de propriété intellectuelle apposées sur le Logiciel; + + 2. à reproduire à l'identique lesdites mentions de propriété + intellectuelle sur les copies du Logiciel modifié ou non. + +Le Licencié s'engage à ne pas porter atteinte, directement ou +indirectement, aux droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs sur le Logiciel et à prendre, le cas échéant, à +l'égard de son personnel toutes les mesures nécessaires pour assurer le +respect des dits droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs. + + + Article 7 - SERVICES ASSOCIES + +7.1 Le Contrat n'oblige en aucun cas le Concédant à la réalisation de +prestations d'assistance technique ou de maintenance du Logiciel. + +Cependant le Concédant reste libre de proposer ce type de services. Les +termes et conditions d'une telle assistance technique et/ou d'une telle +maintenance seront alors déterminés dans un acte séparé. Ces actes de +maintenance et/ou assistance technique n'engageront que la seule +responsabilité du Concédant qui les propose. + +7.2 De même, tout Concédant est libre de proposer, sous sa seule +responsabilité, à ses licenciés une garantie, qui n'engagera que lui, +lors de la redistribution du Logiciel et/ou du Logiciel Modifié et ce, +dans les conditions qu'il souhaite. Cette garantie et les modalités +financières de son application feront l'objet d'un acte séparé entre le +Concédant et le Licencié. + + + Article 8 - RESPONSABILITE + +8.1 Sous réserve des dispositions de l'article 8.2, le Licencié a la +faculté, sous réserve de prouver la faute du Concédant concerné, de +solliciter la réparation du préjudice direct qu'il subirait du fait du +Logiciel et dont il apportera la preuve. + +8.2 La responsabilité du Concédant est limitée aux engagements pris en +application du Contrat et ne saurait être engagée en raison notamment: +(i) des dommages dus à l'inexécution, totale ou partielle, de ses +obligations par le Licencié, (ii) des dommages directs ou indirects +découlant de l'utilisation ou des performances du Logiciel subis par le +Licencié et (iii) plus généralement d'un quelconque dommage indirect. En +particulier, les Parties conviennent expressément que tout préjudice +financier ou commercial (par exemple perte de données, perte de +bénéfices, perte d'exploitation, perte de clientèle ou de commandes, +manque à gagner, trouble commercial quelconque) ou toute action dirigée +contre le Licencié par un tiers, constitue un dommage indirect et +n'ouvre pas droit à réparation par le Concédant. + + + Article 9 - GARANTIE + +9.1 Le Licencié reconnaît que l'état actuel des connaissances +scientifiques et techniques au moment de la mise en circulation du +Logiciel ne permet pas d'en tester et d'en vérifier toutes les +utilisations ni de détecter l'existence d'éventuels défauts. L'attention +du Licencié a été attirée sur ce point sur les risques associés au +chargement, à l'utilisation, la modification et/ou au développement et à +la reproduction du Logiciel qui sont réservés à des utilisateurs avertis. + +Il relève de la responsabilité du Licencié de contrôler, par tous +moyens, l'adéquation du produit à ses besoins, son bon fonctionnement et +de s'assurer qu'il ne causera pas de dommages aux personnes et aux biens. + +9.2 Le Concédant déclare de bonne foi être en droit de concéder +l'ensemble des droits attachés au Logiciel (comprenant notamment les +droits visés à l'article 5). + +9.3 Le Licencié reconnaît que le Logiciel est fourni "en l'état" par le +Concédant sans autre garantie, expresse ou tacite, que celle prévue à +l'article 9.2 et notamment sans aucune garantie sur sa valeur commerciale, +son caractère sécurisé, innovant ou pertinent. + +En particulier, le Concédant ne garantit pas que le Logiciel est exempt +d'erreur, qu'il fonctionnera sans interruption, qu'il sera compatible +avec l'équipement du Licencié et sa configuration logicielle ni qu'il +remplira les besoins du Licencié. + +9.4 Le Concédant ne garantit pas, de manière expresse ou tacite, que le +Logiciel ne porte pas atteinte à un quelconque droit de propriété +intellectuelle d'un tiers portant sur un brevet, un logiciel ou sur tout +autre droit de propriété. Ainsi, le Concédant exclut toute garantie au +profit du Licencié contre les actions en contrefaçon qui pourraient être +diligentées au titre de l'utilisation, de la modification, et de la +redistribution du Logiciel. Néanmoins, si de telles actions sont +exercées contre le Licencié, le Concédant lui apportera son aide +technique et juridique pour sa défense. Cette aide technique et +juridique est déterminée au cas par cas entre le Concédant concerné et +le Licencié dans le cadre d'un protocole d'accord. Le Concédant dégage +toute responsabilité quant à l'utilisation de la dénomination du +Logiciel par le Licencié. Aucune garantie n'est apportée quant à +l'existence de droits antérieurs sur le nom du Logiciel et sur +l'existence d'une marque. + + + Article 10 - RESILIATION + +10.1 En cas de manquement par le Licencié aux obligations mises à sa +charge par le Contrat, le Concédant pourra résilier de plein droit le +Contrat trente (30) jours après notification adressée au Licencié et +restée sans effet. + +10.2 Le Licencié dont le Contrat est résilié n'est plus autorisé à +utiliser, modifier ou distribuer le Logiciel. Cependant, toutes les +licences qu'il aura concédées antérieurement à la résiliation du Contrat +resteront valides sous réserve qu'elles aient été effectuées en +conformité avec le Contrat. + + + Article 11 - DISPOSITIONS DIVERSES + + + 11.1 CAUSE EXTERIEURE + +Aucune des Parties ne sera responsable d'un retard ou d'une défaillance +d'exécution du Contrat qui serait dû à un cas de force majeure, un cas +fortuit ou une cause extérieure, telle que, notamment, le mauvais +fonctionnement ou les interruptions du réseau électrique ou de +télécommunication, la paralysie du réseau liée à une attaque +informatique, l'intervention des autorités gouvernementales, les +catastrophes naturelles, les dégâts des eaux, les tremblements de terre, +le feu, les explosions, les grèves et les conflits sociaux, l'état de +guerre... + +11.2 Le fait, par l'une ou l'autre des Parties, d'omettre en une ou +plusieurs occasions de se prévaloir d'une ou plusieurs dispositions du +Contrat, ne pourra en aucun cas impliquer renonciation par la Partie +intéressée à s'en prévaloir ultérieurement. + +11.3 Le Contrat annule et remplace toute convention antérieure, écrite +ou orale, entre les Parties sur le même objet et constitue l'accord +entier entre les Parties sur cet objet. Aucune addition ou modification +aux termes du Contrat n'aura d'effet à l'égard des Parties à moins +d'être faite par écrit et signée par leurs représentants dûment habilités. + +11.4 Dans l'hypothèse où une ou plusieurs des dispositions du Contrat +s'avèrerait contraire à une loi ou à un texte applicable, existants ou +futurs, cette loi ou ce texte prévaudrait, et les Parties feraient les +amendements nécessaires pour se conformer à cette loi ou à ce texte. +Toutes les autres dispositions resteront en vigueur. De même, la +nullité, pour quelque raison que ce soit, d'une des dispositions du +Contrat ne saurait entraîner la nullité de l'ensemble du Contrat. + + + 11.5 LANGUE + +Le Contrat est rédigé en langue française et en langue anglaise, ces +deux versions faisant également foi. + + + Article 12 - NOUVELLES VERSIONS DU CONTRAT + +12.1 Toute personne est autorisée à copier et distribuer des copies de +ce Contrat. + +12.2 Afin d'en préserver la cohérence, le texte du Contrat est protégé +et ne peut être modifié que par les auteurs de la licence, lesquels se +réservent le droit de publier périodiquement des mises à jour ou de +nouvelles versions du Contrat, qui posséderont chacune un numéro +distinct. Ces versions ultérieures seront susceptibles de prendre en +compte de nouvelles problématiques rencontrées par les logiciels libres. + +12.3 Tout Logiciel diffusé sous une version donnée du Contrat ne pourra +faire l'objet d'une diffusion ultérieure que sous la même version du +Contrat ou une version postérieure, sous réserve des dispositions de +l'article 5.3.4. + + + Article 13 - LOI APPLICABLE ET COMPETENCE TERRITORIALE + +13.1 Le Contrat est régi par la loi française. Les Parties conviennent +de tenter de régler à l'amiable les différends ou litiges qui +viendraient à se produire par suite ou à l'occasion du Contrat. + +13.2 A défaut d'accord amiable dans un délai de deux (2) mois à compter +de leur survenance et sauf situation relevant d'une procédure d'urgence, +les différends ou litiges seront portés par la Partie la plus diligente +devant les Tribunaux compétents de Paris. + + +Version 2.0 du 2006-09-05. diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..078bc78 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,63 @@ +EXEC=ecoPrimer + +PRIMER_SRC= ecoprimer.c +PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC)) + + +SRCS= $(PRIMER_SRC) + +LIB= -lecoPCR -lapat -lKMRK -lz -lm + +LIBFILE= libapat/libapat.a \ + libecoPCR/libecoPCR.a \ + libKMRK/libKMRK.a + + +include global.mk + +all: $(EXEC) + + +######## +# +# ecoPrimer compilation +# +######## + +# executable compilation and link + +ecoPrimer: $(PRIMER_OBJ) $(LIBFILE) + $(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB) + + +######## +# +# library compilation +# +######## + +libapat/libapat.a: + $(MAKE) -C libapat + +libecoPCR/libecoPCR.a: + $(MAKE) -C libecoPCR + +libKMRK/libKMRK.a: + $(MAKE) -C libKMRK + + +######## +# +# project management +# +######## + +clean: + rm -f *.o + rm -f $(EXEC) + $(MAKE) -C libapat clean + $(MAKE) -C libecoPCR clean + $(MAKE) -C libKMRK clean + + + \ No newline at end of file diff --git a/src/ecoprimer.c b/src/ecoprimer.c new file mode 100644 index 0000000..e69de29 diff --git a/src/global.mk b/src/global.mk new file mode 100644 index 0000000..4f32e2e --- /dev/null +++ b/src/global.mk @@ -0,0 +1,18 @@ +MACHINE=MAC_OS_X +LIBPATH= -Llibapat -LlibecoPCR -LlibKMRK +MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< + +CC=gcc +CFLAGS= -W -Wall -O2 -g + +default: all + +%.o: %.c + $(CC) -D$(MACHINE) $(CFLAGS) -c -o $@ $< + +%.P : %.c + $(MAKEDEPEND) + @sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \ + rm -f $*.d; [ -s $@ ] || rm -f $@ + +include $(SRCS:.c=.P) \ No newline at end of file diff --git a/src/libKMRK/KMRK.c b/src/libKMRK/KMRK.c new file mode 100644 index 0000000..e768f75 --- /dev/null +++ b/src/libKMRK/KMRK.c @@ -0,0 +1,1009 @@ +#include "KMRK.h" + +#include "repseek_types.h" +#include "memory.h" + +#include +#include +#include +#include + + +/******************************************** + ******************************************** + ** + ** Declaration of static functions + ** + ******************************************** + ********************************************/ + + +/** + * @brief Allocate a couple of vector v and n of same size. + * Never return if memory space is not available. + * + * @param size size of the new allocated vector couple + * @param count number of sequence encodedable in the VN + * structure allocated + * + * @return a pointer to a vn_type structure + */ + +static vn_type* new_vn(int32_t size,int32_t count); + + + + + +/** + * Set to 0 all elements of the n vector of a + * vn structure. + * + * @param x a pointer to a vn_type structure + */ + +static void clearNVector(vn_type* x); + + +/** + * Set to 0 all elements of the v vector of a + * vn structure. + * + * @param x a pointer to a vn_type structure + */ + +static void clearVVector(vn_type* x); + + +/* static int32_t *copyNVector(vn_type *x, int32_t *dest); */ + + +/** + * @brief Encode a nucleotide in a numeric format. + * + * @param nucleic a char value containing A,C,G or T. + * All others values induce 0 as result. + * + * @return an interger code corresponding to the nucleotide + */ + +static int32_t code(char nucleic); + + +/** + * @brief Construct linked list of similar symboles present in + * v part of x. + * + * Equivalent to filling of P stack in standard implementation + * + * Result is stored in n part of x. + * + * @param x a pointer to a vn_type structure + */ + +static void phase1(vn_type* x); + + +/* static void unroll1(vn_type* x); */ + + +static void phase2(vn_type* x, int32_t k); + + +static void unroll2(vn_type* x); + + +static void unstack(vn_type* x, int32_t k); + +static vn_type *hashSequence(char *sequence, int32_t lmax, int32_t *k, + masked_area_table_t *mask); + +static void clearNMark(vn_type *x); + +static vn_type* encodeSequence(char *sequence, + masked_area_table_t *mask); + +static void applyQuorum(vn_type *x, + int32_t count, + int32_t countSeq, + KMRK_QUORUM_FUNC_PTR(quorum)); + +/******************************************** + ******************************************** + ** + ** Defintion of static functions + ** + ******************************************** + ********************************************/ + + + +static int32_t code(char nucleic) +{ + + if(nucleic == 'A') + return 1; + if(nucleic == 'C') + return 2; + if(nucleic == 'G') + return 3; + if(nucleic == 'T') + return 4; + + return 0; +} + + +/*static void displayVN(vn_type * x) +{ + int32_t i; + + for (i=0; i < x->size; i++) + { + printf("%-4i => %-4i ; %-4i\n",i+1,x->v[i],x->n[i]); + } + + printf("\n"); +}*/ + +static void clearNVector(vn_type* x) +{ + memset(x->n,0,sizeof(int32_t)*x->size); +}; + +static void clearVVector(vn_type* x) +{ + memset(x->v,0,sizeof(int32_t)*x->size); +}; + +static void clearNMark(vn_type *x) +{ + int32_t i; + int32_t *n; + int32_t size; + + n = x->n; + size = x->size; + + for (i=0; i < size; i++) + if (n[i] < 0) n[i] = -n[i]; + +} + +/* +static int32_t *copyNVector(vn_type *x, int32_t *dest) +{ + int32_t* reponse; + + if (dest) + reponse=dest; + else + KMRK_MALLOC(reponse,int32_t,x->size * sizeof(int32_t), + "copyNVector: I cannot not allocate a copy " + "of n vector"); + + memcpy(reponse,x->n,x->size*sizeof(int32_t)); + + return reponse; +} +*/ + +static vn_type* new_vn(int32_t size,int32_t count) +{ + vn_type* reponse; + + reponse = MyCalloc( 1, sizeof(vn_type)+sizeof(int32_t) * (count -1) , "vn_type: first calloc" ); + + reponse -> v = (int32_t *)MyCalloc(size, sizeof(int32_t), + "new_vn: I cannot not allocate a new VN structure"); + reponse->n = (int32_t *)MyCalloc(size, sizeof(int32_t), + "new_vn: I cannot not allocate a new VN structure"); + + + reponse->size = size; + reponse->seqCount= count; + + return reponse; +} + + + +static void phase1(vn_type* x) +{ + + int32_t *v; + int32_t *n; + int32_t size; + int32_t i; + int32_t symbole; + int32_t oldlast; + + v = GETVECTOR(x,v); + n = GETVECTOR(x,n); + size = x->size; + + clearNVector(x); + + for(i = 1; i <= size; i++) + { + symbole = GET(v,i); + if (symbole == i) + + SET(n,i,i); + + else if (symbole) + { + oldlast = GET(n,symbole); + SET(n,symbole,i); + SET(n,i,GET(n,oldlast)); + SET(n,oldlast,i); + } + } + +} + + + +static void phase2(vn_type* x, int32_t k) +{ + + int32_t *v; /* vector v of the stack 'x' */ + int32_t *n; /* vector n of the stack 'x' */ + int32_t size; /* size of the vector v of stack 'x' */ + int32_t i; + int32_t j; + int32_t next; + int32_t jump; + int32_t pi; + int32_t nipi; + int32_t symbmax; + + v = GETVECTOR(x,v); + n = GETVECTOR(x,n); + size = x->size - k; + symbmax = x->symbmax; + + for (j=1; j <= symbmax; j++) + if (SYMBOLE(v,j) == j) + { /* debut de chaine rouge */ + next = j; + do + { + i = next; + next = GET(n,i); + + jump = SYMBOLE(v,i+k); + + if (jump) + { + pi = GET(v,jump); + nipi = GET(n,pi); + if ((! IS_MARKED(n,pi)) || + ( SYMBOLE(v,i) != SYMBOLE(v,nipi)) || + ( jump != SYMBOLE(v,nipi+k))) + { + /* beginning of a green list */ + SETMARKED(v,jump,i); + //MARK(v,jump); + + SETMARKED(n,i,i); + //MARK(n,i); + } + else + { + SET(n,pi,i); /* le nouveau dernier */ + SET(n,i,GET(n,nipi)); /* */ + SET(n,nipi,i); + MARK(n,pi); + } + } + + } while (( i != next ) && + (next <= size)); + + } +} + + +static void unroll2(vn_type* x) +{ + int32_t *v; + int32_t *n; + int32_t size; + int32_t i; + int32_t second; + int32_t last; + + v = GETVECTOR(x,v); + n = GETVECTOR(x,n); + size = x->size; + + for(i = 1; i <= size; i++) + if (IS_MARKED(n,i)) + { + last = GET(n,i); + second = GET(n,last); + SET(n,last,last); + SETMARKED(n,i,second); + } +} + + + +static void unstack(vn_type* x, int32_t k) +{ + int32_t *v; + int32_t *n; + int32_t size; + int32_t i; + int32_t j; + int32_t classe=0; + int32_t next; + + + v = GETVECTOR(x,v); + n = GETVECTOR(x,n); + size = x->size; + + clearVVector(x); + + for(j = 1; j <= size; j++) + if (IS_MARKED(n,j)) + { + + classe=j; + next =j; + + do + { + + i = next; + next = GET(n,i); + SET(v,i,classe); + + } while (i != next); + } + + x->symbmax=classe; + x->size-=k; +} + + +static vn_type *hashSequence(char *sequence, + int32_t lmax, + int32_t *k, + masked_area_table_t *mask) +{ + vn_type *reponse; + int32_t ws; + int32_t clef; + int32_t *n; + int32_t *v; + int32_t size; + int32_t symbole; + int32_t word; + int32_t i; + int32_t maskZero; + int32_t maskClef; + int32_t zero; + int32_t count; + char* index; + int32_t symbmax; + int32_t posinseq; + + posinseq=0; + + size=0; + count=1; + symbmax = 0; + for (index=sequence; *index; index++) + { + size++; + if (*index == '@') + count++; + } + + reponse = new_vn(size,count); + + n = GETVECTOR(reponse,n); + v = reponse->v; + + /* estimate hash word size */ + + ws = (int32_t)floor(log((double)size+1)/log(4.0)); + if (ws > lmax) + ws=lmax; + + if (ws <= 1) + { + *k=1; + return NULL; + } + + maskClef=0; + maskZero=0; + clef =0; + zero =0; + count =0; + + maskZero= (1 << ws) - 1; + maskClef= (1 << (ws*2)) - 1; + + + for (i=0; i < ws-1; i++) + { + if (sequence[i]=='@') + { + reponse->limits[count]=i; + count++; + posinseq=0; + } + symbole = code(sequence[i]); + + if (KMRK_isMasked(mask,count,posinseq)) + symbole=0; + + clef <<= 2; + clef |= symbole-1; + + zero <<= 1; + zero |= (symbole) ? 0:1; + posinseq++; + } + + clef &= maskClef; + zero &= maskZero; + + for (i=ws-1; i < size; i++) + { + if (sequence[i]=='@') + { + reponse->limits[count]=i; + count++; + posinseq=0; + } + symbole = code(sequence[i]); + + if (KMRK_isMasked(mask,count,posinseq)) + symbole=0; + + clef <<= 2; + clef |= symbole-1; + clef &= maskClef; + + + zero <<= 1; + zero |= (symbole) ? 0:1; + zero &= maskZero; + + word = (zero) ? 0:(clef+1); + + if (word) + { + symbole = GET(n,word); + if (!symbole) + { + symbole = i+2-ws; + SET(n,word,symbole); + } + } + else + symbole=0; + + if (symbole > symbmax) + symbmax = symbole; + + v[i-ws+1]=symbole; + posinseq++; + } + + reponse->limits[count]= size; + reponse->symbmax = symbmax; + + for (i=reponse->size; i < size; i++) + v[i]=0; + + *k=ws; + + return reponse; +} + +static vn_type* encodeSequence(char *sequence,masked_area_table_t *mask) +{ + int32_t size; + vn_type *reponse; + int32_t i; + int32_t nucleotide; + int32_t symbole; + int32_t *n; + int32_t *v; + int32_t count; + char* index; + int32_t symbmax; + int32_t posinseq; + + size=0; + count=1; + posinseq=0; + + for (index=sequence; *index; index++) + { + size++; + if (*index == '@') + count++; + } + + reponse = new_vn(size,count); + + n = GETVECTOR(reponse,n); + v = reponse->v; + count = 0; + symbmax = 0; + + for (i=0; i < size; i++) + { + if (sequence[i]=='@') + { + reponse->limits[count]=i; + count++; + posinseq=0; + } + nucleotide = code(sequence[i]); + + if (KMRK_isMasked(mask,count,posinseq)) + nucleotide=0; + + + if (nucleotide) + { + symbole = GET(n,nucleotide); + if (!symbole) + { + symbole = i + 1; + SET(n,nucleotide,symbole); + } + } + else + symbole=0; + + if (symbole > symbmax) + symbmax = symbole; + + v[i]=symbole; + posinseq++; + } + + reponse->limits[count]= size; + reponse->symbmax = symbmax; + return reponse; +} + + +static void applyQuorum(vn_type *x, + int32_t count, + int32_t countSeq, + KMRK_QUORUM_FUNC_PTR(quorum)) +{ + int32_t *n; + int32_t *v; + int32_t size; + int32_t i; + + n = GETVECTOR(x,n); + v = GETVECTOR(x,v); + size = x->size; + + for (i=1; i < size; i++) + if ((IS_MARKED(n,i)) && + (! quorum(x,i,count,countSeq))) + UNMARK(n,i); +} + + + + +/******************************************** + ******************************************** + ** + ** Defintion of public functions + ** + ******************************************** + ********************************************/ + +/* + Count the number of direct pairs in the KMR-k result + input 'x' is the input stack -- see KMRK.h + vector n is a chained list where each n[i] is the next occurence of i +*/ +int32_t KMRK_upperCoupleCount(vn_type *x) +{ + int32_t i; /* i and j, dummy counters */ + int32_t j; + int32_t *n; /* the n vector of stack 'x' */ + int32_t size; /* its size */ + int32_t reponse; /* number of pairs */ + int32_t lllength; /* number of occurence */ + int32_t next; /* the next occurence in vector 'n' of stack 'x' */ + int32_t xmax; /* limit of the first sequence: from 1 to xmax */ + + xmax = x->limits[0]; + + reponse = 0; + + n = GETVECTOR(x,n); /* set n as the vector x->n */ + size = x->size; /* set its size */ + + for (j=1; j < xmax; j++) /* for all pos of the first sequence */ + if (IS_MARKED(n,j)) /* check if it is marked */ + { + next = j; + lllength = 0; + do + { + i = next; + next = GET(n,i); /* get the next occurence position */ + lllength++; + + } while ((next <= xmax) && (i != next)); /* while the next occurence are in direct seq * + * and not themselves */ + + + reponse += lllength * (lllength - 1) / 2; /* lllength choose 2 */ + + } + + return reponse; +} + + +/* + Count the number of inverted pairs in the KMR-k result + input 'x' is the input stack -- see KMRK.h + vector n is a chained list where each n[i] is the next occurence of i +*/ +int32_t KMRK_upperInvertedCount(vn_type* x,int32_t wordsize) +{ + int32_t i; /* i and j, dummy counters */ + int32_t j; + int32_t *n; /* the n vector of stack 'x' */ + int32_t size; /* its size */ + int32_t reponse; /* number of pairs with at least one inverted occurence */ + int32_t direct; /* occurence in the direct sequence */ + int32_t inverted; /* ccurence in the inverted sequence */ + int32_t next; /* the next occurence in vector 'n' of stack 'x' */ + int32_t posinv; + int32_t posdeb; + + /* + if there was no inverted sequence, there is no inverted seeds + */ + if ((x->seqCount == 1) || + (! x->complement)) + return 0; + + reponse = 0; + + n = GETVECTOR(x,n); /* set n as the vector n of stack 'x' */ + size = x->size; /* set its size */ + + for (j=1; j <= x->limits[0]; j++) /* for all position of the first (direct) sequence */ + if( IS_MARKED(n,j) ) /* if it is tagged (negative) */ + { + posdeb = j; + next = j; + direct = 0; + inverted = 0; + + do + { + i = next; /* set i as the position to check */ + next = GET(n,i); /* next is the symbol n[i] -- pos of the next occurence */ + + if ((i > x->limits[0]) && + (i < x->limits[1]) ) /* if it position is in seq inverted */ + inverted++; + else + direct++; + + } while (i != next); /* when the next occurence is the same pos, stop */ + + posinv = 2 * x->limits[0] - i -wordsize + 3; + if (inverted && (posinv == posdeb)) + reponse += direct; + reponse += inverted * direct; /* the number of occurence for this seed. * + * NB: if inv is 0, there is no inv occurence */ + } + +/* return (reponse+1)/2; .... weird ?? */ + + return reponse / 2; +} + +int32_t KMRK_upperInterCount(vn_type* x,int32_t seq1,int32_t seq2,int32_t wordsize) +{ + int32_t i; + int32_t j; + int32_t *n; + int32_t size; + int32_t reponse; + int32_t seqcount1; + int32_t seqcount2; + int32_t next; + + int min1,min2,max1,max2; + + if (seq1 > seq2) + { + i=seq1; + seq1=seq2; + seq2=i; + } + + if (seq1==seq2 && seq1 == 0) + return KMRK_upperCoupleCount(x); + + if (seq1==0 && seq2==1 && x->complement) + return KMRK_upperInvertedCount(x,wordsize); + + reponse = 0; + + n = GETVECTOR(x,n); + size = x->size; + + if (seq1==0) + min1=0; + else + min1 = x->limits[seq1-1]; + + max1 = x->limits[seq1]; + + min2 =x->limits[seq2-1]; + max2 =x->limits[seq2]; + + for (j=1; j <= max1; j++) + if (IS_MARKED(n,j)) + { + next = j; + seqcount1 = 0; + seqcount2 = 0; + do + { + i = next; + next = GET(n,i); + if ((i > min1) && + (i <=max1)) + seqcount1++; + else if ((i > min2) && + (i <=max2)) + seqcount2++; + } while ((i != next) && (next <= max2)); + + reponse += seqcount1 * seqcount2; + } + + /*fprintf(stderr,"\ncouple count : %i\n",reponse);*/ + return reponse; +} + + +void KMRK_markStart(vn_type* x) +{ + int32_t *v; + int32_t *n; + int32_t size; + int32_t i; + + v = GETVECTOR(x,v); + n = GETVECTOR(x,n); + size = x->size; + + for(i = 1; i <= size; i++) + if (SYMBOLE(v,i) == i) + MARK(n,i); + else + UNMARK(n,i); + + +} + + +int32_t KMRK_CoupleQuorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq) +{ + return (GET(GETVECTOR(x,n),pos) != pos) ? 1:0; +} + +int32_t KMRK_DirInvQuorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq) +{ + return ((pos >= x->limits[0]) || + (GET(GETVECTOR(x,n),pos) == pos)) ? 0:1; +} + +int32_t KMRK_InvQuorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq) +{ + return ((pos < x->limits[0]) && + (GET(GETVECTOR(x,n),pos) >= x->limits[0])) ? 1:0; +} + +int32_t KMRK_Int12Quorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq) +{ + return ((pos < x->limits[0]) && + (GET(GETVECTOR(x,n),pos) >= x->limits[0])) ? 1:0; +} + + +int32_t KMRK_IntInv12Quorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq) +{ + return ((pos < x->limits[1]) && + (pos >= x->limits[0]) && + (GET(GETVECTOR(x,n),pos) >= x->limits[1])) ? 1:0; +} + +int32_t KMRK_IntDirInv12Quorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq) +{ + return ((pos < x->limits[1]) && + (GET(GETVECTOR(x,n),pos) >= x->limits[1])) ? 1:0; +} + + + + +void KMRK_RunKMRK(vn_type *x, + int32_t k, + int32_t count, + int32_t countSeq, + KMRK_QUORUM_FUNC_PTR(quorum)) +{ + clearNMark(x); + phase2(x,k); + applyQuorum(x,count,countSeq,quorum); + unroll2(x); + unstack(x,k); +} + +vn_type *KMRK_HashOneSequence(char *sequence, + int32_t complement, + int32_t count, + int32_t countSeq, + int32_t *k, + KMRK_QUORUM_FUNC_PTR(quorum), + masked_area_table_t *mask) +{ + + int32_t i; + int32_t *n; + int32_t *v; + int32_t size; + vn_type *reponse; + int32_t lmax; + + lmax = *k; + + reponse = hashSequence(sequence,lmax,k,mask); + + if (!reponse) + { + reponse = encodeSequence(sequence,mask); + *k=1; + } + + reponse->complement = complement; + + phase1(reponse); + + n = GETVECTOR(reponse,n); + v = GETVECTOR(reponse,v); + size = reponse->size; + + for (i=1; i <= size; i++) + if (SYMBOLE(v,i) == i) + MARK(n,i); + + applyQuorum(reponse,count,countSeq,quorum); + + unroll2(reponse); + unstack(reponse,(*k)-1); + + /* fprintf(stderr, "Hash up to size %d\n",*k); */ + + return reponse; +} + +vn_type* KMRK_InitOneSequence(char *sequence, + int32_t complement, + int32_t count, + int32_t countSeq, + int32_t *k, + KMRK_QUORUM_FUNC_PTR(quorum), + masked_area_table_t *mask) +{ + + vn_type *reponse; + int32_t i; + int32_t *n; + int32_t *v; + int32_t size; + + reponse = encodeSequence(sequence,mask); + reponse->complement = complement; + + phase1(reponse); + + n = GETVECTOR(reponse,n); + v = GETVECTOR(reponse,v); + size = reponse->size; + + for (i=1; i <= size; i++) + if (SYMBOLE(v,i) == i) + MARK(n,i); + + applyQuorum(reponse,count,countSeq,quorum); + + unroll2(reponse); + unstack(reponse,0); + *k=1; + return reponse; + +} + + + +vn_type *KMRK_RunTo(char *sequence, + int32_t size, + int32_t complement, + int32_t count, + int32_t countSeq, + KMRK_QUORUM_FUNC_PTR(quorum), + KMRK_INIT_FUNC_PTR(init,quorum), + masked_area_table_t *mask) +{ + + int32_t k; + vn_type *reponse; + + k=size; + + reponse = init(sequence,complement,count,countSeq,&k,quorum,mask); + + while(k <= size/2) + { + /*fprintf(stderr,"KMRK step %d\n",k);*/ + KMRK_RunKMRK(reponse,k,count,countSeq,quorum); + k*=2; + } + + if (k < size) + KMRK_RunKMRK(reponse,size - k,count,countSeq,quorum); + + return reponse; +} + + +void KMRK_FreeVN(vn_type *x) +{ + if (x) + { + if (x->v) MyFree(x->v, x->size*sizeof(int32_t)); + if (x->n) MyFree(x->n, x->size*sizeof(int32_t)); + MyFree(x, 1*sizeof(vn_type)); + } +} + + diff --git a/src/libKMRK/KMRK.h b/src/libKMRK/KMRK.h new file mode 100644 index 0000000..fba733e --- /dev/null +++ b/src/libKMRK/KMRK.h @@ -0,0 +1,309 @@ +#ifndef _KMRK_H_ +#define _KMRK_H_ + +/******************************************** + ******************************************** + ** + ** Declaration of struct + ** + ******************************************** + ********************************************/ + +#include +#include "repseek_types.h" +#include "KMRK_mask.h" + +/** + * Structure used to manipulate simultanously + * the v and n vector + * + */ + +typedef struct { + + int32_t size; /**< size of vectors */ + int32_t seqCount; /**< count of concatenated sequences */ + int32_t complement; /**< if seqCount > 1 and complement !=0 + * then second sequence is the inverted complement + * strand of first one */ + int32_t symbmax; + int32_t* v; /**< sequence vector */ + int32_t* n; /**< linked list vector */ + int32_t limits[1]; /**< array of end limits of concatenated + * sequences in v (array size is seqCount) */ +} vn_type; + + + +/******************************************** + ******************************************** + ** + ** Declaration of public macro + ** + ******************************************** + ********************************************/ + +// Return a pointer to a vector from a vn_type structure +#define GETVECTOR(x,vector) (((x)->vector) - 1) + +#define IS_MARKED(x,i) ((x)[i] < 0) +#define MARK(x,i) ((x)[i]) = -ABS((x)[i]) +#define UNMARK(x,i) ((x)[i]) = ABS((x)[i]) + +#define SET(x,i,v) ((x)[i]) = (v) + +// set and mark in one operation +#define SETMARKED(x,i,v) ((x)[i]) = -(v) + +//internal macro +#define GET(x,i) ABS((x)[i]) + +// get symbole at position i in vector x +#define SYMBOLE(x,i) ((IS_MARKED((x),(i))) ? (i): (GET(x,i))) + + + +/** + * Macro used to declare a pointer to a quorum function. + * + * @param name name of the pointer + * + */ + +#define KMRK_QUORUM_FUNC_PTR(name) int32_t (*name)(vn_type* x, \ + int32_t pos, \ + int32_t count, \ + int32_t countSeq) + + + + +/** + * Macro used to declare a pointer to an initialisation function. + * + * @param name name of the pointer + * @param quorum name used for the quorum assiciated function + * + * @see KMRK_QUORUM_FUNC_PTR + * + */ + +#define KMRK_INIT_FUNC_PTR(name,quorum) vn_type* (*name)(char *sequence, \ + int32_t complement, \ + int32_t count, \ + int32_t countSeq, \ + int32_t *k, \ + KMRK_QUORUM_FUNC_PTR(quorum),\ + masked_area_table_t *mask) + + +/******************************************** + ******************************************** + ** + ** Declaration of public functions + ** + ******************************************** + ********************************************/ + + + + + +/** + * Initialise a vn_type record from one sequence to run KMRK algorithm + * + * @param sequence pointer to a C string containing the sequence + * @param complement != 0 means that seq one and two are the two strands of + * the same sequence. + * @param count parameter count passed to the quorun function + * @param countSeq parametter countSeq passed to the quorun function + * @param k length of the word represented by each symbole of v. + * k is an output parametter + * @param quorum pointer to a quorum function + * + * @return a pointer to vn_type structure correctly initialized + * to be used by KMRK_RunKMRK + * + * @see KMRK_HashOneSequence + */ + +vn_type* KMRK_InitOneSequence(char *sequence, + int32_t complement, + int32_t count, + int32_t countSeq, + int32_t *k, + KMRK_QUORUM_FUNC_PTR(quorum), + masked_area_table_t *mask); + + + + + +/** + * Initialise a vn_type record from one sequence to run KMRK algorithme. + * In more than KMRK_InitOneSequence, KMRK_HashOneSequence construct + * word of len k with an hash algorithm. k used is a function of + * sequence size and alphabet size. If calculed k is superior to lmax + * then k = lmax. + * + * @param sequence pointer to a C string containing the sequence + * @param complement != 0 means that seq one and two are the two strands of + * the same sequence. + * @param count parametter count passed to the quorun function + * @param countSeq parametter countSeq passed to the quorun function + * @param k maximum length of the created word (input) + * length of the word represented by each symbole + * of v (output). + * @param quorum pointer to a quorum function + * + * @return a pointer to vn_type structure correctly initialized + * to be used by KMRK_RunKMRK + * + * @see KMRK_InitOneSequence + */ + +vn_type* KMRK_HashOneSequence(char *sequence, + int32_t complement, + int32_t count, + int32_t countSeq, + int32_t *k, + KMRK_QUORUM_FUNC_PTR(quorum), + masked_area_table_t *mask); + + +/** + * An example of quorum function testing than a factor is + * present a least two times. Because of definition of this + * quorum function count and countSeq parametter have no meanning + * in this instance of quorum function + * + * @param x a pointer to vn_type structure to check + * @param pos position in n of the beginning of the linked list to test + * @param count minimal number of occurence of factor + * @param countSeq minimal number of sequences concerned + * + * @return 1 if quorum is ok 0 otherwise. + */ + +int32_t KMRK_CoupleQuorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq); + + +/** + * An example of quorum function testing than a factor is + * present a least two times in the direct strand of a sequence or + * at least one time in the direct strand and one time in the reverse + * strand. Because of definition of this + * quorum function count and countSeq parametter have no meanning + * in this instance of quorum function + * + * @param x a pointer to vn_type structure to check + * @param pos position in n of the beginning of the linked list to test + * @param count minimal number of occurence of factor + * @param countSeq minimal number of sequences concerned + * + * @return 1 if quorum is ok 0 otherwise. + */ + +int32_t KMRK_DirInvQuorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq); + +/** + * An example of quorum function testing than a factor is + * present a least one time in the direct strand and one time in the reverse + * strand. Because of definition of this + * quorum function count and countSeq parametter have no meanning + * in this instance of quorum function + * + * @param x a pointer to vn_type structure to check + * @param pos position in n of the beginning of the linked list to test + * @param count minimal number of occurence of factor + * @param countSeq minimal number of sequences concerned + * + * @return 1 if quorum is ok 0 otherwise. + */ + +int32_t KMRK_InvQuorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq); + +int32_t KMRK_Int12Quorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq); + +int32_t KMRK_IntInv12Quorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq); + +int32_t KMRK_IntDirInv12Quorum(vn_type* x, + int32_t pos, + int32_t count, + int32_t countSeq); +/** + * realize one cycle of KMR. + * + * @param x a pointer to vn_type created by an intialisation + * function or returned by this function. + * @param k step used to join two words + * @param count parametter count passed to the quorun function + * @param countSeq parametter countSeq passed to the quorun function + * @param KMRK_QUORUM_FUNC_PTR quorum pointer to a quorum function + */ + +void KMRK_RunKMRK(vn_type *x, + int32_t k, + int32_t count, + int32_t countSeq, + KMRK_QUORUM_FUNC_PTR(quorum)); + + + + +/** + * realises serveral run of KMR cycle to make from a sequence + * a vn_type structure describing sequences of factors of a precise size. + * + * @param sequence pointer to a C string containing the sequence + * @param size word size to construct + * @param count parametter count passed to the quorun function + * @param countSeq parametter countSeq passed to the quorun function + * @param quorum pointer to a quorum function + * @param init pointer to a initialisation function + * + * @return a vn_type pointer to a structure containing sequences of factors + */ + +vn_type *KMRK_RunTo(char *sequence, + int32_t size, + int32_t complement, + int32_t count, + int32_t countSeq, + KMRK_QUORUM_FUNC_PTR(quorum), + KMRK_INIT_FUNC_PTR(init,quorum), + masked_area_table_t *mask); + + + +/** + * free memory associated to a vn_type pointer + * + * @param x a pointer to vn_type structure + */ + +void KMRK_FreeVN(vn_type *x); + + +int32_t KMRK_upperCoupleCount(vn_type *x); +int32_t KMRK_upperInvertedCount(vn_type* x,int32_t wordsize); +int32_t KMRK_upperInterCount(vn_type* x,int32_t seq1,int32_t seq2,int32_t wordsize); + +void KMRK_markStart(vn_type* x); + +#endif //_KMRK_H_ diff --git a/src/libKMRK/KMRK_Seeds.c b/src/libKMRK/KMRK_Seeds.c new file mode 100644 index 0000000..8083d5c --- /dev/null +++ b/src/libKMRK/KMRK_Seeds.c @@ -0,0 +1,908 @@ +#include "KMRK_Seeds.h" +#include "memory.h" +#include +#include +#include "sequence.h" + + +static void SetMultipleLenInvSeeds(SmallSeed_type* seeds, + int32_t nseeds, + int32_t wordSize, + int8_t same, + AllSeeds_type *PtrAllSeeds); + + +/* + Concatenate + DirSeq\0InvSeq\0 +*/ +static char* makeDirInvSeq(char* seq, int32_t size) +{ + char *SeqInv; + + seq = (char *)MyRealloc( (void *)seq, (size*2+2)*sizeof(char), + (size+1)* sizeof(char), "Cannot allocate space for reverse sequence"); + + SeqInv= seq + size + 1; + seq[size]=0; + invseq(seq, SeqInv); + seq[size]='@'; + SeqInv[size]=0; + + return seq; +} + +/* + Merge the seq1 with seq2 +*/ +static char *merge2seq(char* seq1, char* seq2, + int32_t size1, int32_t size2) +{ + char * dest; + + seq1 = (char *)MyRealloc((void *)seq1, (size1+size2+2) *sizeof(char), + (size1+1)*sizeof(char), "Cannot allocate space for reverse sequence"); + + dest = seq1 + size1 + 1; + seq1[size1]='@'; + memcpy(dest,seq2,size2); + dest[size2]=0; + + return seq1; +} + +static int32_t dirDelta(SmallSeed_type* seed) +{ + return seed->pos2 - seed->pos1; +} + +void KMRK_SetMultipleLenDirSeeds(SmallSeed_type* seeds, + int32_t nseeds, + int32_t wordSize, + AllSeeds_type *PtrAllSeeds) +{ + + int32_t i,j; /* dummy counters j=kept seeds ; i = current seed */ + int32_t curLen=wordSize; /* Length of the current seed */ + int32_t delta; + SmallSeed_type *mainSeed; + SmallSeed_type *curSeed; + int32_t add; + +/* fprintf(stderr,"New Version\n");*/ + + KMRK_sortSeeds(seeds,nseeds,KMRK_cmpDeltaSeedsPos); + + for(j=0,mainSeed=seeds ; + j < nseeds; + j=i,mainSeed=curSeed) + { + /* foreach seed */ + delta = dirDelta(mainSeed); + curLen=wordSize; + + for (i=j+1,curSeed=mainSeed+1; + i < nseeds && + dirDelta(curSeed)==delta && + (curSeed->pos1 - mainSeed->pos1) <= curLen; + i++,curSeed++) + { + add=wordSize - mainSeed->pos1 - curLen + curSeed->pos1; + if (add < 0) add = 0; + curLen+=add; + } + + KMRK_pushSeed(PtrAllSeeds, + seeds[j].pos1,seeds[j].pos2, + curLen, + 1); + } +} + +static int32_t invDelta(SmallSeed_type* seed) +{ + return seed->pos2 + seed->pos1; +} + +static void SetMultipleLenInvSeeds(SmallSeed_type* seeds, + int32_t nseeds, + int32_t wordSize, + int8_t same, + AllSeeds_type *PtrAllSeeds) +{ + + int32_t i,j; /* dummy counters j=kept seeds ; i = current seed */ + int32_t curLen=wordSize; /* Length of the current seed */ + int32_t delta; + int32_t pos2; + SmallSeed_type *mainSeed; + SmallSeed_type *curSeed; + int32_t add; + +/* fprintf(stderr,"New Version\n");*/ + + KMRK_sortSeeds(seeds,nseeds,KMRK_cmpDeltaInvSeedsPos); + + for(j=0,mainSeed=seeds ; + j < nseeds; + j=i,mainSeed=curSeed) + { + /* foreach seed */ + delta = invDelta(mainSeed); + curLen=wordSize; + + for (i=j+1,curSeed=mainSeed+1; + i < nseeds && + invDelta(curSeed)==delta && + (curSeed->pos1 - mainSeed->pos1) <= curLen; + i++,curSeed++) + { + add=wordSize - mainSeed->pos1 - curLen + curSeed->pos1; + if (add < 0) add = 0; + curLen+=add; + } + + if ( same && (seeds[j].pos1+curLen>= seeds[i-1].pos2)) + { + curLen = (seeds[i-1].pos2 - seeds[j].pos1 + 1) * 2; + pos2 = seeds[j].pos1; + } + else + pos2 = seeds[i-1].pos2; + + KMRK_pushSeed(PtrAllSeeds, + seeds[j].pos1,pos2, + curLen, + 0); + } +} + +AllSeeds_type *KMRK_allocSeeds(AllSeeds_type *AllSeeds, + int32_t size, + int8_t opt_dir, + int8_t opt_inv) +{ + + AllSeeds_type *reponse; + + if(opt_inv != 1 && opt_dir != 1) + { + fprintf(stderr,"AllocSeeds: requiere at least " + "one of opt_dir and opt_inv to be 1\n"); + exit(4); + } + + reponse = AllSeeds; + + if (!reponse) + reponse = MyCalloc( 1, sizeof(AllSeeds_type),"KMRK_allocSeeds: cannot allocate new data structure"); + + if(opt_dir) + { + if (reponse->dirSeeds==NULL) + reponse->dirSeeds = (Seed_type *)MyCalloc( size,sizeof(Seed_type),"KMRK_allocSeeds: cannot allocate new data structure"); + else + { + if(size) + reponse->dirSeeds = (Seed_type *)MyRealloc( (void *)reponse->dirSeeds, + size*sizeof(Seed_type), reponse->cDirSeeds*sizeof(Seed_type), + "allocSeeds: cannot reallocate data structure" ); + else + { + MyFree(reponse->dirSeeds, reponse->cDirSeeds*sizeof(Seed_type)); + reponse->dirSeeds=NULL; + reponse->cDirSeeds=0; + } + } + + reponse->cDirSeeds=size; + } + + + if(opt_inv) + { + if (reponse->invSeeds==NULL) + reponse->invSeeds = (Seed_type *)MyCalloc( size, sizeof(Seed_type),"KMRK_allocSeeds: cannot allocate new data structure"); + else + { + if(size) + reponse->invSeeds = (Seed_type *)MyRealloc( (void *)reponse->invSeeds, + size*sizeof(Seed_type), reponse->cInvSeeds*sizeof(Seed_type), + "allocSeeds: cannot reallocate data structure" ); + else + { + MyFree(reponse->invSeeds, reponse->cInvSeeds*sizeof(Seed_type)); + reponse->invSeeds=NULL; + reponse->cInvSeeds=0; + } + } + + reponse->cInvSeeds=size; + } + + + return reponse; +} + +void KMRK_freeSeeds(AllSeeds_type *AllSeeds) +{ + if (!AllSeeds) + return; + + if (AllSeeds->dirSeeds) + MyFree(AllSeeds->dirSeeds, AllSeeds->cDirSeeds*sizeof(Seed_type) ); + AllSeeds->dirSeeds=NULL; + + if (AllSeeds->invSeeds) + MyFree(AllSeeds->invSeeds, AllSeeds->cInvSeeds*sizeof(Seed_type) ); + AllSeeds->dirSeeds=NULL; + + MyFree(AllSeeds, 1*sizeof( AllSeeds_type ) ); +} + +void KMRK_compactSeeds(AllSeeds_type *AllSeeds) +{ + if (AllSeeds) + { + if (AllSeeds->dirSeeds) + KMRK_allocSeeds(AllSeeds, + AllSeeds->nDirSeeds, + 1, + 0); + + if (AllSeeds->invSeeds) + KMRK_allocSeeds(AllSeeds, + AllSeeds->nInvSeeds, + 0, + 1); + + } +} + + +void KMRK_pushSeed(AllSeeds_type *AllSeeds, + int32_t pos1, + int32_t pos2, + int32_t length, + int8_t dir) +{ + Seed_type* stack; + int32_t maxcount; + int32_t index; + + if (dir) + { + dir = 1; + stack = AllSeeds->dirSeeds; + maxcount = AllSeeds->cDirSeeds; + index = AllSeeds->nDirSeeds; + } + else + { + dir = 0; + stack = AllSeeds->invSeeds; + maxcount = AllSeeds->cInvSeeds; + index = AllSeeds->nInvSeeds; + } + + if (index == maxcount) + { + (void) KMRK_allocSeeds(AllSeeds, + maxcount * 2, + dir, + !dir); + + if (dir) + stack = AllSeeds->dirSeeds; + else + stack = AllSeeds->invSeeds; + } + + stack+=index; + + stack->pos1 = pos1; + stack->pos2 = pos2; + stack->length = length; + + if (dir) + AllSeeds->nDirSeeds++; + else + AllSeeds->nInvSeeds++; + +} + + + + +AllSeeds_type* KMRK_enumerateDirectCouple(AllSeeds_type* Seeds, + int32_t expected, + int32_t wordSize, + vn_type* stack, + int32_t seq) +{ + + int32_t xmin; + int32_t xmax; + int32_t i; + int32_t j; + int32_t k; + int32_t next; + int32_t* n; + int32_t nseeds; + + SmallSeed_type *list; + + list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) , + "KMRK_enumerateDirectCouple: cannot allocate DirectCouple memory"); + + nseeds = 0; + + n = GETVECTOR(stack,n); + + if (seq) + xmin = stack->limits[seq-1]; + else + xmin = 0; + + xmax = stack->limits[seq]; + + for (i=1; i <= xmax; i++) + if (IS_MARKED(n,i)) /* Check begining of chained list */ + { + /* Look for begining of sequence of interest */ + for( ;(i <= xmin) && (i != GET(n,i)); + i=GET(n,i)); + + /* for each factor in sequence of interest */ + for (j=i; + (j != GET(n,j)) && (j <= xmax); + j = GET(n,j)) + { + next = GET(n,j); + if (next <= xmax) + do + { + k = next; + next = GET(n,k); + list[nseeds].pos1 = j-1; + list[nseeds].pos2 = k-1; + nseeds++; + } while ((k!=next) && (next <= xmax)); + } + } ; + + Seeds = KMRK_allocSeeds(Seeds, + expected/20+1, + 1,0); + +/* fprintf(stderr,"Expected direct couple : %d\n",expected);*/ + + KMRK_SetMultipleLenDirSeeds(list,nseeds,wordSize,Seeds); + MyFree(list, expected*sizeof(SmallSeed_type) ); + KMRK_compactSeeds(Seeds); + + return Seeds; +} + + +/* + From KMR-K Stacks to SmallSeeds +*/ +AllSeeds_type* KMRK_enumerateInvertedCouple(AllSeeds_type* Seeds, + int32_t expected, + int32_t wordSize, + vn_type* stack) +{ + int32_t xmax; + int32_t invmax; + int32_t posinv; + int32_t i; + int32_t j; + int32_t k; + int32_t memk; + int32_t* n; + int32_t next; + + int32_t nseeds; /* number of seeds */ + + SmallSeed_type *list; /* seed list with only pos1 and pos2 --simple output from kmrk */ + + + list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) , + "KMRK_enumerateInvertedCouple: cannot allocate InvertedCouple memory"); + + nseeds = 0; + + n = GETVECTOR(stack,n); + + xmax = stack->limits[0]; + invmax = stack->limits[1]; + + for (i=1; i <= xmax; i++) + if (IS_MARKED(n,i)) + { + for(memk=i ; + (memk < xmax) && + memk != GET(n,memk) && + (memk <= invmax); + memk=GET(n,memk)); + + if ((memk > xmax) && (memk <= invmax)) + for (j=i; + (j <= xmax) && (j != GET(n,j)); + j=GET(n,j)) + { + next = memk; + do + { + k = next; + next = GET(n,k); + posinv = 2 * xmax - k -wordSize + 3; + if (j <= posinv) + { + list[nseeds].pos1=j-1; + list[nseeds].pos2=posinv-1; + nseeds++; + } + } while ((next <= invmax) && (k != next)); + } + } + + Seeds = KMRK_allocSeeds(Seeds, + expected/20+1, + 0,1); + +/* fprintf(stderr,"Expected inverted couple : %d\n",expected);*/ + + SetMultipleLenInvSeeds(list,nseeds,wordSize,1,Seeds); /* turn the Small seeds into merged seeds */ + MyFree(list, expected*sizeof(SmallSeed_type) ); + KMRK_compactSeeds(Seeds); + + + return Seeds; +} + +AllSeeds_type* KMRK_enumerateInterCouple(AllSeeds_type* Seeds, + int32_t seq1, + int32_t seq2, + int32_t expected, + int32_t wordSize, + vn_type* stack) +{ + int32_t xmin; + int32_t xmax; + int32_t ymax; + int32_t ymin; + int32_t pos1; + int32_t pos2; + int32_t i; + int32_t j; + int32_t k; + int32_t memj; + int32_t memk; + int32_t* n; + int32_t next; + + int32_t nseeds; + + + SmallSeed_type *list; + nseeds=0; + + list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) , + "KMRK_enumerateInterCouple: cannot allocate InterCouple memory"); + + n = GETVECTOR(stack,n); + + if (seq1==0) + xmin=0; + else + xmin = stack->limits[seq1-1]; + + xmax = stack->limits[seq1]; + ymin = stack->limits[seq2-1]; + ymax = stack->limits[seq2]; + + for (i=1; i <= xmax; i++) + if (IS_MARKED(n,i)) + { + for(memj=i ; + (memj < xmin) && + memj != GET(n,memj); + memj=GET(n,memj)); + + if ((memj > xmin) && (memj <= xmax)) + { + for(memk=memj ; + (memk < ymin) && + memk != GET(n,memk); + memk=GET(n,memk)); + + if ((memk > ymin) && (memk <= ymax)) + for (j=memj; + (j <= xmax) && (j != GET(n,j)); + j=GET(n,j)) + { + next = memk; + do + { + k = next; + next = GET(n,k); + if (seq1 > 0) + pos1 = j - xmin - 1; + else + pos1 = j; + pos2 = k - ymin - 1; + list[nseeds].pos1 = pos1 - 1; + list[nseeds].pos2 = pos2 - 1; + nseeds++; + } while ((next <= ymax) && (k != next)); + } + } + } + + + Seeds = KMRK_allocSeeds(Seeds, + expected/20+1, + 1,0); + + /* fprintf(stderr,"Expected inter-direct couple : %d\n",expected);*/ + KMRK_SetMultipleLenDirSeeds(list,nseeds,wordSize,Seeds); + + MyFree(list, expected*sizeof(SmallSeed_type) ); + + KMRK_compactSeeds(Seeds); + + return Seeds; +} + +AllSeeds_type* KMRK_enumerateInterInvertedCouple(AllSeeds_type* Seeds, + int32_t seq2, + int32_t expected, + int32_t wordSize, + vn_type* stack) + +{ + int32_t xmin; + int32_t xmax; + int32_t ymax; + int32_t ymin; + int32_t posinv; + int32_t pos2; + int32_t i; + int32_t j; + int32_t k; + int32_t memj; + int32_t memk; + int32_t* n; + int32_t next; + + int32_t nseeds; + + SmallSeed_type *list; + + list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) , + "KMRK_enumerateInterCouple: cannot allocate InterCouple memory"); + + + nseeds = 0; + n = GETVECTOR(stack,n); + + if (seq2 < 2) + { + fprintf(stderr,"enumerateInterInvertedCouple: seq2 must be differente to 0"); + exit(4); + } + + + xmin = stack->limits[0]; + xmax = stack->limits[1]; + + ymin = stack->limits[seq2-1]; + ymax = stack->limits[seq2]; + + Seeds = KMRK_allocSeeds(Seeds, + expected, + 0,1); + + for (i=1; i <= xmax; i++) + if (IS_MARKED(n,i)) + { + for(memj=i ; + (memj < xmin) && + memj != GET(n,memj); + memj=GET(n,memj)); + + if ((memj > xmin) && (memj <= xmax)) + { + for(memk=memj ; + (memk < ymin) && + memk != GET(n,memk); + memk=GET(n,memk)); + + if ((memk > ymin) && (memk <= ymax)) + for (j=memj; + (j <= xmax) && (j != GET(n,j)); + j=GET(n,j)) + { + next = memk; + do + { + k = next; + next = GET(n,k); + posinv = 2 * xmin - j -wordSize + 3; + pos2 = k - ymin - 1; + list[nseeds].pos1=posinv-1; + list[nseeds].pos2=pos2-1; + nseeds++; + } while ((next <= ymax) && (k != next)); + } + } + } + + Seeds = KMRK_allocSeeds(Seeds, + expected/20+1, + 0,1); + +/* fprintf(stderr,"Expected inter-inverted couple : %d\n",expected);*/ + SetMultipleLenInvSeeds(list,nseeds,wordSize,0,Seeds); + MyFree(list, expected* sizeof(SmallSeed_type) ); + KMRK_compactSeeds(Seeds); + + + return Seeds; +} + + +int32_t KMRK_cmpSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2) +{ + if (s1->pos1==s2->pos1) + return s1->pos2 - s2->pos2; + else + return s1->pos1 - s2->pos1; +} + +int32_t KMRK_cmpDeltaSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2) +{ + int32_t delta1 = s1->pos2-s1->pos1; + int32_t delta2 = s2->pos2-s2->pos1; + + if (delta1==delta2) + return s1->pos1 - s2->pos1; + else + return delta1 - delta2; +} + +int32_t KMRK_cmpDeltaInvSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2) +{ + int32_t delta1 = s1->pos2+s1->pos1; + int32_t delta2 = s2->pos2+s2->pos1; + + if (delta1==delta2) + + return s1->pos1 - s2->pos1; + else + return delta1 - delta2; +} + + +void KMRK_sortSeeds(SmallSeed_type* seeds, + int32_t nseeds, + KMRK_SORT_SEEDS_FUNC_PTR(compare)) +{ + + qsort(seeds, + nseeds, + sizeof(SmallSeed_type), + (int (*)(const void *, const void *))compare); +} + +AllSeeds_type* KMRK_get_seeds(char **seq, + int32_t SimpleSeqLen, + int16_t Lmin, + int8_t opt_dir, + int8_t opt_inv, + int8_t opt_verbose, + masked_area_table_t *mask) +{ + AllSeeds_type* AllSeeds; + char *SeqDir = *seq; + vn_type * stacks; + int32_t dirExpect=0; + int32_t invExpect=0; + KMRK_QUORUM_FUNC_PTR(quorum); + + if(opt_inv != 1 && + opt_dir != 1) + { + fprintf(stderr, + "get_seeds: requiered at least " + "opt_dir or opt_inv to be 1\n"); + exit(4); + } + + if(opt_inv) + SeqDir = makeDirInvSeq(SeqDir,SimpleSeqLen); /* create a sequence with "DirSeq\0InvSeq\0" */ + + if (opt_inv){ /* Are we interested in dir, inv or both ? */ + if (opt_dir) + quorum = KMRK_DirInvQuorum; + else + quorum = KMRK_InvQuorum; + } + else + quorum = KMRK_CoupleQuorum; + + stacks = KMRK_RunTo(SeqDir, + Lmin, + opt_inv, + 2, + 1, + quorum, + KMRK_HashOneSequence, + mask); + + invExpect =0; + + KMRK_markStart(stacks); + + if (opt_inv) + { + SeqDir = (char *)MyRealloc( (void *)SeqDir, (SimpleSeqLen+1)*sizeof(char), + (2*SimpleSeqLen+2)*sizeof(char) , "KRMK_get_seeds: Cannot shrink memory"); /* reset mem to a sigle sequence */ + SeqDir[SimpleSeqLen]=0; + } + + if(opt_inv) + invExpect = KMRK_upperInvertedCount(stacks,Lmin); + + if(opt_dir) + dirExpect = KMRK_upperCoupleCount(stacks); + + + AllSeeds = NULL; + + + MyFree(stacks->v, stacks->size * sizeof(int32_t)); + stacks->v=NULL; + + + if (opt_dir) + AllSeeds = KMRK_enumerateDirectCouple(AllSeeds,dirExpect,Lmin ,stacks,0); + + if (opt_inv) + AllSeeds = KMRK_enumerateInvertedCouple(AllSeeds,invExpect,Lmin,stacks); + + KMRK_FreeVN(stacks); + + *seq = SeqDir; + + return AllSeeds; +} + +AllSeeds_type* KMRK_get_seeds_2seqs(char **seq1, + char **seq2, + int32_t size1, + int32_t size2, + int16_t Lmin, + int8_t opt_dir, + int8_t opt_inv, + int8_t opt_verbose, + masked_area_table_t *mask) +{ + + AllSeeds_type* AllSeeds; + char *sequence1 = *seq1; + char *sequence2 = *seq2; + vn_type * stacks; + int32_t dirExpect=0; + int32_t invExpect=0; + KMRK_QUORUM_FUNC_PTR(quorum); + int32_t sizef; + + if(opt_inv != 1 && + opt_dir != 1) + { + fprintf(stderr, + "get_seeds_2seqs: requiered at least " + "opt_dir or opt_inv to be 1\n"); + exit(4); + } + + sizef = size1; + + if(opt_inv) + { + sequence1 = makeDirInvSeq(sequence1,size1); + sizef+=(1+size1); + } + + sequence1 = merge2seq(sequence1,sequence2,sizef,size2); + + if (opt_inv) + if (opt_dir) + quorum = KMRK_IntDirInv12Quorum; + else + quorum = KMRK_IntInv12Quorum; + else + quorum = KMRK_Int12Quorum; + + stacks = KMRK_RunTo(sequence1, + Lmin, + opt_inv, + 2, + 2, + quorum, + KMRK_HashOneSequence, + mask); + + KMRK_markStart(stacks); + + sequence1= (char *)MyRealloc( + (void *)sequence1, + (size1+1)*sizeof(char), + (sizef+size2+2)*sizeof(char), + "KMRK_get_seeds_2seqs: shrink memory from 3N to 1N... ???"); + + sequence1[size1]=0; + + if (opt_dir){ + if (opt_inv) + dirExpect = KMRK_upperInterCount(stacks,0,2,Lmin); + else + dirExpect = KMRK_upperInterCount(stacks,0,1,Lmin); + } + if (opt_inv) + invExpect = KMRK_upperInterCount(stacks,1,2,Lmin); + + AllSeeds = NULL; + MyFree(stacks->v, stacks->size*sizeof(int32_t)); + stacks->v=NULL; + + if (opt_dir){ + if (opt_inv) + AllSeeds = KMRK_enumerateInterCouple(AllSeeds, + 0,2, + dirExpect, + Lmin , + stacks); + else + AllSeeds = KMRK_enumerateInterCouple(AllSeeds, + 0,1, + dirExpect, + Lmin , + stacks); + + } + if (opt_inv) + AllSeeds = KMRK_enumerateInterInvertedCouple(AllSeeds, + 2, + invExpect, + Lmin , + stacks); + + + KMRK_FreeVN(stacks); + + *seq1 = sequence1; + + return AllSeeds; +} + +#define SIGN(x) (((x)<0) ? -1:(((x)>0) ? 1:0)) + +static int32_t compareSeedsByPos(Seed_type* s1,Seed_type* s2) +{ + if (s1->pos1 == s2->pos1) + return SIGN(s1->pos2 - s2->pos2); + else + return SIGN(s1->pos1 - s2->pos1); +} + +void KMRK_sortSeedsByPos(Seed_type* seeds, int32_t count) +{ + qsort(seeds, + count, + sizeof(Seed_type), + (int (*)(const void *, const void *))compareSeedsByPos); +}; diff --git a/src/libKMRK/KMRK_Seeds.h b/src/libKMRK/KMRK_Seeds.h new file mode 100644 index 0000000..a97fcab --- /dev/null +++ b/src/libKMRK/KMRK_Seeds.h @@ -0,0 +1,126 @@ +#ifndef KMRK_Seeds_h +#define KMRK_Seeds_h + +/******************************************** + ******************************************** + ** + ** Declaration of struct + ** + ******************************************** + ********************************************/ + +#include +#include +#include "KMRK.h" + + + + +#define KMRK_SORT_SEEDS_FUNC_PTR(name) int32_t (*name)(SmallSeed_type*, \ + SmallSeed_type*) + +#define KMRK_DELTA_SEEDS_FUNC_PTR(name) int32_t (*name)(SmallSeed_type*) + +/******************************************** + ******************************************** + ** + ** Declaration of public functions + ** + ******************************************** + ********************************************/ + + +AllSeeds_type *KMRK_allocSeeds(AllSeeds_type *AllSeeds, + int32_t size, + int8_t opt_dir, + int8_t opt_inv); + +void KMRK_SetMultipleLenDirSeeds(SmallSeed_type* seeds, + int32_t nseeds, + int32_t wordSize, + AllSeeds_type *PtrAllSeeds); + + +void KMRK_freeSeeds(AllSeeds_type *AllSeeds); + +void KMRK_compactSeeds(AllSeeds_type *AllSeeds); + +void KMRK_pushSeed(AllSeeds_type *AllSeeds, + int32_t pos1, + int32_t pos2, + int32_t length, + int8_t dir); + +AllSeeds_type* KMRK_enumerateDirectCouple(AllSeeds_type* Seeds, + int32_t expected, + int32_t wordSize, + vn_type* stack, + int32_t seq); + +AllSeeds_type* KMRK_enumerateInvertedCouple(AllSeeds_type* Seeds, + int32_t expected, + int32_t wordSize, + vn_type* stack); + +AllSeeds_type* KMRK_enumerateInterCouple(AllSeeds_type* Seeds, + int32_t seq1, + int32_t seq2, + int32_t expected, + int32_t wordSize, + vn_type* stack); + +AllSeeds_type* KMRK_enumerateInterInvertedCouple(AllSeeds_type* Seeds, + int32_t seq2, + int32_t expected, + int32_t wordSize, + vn_type* stack); + + +/** + * Compare two seeds and return an integer less than, equal to or greater + * than zero considering the relative order of the two seeds. This + * version take into account only pos1 and pos2 of seeds without taking + * account of the sequences or of the relative direction + * + * @param s1 pointer to seed one + * @param s2 pointer to seed two + * + * @return a integer less than, equal to or greater than zero + */ + +int32_t KMRK_cmpSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2); +int32_t KMRK_cmpDeltaSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2); +int32_t KMRK_cmpDeltaInvSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2); + +void KMRK_sortSeeds(SmallSeed_type* seeds, + int32_t nseeds, + KMRK_SORT_SEEDS_FUNC_PTR(compare)); + + +AllSeeds_type* KMRK_get_seeds(char **seq, + int32_t SimpleSeqLen, + int16_t Lmin, + int8_t opt_dir, + int8_t opt_inv, + int8_t opt_verbose, + masked_area_table_t *mask); + +AllSeeds_type* KMRK_get_seeds_2seqs(char **seq1, + char **seq2, + int32_t size1, + int32_t size2, + int16_t Lmin, + int8_t opt_dir, + int8_t opt_inv, + int8_t opt_verbose, + masked_area_table_t *mask); + +/** + * Order an array of seeds by pos1,pos2 + * + * @param seeds pointer to an array of Seed_type object to sort + * @param count count of element in the array + */ +void KMRK_sortSeedsByPos(Seed_type* seeds, int32_t count); + +#endif /* KMRK_Seeds_h */ diff --git a/src/libKMRK/KMRK_mask.c b/src/libKMRK/KMRK_mask.c new file mode 100644 index 0000000..2f92540 --- /dev/null +++ b/src/libKMRK/KMRK_mask.c @@ -0,0 +1,259 @@ +/* + * KMRK_mask.c + * repseek + * + * Created by Eric Coissac on 04/12/04. + * Copyright 2004 __MyCompanyName__. All rights reserved. + * + */ + +#include "KMRK_mask.h" +#include +#include +#include "memory.h" + +#define MASKED_AREA_TABLE_SIZE(seqcount) (sizeof(masked_area_table_t) + (sizeof(masked_area_list_t*) * ((seqcount)-1))) +#define MASKED_AREA_LIST_SIZE(areacount) (sizeof(masked_area_list_t) + (sizeof(masked_area_t) * ((areacount)-1))) + +#define AREA_COUNT_INIT (1000) + +static masked_area_table_t *new_masked_area_table(int32_t seqcount, int32_t areacount); +static masked_area_list_t *new_masked_area_list(int32_t areacount); +static masked_area_list_t *realloc_masked_area_list(masked_area_list_t *list,int32_t areacount); +static int32_t push_area(masked_area_table_t* table,int32_t sequence,int32_t begin,int32_t end); +static void sort_area_table(masked_area_table_t* table); +static int32_t compare_area(const masked_area_t* v1,const masked_area_t* v2); +static int32_t search_area(const masked_area_t* v1,const masked_area_t* v2); + +static masked_area_list_t *strip_area_list(masked_area_list_t* list); +static void strip_area_table(masked_area_table_t* table); + + +static masked_area_list_t *new_masked_area_list(int32_t areacount) +{ + masked_area_list_t *list; + + list = MyCalloc(1,MASKED_AREA_LIST_SIZE(areacount),"Not enougth memory for mask table"); + list->reserved=areacount; + + return list; +} + +static masked_area_list_t *realloc_masked_area_list(masked_area_list_t *list,int32_t areacount) +{ + list = MyRealloc(list, + MASKED_AREA_LIST_SIZE(areacount), + MASKED_AREA_LIST_SIZE(list->reserved), + "Not enougth memory for mask table"); + + list->reserved=areacount; + + return list; +} + +static masked_area_table_t *new_masked_area_table(int32_t seqcount, int32_t areacount) +{ + masked_area_table_t *table; + int32_t i; + + table = MyCalloc(1,MASKED_AREA_TABLE_SIZE(seqcount),"Not enougth memory for mask table"); + table->seqcount=seqcount; + + for (i=0;isequence[i]=new_masked_area_list(areacount); + + return table; +} + +static int32_t push_area(masked_area_table_t* table,int32_t sequence,int32_t begin,int32_t end) +{ + masked_area_list_t * list; + + if (sequence >= table->seqcount) + return -1; + + list = table->sequence[sequence]; + + if (list->reserved == list->count) + { + list = realloc_masked_area_list(list,list->reserved*2); + table->sequence[sequence]=list; + } + + list->area[list->count].begin=begin; + list->area[list->count].end=end; + + list->count++; + table->total++; + + return table->total; +} + +static int32_t compare_area(const masked_area_t* v1,const masked_area_t* v2) +{ + return v1->begin - v2->begin; +} + +static void sort_area_table(masked_area_table_t* table) +{ + int32_t i; + + for (i=0; iseqcount;i++) + { + qsort(table->sequence[i]->area, + table->sequence[i]->count, + sizeof(masked_area_t), + (int (*)(const void *, const void *))compare_area); + } +} + +static masked_area_list_t *strip_area_list(masked_area_list_t* list) +{ + int32_t i; + int32_t j; + int32_t count; + int32_t newcount; + + count = list->count; + newcount=count; + + for (i=1;i%d %d->%d ==>",list->area[i-1].begin,list->area[i-1].end,list->area[i].begin,list->area[i].end); */ + if ((list->area[i].begin-1) <= list->area[i-1].end) + { + /* fprintf(stderr," joined"); */ + list->area[i].begin=list->area[i-1].begin; + list->area[i-1].begin=-1; + newcount--; + } + } + list->count=newcount; + + for (i=0,j=0;iarea[i].begin>=0) + { + if (i!=j) + list->area[j]=list->area[i]; + j++; + } + } + + return realloc_masked_area_list(list,newcount); + +} + +static void strip_area_table(masked_area_table_t* table) +{ + int32_t seq; + int32_t oldcount; + masked_area_list_t* list; + + sort_area_table(table); + + for (seq=0; seq < table->seqcount;seq++) + { + list = table->sequence[seq]; + oldcount = list->count; + table->sequence[seq]=strip_area_list(list); + table->total-=oldcount - table->sequence[seq]->count; + } +} + + +static int32_t search_area(const masked_area_t* v1,const masked_area_t* v2) +{ + int32_t pos; + + pos = v1->begin; + + if (pos < v2->begin) + return -1; + + if (pos > v2->end) + return 1; + + return 0; +} + + +masked_area_table_t *KMRK_ReadMaskArea(char *areafile,int32_t seqcount,int32_t complement) +{ + FILE* area; + char buffer[1000]; + char *ok; + int32_t begin; + int32_t end; + int32_t sequence; + int32_t column; + int32_t linecount; + masked_area_table_t *table; + + if (complement > 0) + seqcount++; + else + complement=0; + + area = fopen(areafile,"r"); + linecount=0; + table=new_masked_area_table(seqcount,AREA_COUNT_INIT); + + do { + linecount++; + ok = fgets(buffer,999,area); + if (ok) + { + column = sscanf(buffer,"%d %d %d",&begin,&end,&sequence); + if (column > 1 && begin <= end) + { + begin--; + end--; + if (column==3) + sequence--; + else + sequence=0; + + if (sequence && complement) + sequence++; + + push_area(table,sequence,begin,end); + + if (!sequence && complement) + push_area(table,1,complement -1 - end,complement -1 -begin); + + } + + if (column==1) + fprintf(stderr,"WARNING in mask file reading line %d\n",linecount); + + } + } while (ok); + + fprintf(stderr,"\nread %d masked areas from file\n",table->total); + strip_area_table(table); + fprintf(stderr,"strip to %d non overlaping areas\n",table->total); + + return table; +} + +char KMRK_isMasked(masked_area_table_t *mask,int32_t seq, int32_t position) +{ + masked_area_t input; + int32_t result; + masked_area_list_t *list; + + if (! mask || (seq >= mask->seqcount)) + return 0; + + list = mask->sequence[seq]; + input.begin=position; + + result = bsearch(&input, + list->area, + list->count, + sizeof(masked_area_t), + (int (*)(const void *, const void *))search_area) != NULL; + + return result; +} \ No newline at end of file diff --git a/src/libKMRK/KMRK_mask.h b/src/libKMRK/KMRK_mask.h new file mode 100644 index 0000000..3f4c8f3 --- /dev/null +++ b/src/libKMRK/KMRK_mask.h @@ -0,0 +1,37 @@ +/* + * KMRK_mask.h + * repseek + * + * Created by Eric Coissac on 04/12/04. + * Copyright 2004 __MyCompanyName__. All rights reserved. + * + */ + +#include + +#ifndef _KMRK_MASK_H_ +#define _KMRK_MASK_H_ + +typedef struct { + int32_t begin; + int32_t end; +} masked_area_t; + +typedef struct { + int32_t reserved; + int32_t count; + + masked_area_t area[1]; +} masked_area_list_t; + +typedef struct { + int32_t seqcount; + int32_t total; + + masked_area_list_t *sequence[1]; +} masked_area_table_t; + +masked_area_table_t *KMRK_ReadMaskArea(char *areafile,int32_t seqcount,int32_t complement); +char KMRK_isMasked(masked_area_table_t *mask,int32_t seq, int32_t position); + +#endif \ No newline at end of file diff --git a/src/libKMRK/KMRK_merge_seeds.c b/src/libKMRK/KMRK_merge_seeds.c new file mode 100644 index 0000000..768ccb9 --- /dev/null +++ b/src/libKMRK/KMRK_merge_seeds.c @@ -0,0 +1,123 @@ +/** + * @file KMRK_merge_seeds.c + * @author Eric Coissac + * @date Wed Mar 3 11:15:57 2004 + * + * @brief Merge function of overlapping seeds + * + * + */ + +#include "KMRK_merge_seeds.h" + +void KMRK_MergeSeeds(AllSeeds_type *AllSeeds, + int8_t opt_dir, + int8_t opt_inv) +{ + + int32_t i; /* the current seed */ + int32_t j; /* the checked seed */ + + int32_t N; /* the kept ones */ + Seed_type* seeds; + + + if(opt_dir){ + seeds = AllSeeds->dirSeeds; + for(i=0, N=0 ;inDirSeeds; i++){ + + if(seeds[i].pos1==-1) /* any seed at -1 is removed */ + continue; + + + j=i+1; + + while( (seeds[j].pos1!=-1) && + (seeds[i].pos1!=-1) && + (j < AllSeeds->nDirSeeds) && + (seeds[j].pos1 < seeds[i].pos1+ seeds[i].length)) + { + + if( + ((seeds[j].pos2 >= seeds[i].pos2) && + (seeds[j].pos2 < seeds[i].pos2 + seeds[i].length)) || /* if the seeds are overlapping */ + ((seeds[j].pos2 + seeds[j].length >= seeds[i].pos2) && + (seeds[j].pos2 + seeds[j].length < seeds[i].pos2 + seeds[i].length))) + { + if(seeds[j].length <= seeds[i].length) + seeds[j].pos1=seeds[j].pos2=seeds[j].length=-1; /* removed the smallest */ + else + seeds[i].pos1=seeds[i].pos2=seeds[i].length=-1; + } + + j++; + } + + if(seeds[i].pos1 !=-1) + { /* if this seed is not out, store it */ + + seeds[N].pos1 = seeds[i].pos1; + seeds[N].pos2 = seeds[i].pos2; + seeds[N].length = seeds[i].length; + N++; + } + + } + + AllSeeds->nFilteredDirSeeds += AllSeeds->nDirSeeds-N; + AllSeeds->nDirSeeds=N; + + + } + + if(opt_inv){ + seeds = AllSeeds->invSeeds; + + for(i=0, N=0 ;inInvSeeds; i++){ + + if(seeds[i].pos1==-1) + continue; + + + j=i+1; + + while( (seeds[j].pos1!=-1 ) && + (seeds[i].pos1!=-1 ) && + (j < AllSeeds->nInvSeeds) && + (seeds[j].pos1 < seeds[i].pos1+seeds[i].length)) + { + + if( + ((seeds[j].pos2 >= seeds[i].pos2) && /* if the seeds are overlapping */ + (seeds[j].pos2 < seeds[i].pos2+seeds[i].length)) || + ((seeds[j].pos2 + seeds[j].length >= seeds[i].pos2) && + (seeds[j].pos2 + seeds[j].length < seeds[i].pos2+seeds[i].length))) + { + + if(seeds[j].length <= seeds[i].length) + seeds[j].pos1=seeds[j].pos2=seeds[j].length=-1; /* removed the smallest */ + else + seeds[i].pos1=seeds[i].pos2=seeds[i].length=-1; + } + + j++; + } + + if(seeds[i].pos1!=-1) + { /* if this seed is not out, store it */ + + seeds[N].pos1 = seeds[i].pos1; + seeds[N].pos2 = seeds[i].pos2; + seeds[N].length = seeds[i].length; + N++; + } + + } + + AllSeeds->nFilteredInvSeeds += AllSeeds->nInvSeeds-N; + AllSeeds->nInvSeeds=N; + } + + KMRK_compactSeeds(AllSeeds); + +} diff --git a/src/libKMRK/KMRK_merge_seeds.h b/src/libKMRK/KMRK_merge_seeds.h new file mode 100644 index 0000000..a9fd6f1 --- /dev/null +++ b/src/libKMRK/KMRK_merge_seeds.h @@ -0,0 +1,11 @@ +#ifndef KMRK_merge_seeds_h +#define KMRK_merge_seeds_h + +#include "KMRK_Seeds.h" + +void KMRK_MergeSeeds(AllSeeds_type *AllSeeds, + int8_t opt_dir, + int8_t opt_inv); + + +#endif /* KMRK_MergeSeeds */ diff --git a/src/libKMRK/Makefile b/src/libKMRK/Makefile new file mode 100644 index 0000000..a3ce2d5 --- /dev/null +++ b/src/libKMRK/Makefile @@ -0,0 +1,25 @@ + +SOURCES = KMRK.c \ + KMRK_mask.c \ + KMRK_merge_seeds.c \ + KMRK_Seeds.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libKMRK.a +RANLIB= ranlib + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libKMRK/memory.c b/src/libKMRK/memory.c new file mode 100644 index 0000000..c633c76 --- /dev/null +++ b/src/libKMRK/memory.c @@ -0,0 +1,224 @@ +/****** + file : memory.c + function : All about memory of the KMR, Seeds and Repeats + All MyMalloc() series is about follwoing how mauch memory has been used + + created : 19 Sep 2003 + modif : Oct 2003, Feb 2004 + modif : Dec 2004 ; Corrected Memory declaration + author : amikezor +*****/ + +#include +#include +#include "repseek_types.h" +#include "memory.h" + + +MemUsage Memory; + + +/* + Functions to count the memory usage all along + dybamic allocation and free +*/ +void PrintMem(char *Comment){ + + extern MemUsage Memory; + + fprintf(stderr,"\n%s\nMemory Usage\n\t* Max is: %d bytes, %.2f Kb, %.2f Mb\n\t* Cur is: %d bytes, %.2f Kb, %.2f Mb\n", + Comment, + Memory.Max, (float)Memory.Max/1024, (float)Memory.Max/(1024*1024), + Memory.Current, (float)Memory.Current/1024, (float)Memory.Current/(1024*1024)); +} +void PrintMaxMem( void ){ + + extern MemUsage Memory; + + if(Memory.Max < 1024) + fprintf(stderr,"Max Memory Usage.. %d bytes\n", Memory.Max); + else if(Memory.Max < 1024*1024) + fprintf(stderr,"Max Memory Usage.. %.2f Kilobytes\n", (float)Memory.Max/1024); + else if(Memory.Max < 1024*1024*1024) + fprintf(stderr,"Max Memory Usage.. %.2f Megabytes\n", (float)Memory.Max/(1024*1024)); + else + fprintf(stderr,"Max Memory Usage.. %.2f Gigabytes\n", (float)Memory.Max/(1024*1024*1024)); +} +void Update_Mem(int32_t Value){ + + extern MemUsage Memory; + + Memory.Current += Value; + Memory.Max = (Memory.Current>Memory.Max)?Memory.Current:Memory.Max; +} +void Init_Mem(int32_t Value){ + + extern MemUsage Memory; + + Memory.Current = Value; + Memory.Max = Value; +} + + +/* + Replace functions of dynamic allocation + to allow the tracking of memory usage +*/ +void *MyMalloc( int32_t size , char *Error ){ + + void *pointer; + + pointer = malloc(size); + if(!pointer)fprintf(stderr,"%s\n",Error),exit(3); + + Update_Mem(size); + + return pointer; +} +void *MyCalloc( int32_t number, int32_t TypeSize , char *Error ){ + + void *pointer; + + pointer = calloc(number, TypeSize); + if(!pointer)fprintf(stderr,"%s\n",Error),exit(3); + + Update_Mem(number*TypeSize ); + + return pointer; +} +void MyFree( void *Pointer, int32_t size){ + free(Pointer); + Pointer=NULL; + Update_Mem(-size); +} +void *MyRealloc( void *Pointer, int32_t newsize, int32_t oldsize, char *Error){ + + Pointer = realloc(Pointer,newsize); + if(!Pointer)fprintf(stderr,"%s\n",Error),exit(3); + Update_Mem( newsize-oldsize ); + return Pointer; +} + + +/* + Deal with Stacks structure for KMR + +void MallocStack(Stacks *s, int32_t number, int32_t *histo, int32_t AllValues){ + + int32_t i; + + s->nStacks = number; + s->nValues = AllValues; + + s->ppStacks = (int32_t **)MyMalloc( number * sizeof(int32_t *), + "MallocStack: ppStacks malloc error, bye\n"); + s->lenStacks = (int32_t *)MyMalloc( number * sizeof(int32_t), + "MallocStack: lenStacks malloc error, bye\n"); + s->ppStacks[0] = (int32_t *)MyMalloc( AllValues * sizeof(int32_t), + "MallocStack: ppStacks[0] malloc error, bye\n"); + s->lenStacks[0]=0; + + for(i=1;i < number; i++){ + s->lenStacks[i]=0; + s->ppStacks[i] = s->ppStacks[i-1] + histo[i] ; + } +} + +void FreeStack( Stacks *p){ + MyFree(p->ppStacks[0] , p->nValues*sizeof(int32_t) ); + MyFree(p->ppStacks , p->nStacks*sizeof(int32_t *) ); + MyFree(p->lenStacks , p->nStacks*sizeof(int32_t)); +} +*/ + + +/* + Deal with the Seeds part + + +void free_Seeds(Seeds AllSeeds) +{ + + if( AllSeeds.nDirSeeds ){ + MyFree(AllSeeds.DirPos1, AllSeeds.nDirSeeds*sizeof(int32_t)); + MyFree(AllSeeds.DirPos2, AllSeeds.nDirSeeds*sizeof(int32_t)); + MyFree(AllSeeds.DirLen, AllSeeds.nDirSeeds*sizeof(int32_t)); + MyFree(AllSeeds.DirMeanR, AllSeeds.nDirSeeds*sizeof(float)); + } + + if(AllSeeds.nInvSeeds ){ + MyFree(AllSeeds.InvPos1, AllSeeds.nInvSeeds*sizeof(int32_t)); + MyFree(AllSeeds.InvPos2, AllSeeds.nInvSeeds*sizeof(int32_t)); + MyFree(AllSeeds.InvLen, AllSeeds.nInvSeeds*sizeof(int32_t)); + MyFree(AllSeeds.InvMeanR, AllSeeds.nInvSeeds*sizeof(float)); + } + +} +*/ +/* + Malloc if it is the first time otherwise readjust + +void AllocSeeds(Seeds *AllSeeds, int32_t size, int32_t old_size, int8_t opt_dir, int8_t opt_inv){ + + if(opt_inv != 1 && opt_dir != 1) + fprintf(stderr,"AllocSeeds: requiere at least one of opt_dir and opt_inv to be 1\n"),exit(4); + + if(opt_dir == 1){ + if( AllSeeds->DirPos1 == NULL){ + AllSeeds->DirPos1 = (int32_t *)MyCalloc(size , sizeof(int32_t),"AllocSeeds: Alloc for DirPos1 failed, bye"); + AllSeeds->DirPos2 = (int32_t *)MyCalloc(size , sizeof(int32_t),"AllocSeeds: Alloc for DirPos2 failed, bye"); + } + else{ + AllSeeds->DirPos1 = (int32_t *)MyRealloc(AllSeeds->DirPos1, size * sizeof(int32_t), old_size* sizeof(int32_t), + "AllocSeeds: realloc for DirPos1 failed, bye"); + AllSeeds->DirPos2 = (int32_t *)MyRealloc(AllSeeds->DirPos2, size * sizeof(int32_t), old_size* sizeof(int32_t), + "AllocSeeds: realloc for DirPos2 failed, bye"); + } + } + + if(opt_inv == 1){ + if( AllSeeds->InvPos1 == NULL){ + AllSeeds->InvPos1 = (int32_t *)MyCalloc(size , sizeof(int32_t), "AllocSeeds: Alloc for InvPos1 failed, bye"); + AllSeeds->InvPos2 = (int32_t *)MyCalloc(size , sizeof(int32_t), "AllocSeeds: Alloc for InvPos2 failed, bye"); + } + else{ + AllSeeds->InvPos1 = (int32_t *)MyRealloc(AllSeeds->InvPos1, size * sizeof(int32_t), old_size* sizeof(int32_t), + "AllocSeeds: realloc for InvPos1 failed, bye"); + AllSeeds->InvPos2 = (int32_t *)MyRealloc(AllSeeds->InvPos2, size * sizeof(int32_t), old_size* sizeof(int32_t), + "AllocSeeds: realloc for InvPos2 failed, bye"); + } + } +} +*/ + +/* + Deal with the Repeats structure +*/ +Repeats mem_Repeats(int32_t Ndir, int32_t Ninv){ + + Repeats AllRepeats; /* All Repeats structure */ + + AllRepeats.nDirRep = Ndir; /* set the number of repeats to the number of seeds */ + AllRepeats.nInvRep = Ninv; + AllRepeats.nDirBadRep = 0; /* set the "bad" repet (included into another rep) as 0 */ + AllRepeats.nInvBadRep = 0; + + if(AllRepeats.nDirRep) + AllRepeats.DirRep = (Rep *)MyMalloc( (AllRepeats.nDirRep)*sizeof(Rep), "init_Repeats: repdir malloc error" ); + else + AllRepeats.DirRep = NULL; + + if(AllRepeats.nInvRep) + AllRepeats.InvRep = (Rep *)MyMalloc( (AllRepeats.nInvRep)*sizeof(Rep), "init_Repeats: repinv malloc error" ); + else + AllRepeats.InvRep = NULL; + + return AllRepeats; +} +void free_Repeats(Repeats AllRep) +{ + if(AllRep.nDirRep) + MyFree(AllRep.DirRep, AllRep.nDirRep*sizeof(Rep)); + if(AllRep.nInvRep) + MyFree(AllRep.InvRep, AllRep.nInvRep*sizeof(Rep)); +} diff --git a/src/libKMRK/memory.h b/src/libKMRK/memory.h new file mode 100644 index 0000000..abc5853 --- /dev/null +++ b/src/libKMRK/memory.h @@ -0,0 +1,105 @@ +/** + * @file memory.h + * @author Achaz G + * @date April 2004 + * + * @brief header for memory alloc/dealloc + * modif : Dec 2004 ; Corrected Memory declaration + * + * + */ + +#ifndef _MEMORY_H_ +#define _MEMORY_H_ + +#include "repseek_types.h" + +typedef struct { /********** Memory Usage structure **************/ + + int32_t Max; + int32_t Current; + +} MemUsage; + +#include + +/********** ********** + + Global Variable(s) + + ********** **********/ + +extern MemUsage Memory; /* Instance of the global variable for memory tracking */ + +/* + Follow memory usage +*/ +void PrintMem(char *Comment); +void PrintMaxMem( void ); +void Update_Mem(int32_t Value); +void Init_Mem(int32_t Value); +/* + All Alloc/Free to follow of memory usage +*/ +void *MyMalloc( int32_t size , char *Error ); +void *MyCalloc( int32_t number, int32_t TypeSize , char *Error ); +void MyFree( void *Pointer, int32_t size); +void *MyRealloc( void *Pointer, int32_t newsize, int32_t oldsize, char *Error); +/* + For Stacks + +void MallocStack(Stacks *s, int32_t number, int32_t *histo, int32_t AllValues); +void FreeStack( Stacks *p); + + For Seeds + +void free_Seeds(Seeds AllSeeds); +void AllocSeeds(Seeds *AllSeeds, int32_t size, int32_t old_size, int8_t opt_dir, int8_t opt_inv); +*/ + +/* + For Repeats +*/ +Repeats mem_Repeats(int32_t Ndir, int32_t Ninv); +void free_Repeats(Repeats AllRep); + + + + +/* + Not used anymore, but just in case +*/ + + +#include +#include + +#define KMRK_MALLOC(var,type,size,message) { \ + var = (type*) malloc(size); \ + if (!var) \ + { \ + fprintf(stderr,"%s\n",message); \ + exit(4); \ + } \ + } + +#define KMRK_CALLOC(var,type,length,message) { \ + var = (type*) calloc(length,sizeof(type)); \ + if (!var) \ + { \ + fprintf(stderr,"%s\n",message); \ + exit(4); \ + } \ + } + +#define KMRK_REALLOC(var,type,size,message) { \ + var = (type*) realloc(var,size); \ + if (!var) \ + { \ + fprintf(stderr,"%s\n",message); \ + exit(4); \ + } \ + } + + +#endif /* _MEMORY_H_*/ diff --git a/src/libKMRK/repseek_types.h b/src/libKMRK/repseek_types.h new file mode 100644 index 0000000..3bd8779 --- /dev/null +++ b/src/libKMRK/repseek_types.h @@ -0,0 +1,197 @@ +/** + * @file repseek_types.h + * @author Guillaume Achaz + * @date April 2004 + * @modif July 2004 turn scores into doubles + * @brief definition of general types and macros for repseek + * + * + */ + + + +#ifndef _REPSEEK_TYPES_ +#define _REPSEEK_TYPES_ + +/* + Version of the program +*/ +#define REPSEEK_VERSION "4.2" +#define REPSEEK_DATE "Nov 2004" + + + +/********** ********** + + General Macros + + ********** **********/ + +/* + Macros to compare 2 or three values +*/ +#define MAX2( A, B ) ( ((A)>(B))?(A):(B) ) +#define MAX3( A, B, C ) ( ((MAX2(A,B))>(C))?(MAX2(A,B)):(C) ) +#define MIN2( A, B ) ( ((A)<(B))?(A):(B) ) + +#define MAX( A, B ) MAX2( A, B ) +#define MIN( A, B ) MIN2( A, B ) + +/* + Absolute values +*/ +#define ABS(x) (( (x)>=0 )? (x) : -(x)) + + + +/********** ********** + + All types used in repseek + + ********** **********/ + +#include /* The type FILE * is defined here */ +#include /* all, the int??_t are defined in there */ + + +/** + * Store informations about one STRICT repeat (seeds) + * + */ +typedef struct { /* the complete seed structure */ + + int32_t pos1; /**< position of the first copy */ + int32_t pos2; /**< position of the second copy */ + + int32_t length; /**< length of the strict repeats */ + float rmean; /**< mean repeat leavel */ + +} Seed_type; + +typedef struct { /* Just after a KMRK length X, only the 2 pos matter */ + + int32_t pos1; /**< postion of the first copy */ + int32_t pos2; /**< postion of the second copy */ + +} SmallSeed_type; + + + + +/** + * Store informations about all strict repeat (seeds) + * + */ +typedef struct { + + int32_t cDirSeeds; /**< currently allocated space in dirSeeds array */ + int32_t nDirSeeds; /**< count of direct strict repeats */ + int32_t nFilteredDirSeeds; /**< ??? */ + + Seed_type* dirSeeds; /**< array of direct repeats */ + + int32_t cInvSeeds; /**< currently allocated space in invSeeds array */ + int32_t nInvSeeds; /**< count of inverted strict repeats */ + int32_t nFilteredInvSeeds; /**< ??? */ + + Seed_type* invSeeds; /**< array of inverted repeats */ + +} AllSeeds_type; + + + + + +/** + * Store informations about one GENERIC repeat + * + */ +typedef struct{ + + char type[20]; /* its name; i.e. Tandem, Palindrome, etc... */ + + int32_t pos1, pos2, /* both copies postions */ + len1, len2, /* both copies length */ + seed_pos1,seed_pos2, /* pos1 and pos2 of the originate seed */ + seed_len, /* len of the seed */ + + match, align_len; /* number of match and length of alignment */ + + double score; /* the alignment score */ + + float seed_meanR; /* the seed meanR */ + + float meanR; /* The mean R-level of the repeat */ + int32_t mainR; /* its Mode R */ + float fraction_mainR; /* the fraction of length containing the Mode R */ + + +} Rep; + + + +/** + * Store informations about All GENERIC repeats + * + */ +typedef struct { + + int32_t nDirRep; /* Total Number of Direct Repats in Mem */ + int32_t nDirBadRep; /* Direct repeats set to -1 -- filtered out and co. */ + Rep *DirRep; /* The array of Dir Rep */ + + int32_t nInvRep; /* Total Number of Inverted Repats in Mem */ + int32_t nInvBadRep; /* Inverted Repeats set to -1 -- filtered out and co. */ + Rep *InvRep; /* The array of Inverted Rep */ + +} Repeats; + + +#define MATRIX_SIZE 26 + +typedef struct { /******* The scoring Matrix ************/ + + double matrix[MATRIX_SIZE][MATRIX_SIZE]; /* the matrix of match/missmatch */ + double gap_open; /* value of gap-open */ + double gap_ext; /* value of gap_ext */ + double expect; + +} SCORING; + + + +typedef struct { /******* The Results of Alignement by Dynamik programming ************/ + + + double *scores; /* the score strings (+/- 'matrix') */ + double *pscore; /* pointer to the current score */ + char *traces; /* the path matrix - could take values 'l'eft, 'd'iagonal, or 'a'bove or 'u'nknown */ + + double *F; /* *Above -- needed for memorizing deletion in seq2 */ + + double *pBestScore; /* pointer to it */ + int32_t BestScore_row; /* its row and col */ + int32_t BestScore_col; + + char *traceback; /* all you need for bactracking */ + char *traceback_f; /* memory for forward traceback and then check other seeds */ + char *traceback_b; /* memory needed for backward traceback - to avoid erasing the forward one */ + + int32_t alignment_len; /* guess ?? */ + int32_t matches; /* number of match (score>0 in scoring matrix) */ + + int32_t nSegment; /* number of segment */ + int32_t *Segment_begin; /* begin and end of each segment */ + int32_t *Segment_end; + + int32_t max_scores; /* size of the matrices only for memory purposes */ + int32_t max_col; + int32_t max_row; + int32_t max_alignlen; + +} RESULTS; + + + +#endif /* _REPSEEK_TYPES_ */ + diff --git a/src/libKMRK/sequence.h b/src/libKMRK/sequence.h new file mode 100644 index 0000000..4c8a105 --- /dev/null +++ b/src/libKMRK/sequence.h @@ -0,0 +1,25 @@ +/** + * @file KMRK_sequence.h + * @author Eric Coissac + * @date Tue Feb 24 22:22:57 2004 + * + * @brief Header file for sequence utilities + * + * + */ + +#ifndef KMRK_sequence_h +#define KMRK_sequence_h + +#include + +int8_t CheckSeq(char *seq, char *alpha); + +void nonACGTXtoN(char *seq); + +void UpperSequence(char *seq); + +void invseq(char *seqsrc, char *seqdest); + + +#endif /* KMRK_sequence_h */ diff --git a/src/libapat/CODES/dft_code.h b/src/libapat/CODES/dft_code.h new file mode 100644 index 0000000..b9caf28 --- /dev/null +++ b/src/libapat/CODES/dft_code.h @@ -0,0 +1,14 @@ +/* ----------------------------------------------- */ +/* dft_pat_seq_code.h */ +/* default alphabet encoding for alpha */ +/* ----------------------------------------------- */ + + 0x00000001 /* A */, 0x00000002 /* B */, 0x00000004 /* C */, + 0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */, + 0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */, + 0x00000200 /* J */, 0x00000400 /* K */, 0x00000800 /* L */, + 0x00001000 /* M */, 0x00002000 /* N */, 0x00004000 /* O */, + 0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */, + 0x00040000 /* S */, 0x00080000 /* T */, 0x00100000 /* U */, + 0x00200000 /* V */, 0x00400000 /* W */, 0x00800000 /* X */, + 0x01000000 /* Y */, 0x02000000 /* Z */ diff --git a/src/libapat/CODES/dna_code.h b/src/libapat/CODES/dna_code.h new file mode 100644 index 0000000..0febf41 --- /dev/null +++ b/src/libapat/CODES/dna_code.h @@ -0,0 +1,71 @@ +/* ----------------------------------------------- */ +/* dna_code.h */ +/* alphabet encoding for dna/rna */ +/* ----------------------------------------- */ +/* IUPAC encoding */ +/* ----------------------------------------- */ +/* G/A/T/C */ +/* U=T */ +/* R=AG */ +/* Y=CT */ +/* M=AC */ +/* K=GT */ +/* S=CG */ +/* W=AT */ +/* H=ACT */ +/* B=CGT */ +/* V=ACG */ +/* D=AGT */ +/* N=ACGT */ +/* X=ACGT */ +/* EFIJLOPQZ not recognized */ +/* ----------------------------------------- */ +/* dual encoding */ +/* ----------------------------------------- */ +/* A=ADHMNRVW */ +/* B=BCDGHKMNRSTUVWY */ +/* C=BCHMNSVY */ +/* D=ABDGHKMNRSTUVWY */ +/* G=BDGKNRSV */ +/* H=ABCDHKMNRSTUVWY */ +/* K=BDGHKNRSTUVWY */ +/* M=ABCDHMNRSVWY */ +/* N=ABCDGHKMNRSTUVWY */ +/* R=ABDGHKMNRSVW */ +/* S=BCDGHKMNRSVY */ +/* T=BDHKNTUWY */ +/* U=BDHKNTUWY */ +/* V=ABCDGHKMNRSVWY */ +/* W=ABDHKMNRTUVWY */ +/* X=ABCDGHKMNRSTUVWY */ +/* Y=BCDHKMNSTUVWY */ +/* EFIJLOPQZ not recognized */ +/* ----------------------------------------------- */ + +#ifndef USE_DUAL + + /* IUPAC */ + + 0x00000001 /* A */, 0x00080044 /* B */, 0x00000004 /* C */, + 0x00080041 /* D */, 0x00000000 /* E */, 0x00000000 /* F */, + 0x00000040 /* G */, 0x00080005 /* H */, 0x00000000 /* I */, + 0x00000000 /* J */, 0x00080040 /* K */, 0x00000000 /* L */, + 0x00000005 /* M */, 0x00080045 /* N */, 0x00000000 /* O */, + 0x00000000 /* P */, 0x00000000 /* Q */, 0x00000041 /* R */, + 0x00000044 /* S */, 0x00080000 /* T */, 0x00080000 /* U */, + 0x00000045 /* V */, 0x00080001 /* W */, 0x00080045 /* X */, + 0x00080004 /* Y */, 0x00000000 /* Z */ + +#else + /* DUAL */ + + 0x00623089 /* A */, 0x017e34ce /* B */, 0x01243086 /* C */, + 0x017e34cb /* D */, 0x00000000 /* E */, 0x00000000 /* F */, + 0x0026244a /* G */, 0x017e348f /* H */, 0x00000000 /* I */, + 0x00000000 /* J */, 0x017e24ca /* K */, 0x00000000 /* L */, + 0x0166308f /* M */, 0x017e34cf /* N */, 0x00000000 /* O */, + 0x00000000 /* P */, 0x00000000 /* Q */, 0x006634cb /* R */, + 0x012634ce /* S */, 0x0158248a /* T */, 0x0158248a /* U */, + 0x016634cf /* V */, 0x017a348b /* W */, 0x017e34cf /* X */, + 0x017c348e /* Y */, 0x00000000 /* Z */ +#endif diff --git a/src/libapat/CODES/prot_code.h b/src/libapat/CODES/prot_code.h new file mode 100644 index 0000000..edcdfc1 --- /dev/null +++ b/src/libapat/CODES/prot_code.h @@ -0,0 +1,51 @@ +/* ----------------------------------------------- */ +/* prot_code.h */ +/* alphabet encoding for proteins */ +/* ----------------------------------------- */ +/* IUPAC encoding */ +/* ----------------------------------------- */ +/* B=DN */ +/* Z=EQ */ +/* X=any - {X} */ +/* JOU not recognized */ +/* ----------------------------------------- */ +/* dual encoding */ +/* ----------------------------------------- */ +/* B=BDN */ +/* D=BD */ +/* E=EZ */ +/* N=BN */ +/* Q=QZ */ +/* X=any - {X} */ +/* Z=EQZ */ +/* JOU not recognized */ +/* ----------------------------------------------- */ + +#ifndef USE_DUAL + + /* IUPAC */ + + 0x00000001 /* A */, 0x00002008 /* B */, 0x00000004 /* C */, + 0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */, + 0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */, + 0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */, + 0x00001000 /* M */, 0x00002000 /* N */, 0x00000000 /* O */, + 0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */, + 0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */, + 0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */, + 0x01000000 /* Y */, 0x00010010 /* Z */ + +#else + /* DUAL */ + + 0x00000001 /* A */, 0x0000200a /* B */, 0x00000004 /* C */, + 0x0000000a /* D */, 0x02000010 /* E */, 0x00000020 /* F */, + 0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */, + 0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */, + 0x00001000 /* M */, 0x00002002 /* N */, 0x00000000 /* O */, + 0x00008000 /* P */, 0x02010000 /* Q */, 0x00020000 /* R */, + 0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */, + 0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */, + 0x01000000 /* Y */, 0x02010010 /* Z */ + +#endif diff --git a/src/libapat/Gmach.h b/src/libapat/Gmach.h new file mode 100644 index 0000000..8fb1c69 --- /dev/null +++ b/src/libapat/Gmach.h @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------- */ +/* Copyright (c) Atelier de BioInformatique */ +/* @file: Gmach.h */ +/* @desc: machine dependant setup */ +/* @+ *should* be included in all ABI softs */ +/* */ +/* @history: */ +/* @+ : Jul 95 : MWC first draft */ +/* @+ : Jan 96 : adapted to Pwg */ +/* @+ : Nov 00 : adapted to Mac_OS_X */ +/* ---------------------------------------------------------------- */ + +#ifndef _H_Gmach + + /* OS names */ + +#define _H_Gmach + + /* Macintosh Classic */ + /* Think C environment */ +#ifdef THINK_C +#define MACINTOSH +#define MAC_OS_C +#endif + + + /* Macintosh Classic */ + /* Code-Warrior */ +#ifdef __MWERKS__ +#define MACINTOSH +#define MAC_OS_C +#endif + + /* Macintosh OS-X */ +#ifdef MAC_OS_X +#define MACINTOSH +#define UNIX +#define UNIX_BSD +#endif + + /* LINUX */ +#ifdef LINUX +#define UNIX +#define UNIX_BSD +#endif + + /* other Unix Boxes */ + /* SunOS / Solaris */ +#ifdef SUN +#define UNIX +#ifdef SOLARIS +#define UNIX_S7 +#else +#define UNIX_BSD +#endif +#endif + + /* SGI Irix */ +#ifdef SGI +#define UNIX +#define UNIX_S7 +#endif + +/* ansi setup */ +/* for unix machines see makefile */ + +#ifndef PROTO +#define PROTO 1 +#endif + +#ifndef ANSI_PROTO +#define ANSI_PROTO PROTO +#endif + +#ifndef ANSI_STR +#define ANSI_STR 1 +#endif + +/* unistd.h header file */ + +#ifdef UNIX +#define HAS_UNISTD_H +#endif + +/* getopt.h header file */ + +#ifdef MAC_OS_C +#define HAS_GETOPT_H "getopt.h" +#endif + +#ifdef SGI +#define HAS_GETOPT_H +#endif + + + +#endif diff --git a/src/libapat/Gtypes.h b/src/libapat/Gtypes.h new file mode 100644 index 0000000..9bf5a93 --- /dev/null +++ b/src/libapat/Gtypes.h @@ -0,0 +1,104 @@ +/* ---------------------------------------------------------------- */ +/* Copyright (c) Atelier de BioInformatique */ +/* @file: Gtypes.h */ +/* @desc: general & machine dependant types */ +/* @+ *should* be included in all ABI softs */ +/* */ +/* @history: */ +/* @+ : Jan 91 : MWC first draft */ +/* @+ : Jul 95 : Gmach addition */ +/* ---------------------------------------------------------------- */ + +#define _H_Gtypes + +#ifndef _H_Gmach +#include "Gmach.h" +#endif + +#ifndef NULL +#include /* is the official NULL here ? */ +#endif + +/* ==================================================== */ +/* constantes */ +/* ==================================================== */ + +#ifndef PROTO +#define PROTO 1 /* prototypes flag */ +#endif + +#ifdef MAC_OS_C +#define Vrai true /* TC boolean values */ +#define Faux false /* */ +#else +#define Vrai 0x1 /* bool values = TRUE */ +#define Faux 0x0 /* = FALSE */ +#endif + +#define Nil NULL /* nil pointer */ + +#define kBigInt16 0x7fff /* plus grand 16 bits signe */ +#define kBigInt32 0x7fffffff /* plus grand 32 bits signe */ +#define kBigUInt16 0xffff /* plus grand 16 bits ~signe */ +#define kBigUInt32 0xffffffff /* plus grand 32 bits ~signe */ + +#ifdef MAC_OS_C +/* ==================================================== */ +/* Types (for Macintosh ThinK C || MWerks) */ +/* ==================================================== */ + + /* --- specific sizes --------- */ +typedef long Int32; /* Int32 = 32 bits signe */ +typedef unsigned long UInt32; /* UInt32 = 32 bits ~signe */ +typedef short Int16; /* Int16 = 16 bits signe */ +typedef unsigned short UInt16; /* UInt32 = 16 bits ~signe */ +typedef char Int8; /* Int8 = 8 bits signe */ +typedef unsigned char UInt8; /* UInt8 = 8 bits ~signe */ + + /* --- default types ---------- */ + +typedef Boolean Bool; /* booleen */ + +typedef long Int; /* 'natural' int (>= 32 bits) */ + +typedef void *Ptr; /* pointeur */ + +#elif ((defined SUN) || (defined SGI) || (defined UNIX)) +/* ==================================================== */ +/* Types (for Sun & Iris - 32 bits machines) */ +/* ==================================================== */ + + /* --- specific sizes --------- */ +typedef int Int32; /* Int32 = 32 bits signe */ +typedef unsigned int UInt32; /* UInt32 = 32 bits ~signe */ +typedef short Int16; /* Int16 = 16 bits signe */ +typedef unsigned short UInt16; /* UInt32 = 16 bits ~signe */ +typedef char Int8; /* Int8 = 8 bits signe */ +typedef unsigned char UInt8; /* UInt8 = 8 bits ~signe */ + + /* --- default types ---------- */ + +typedef int Bool; /* booleen (int for ANSI) */ + +typedef int Int; /* 'natural' int (>= 32 bits) */ + +typedef void *Ptr; /* pointeur */ + +#else +/* ==================================================== */ +/* Types (for undefined machines) */ +/* ==================================================== */ + +#error undefined MACHINE + +#endif + +/* ==================================================== */ +/* special macro for prototypes */ +/* ==================================================== */ + +#if PROTO +#define P(s) s +#else +#define P(s) () +#endif diff --git a/src/libapat/Makefile b/src/libapat/Makefile new file mode 100644 index 0000000..b4dc9be --- /dev/null +++ b/src/libapat/Makefile @@ -0,0 +1,24 @@ + +SOURCES = apat_parse.c \ + apat_search.c \ + libstki.c + +SRCS=$(SOURCES) + + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libapat.a +RANLIB=ranlib + + +include ../global.mk + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libapat/apat.h b/src/libapat/apat.h new file mode 100644 index 0000000..eaa06df --- /dev/null +++ b/src/libapat/apat.h @@ -0,0 +1,173 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat.h */ +/* Purpose: pattern scan */ +/* History: */ +/* 28/12/94 : ascan first version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#ifndef _H_Gtypes +#include "Gtypes.h" +#endif + +#ifndef _H_libstki +#include "libstki.h" +#endif + +#define H_apat + +/* ----------------------------------------------- */ +/* constantes */ +/* ----------------------------------------------- */ + +#ifndef BUFSIZ +#define BUFSIZ 1024 /* io buffer size */ +#endif + +#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */ + +#define ALPHA_LEN 26 /* alphabet length */ + /* *DO NOT* modify */ + +#define MAX_PATTERN 4 /* max # of patterns */ + /* *DO NOT* modify */ + +#define MAX_PAT_LEN 32 /* max pattern length */ + /* *DO NOT* modify */ + +#define MAX_PAT_ERR 32 /* max # of errors */ + /* *DO NOT* modify */ + +#define PATMASK 0x3ffffff /* mask for 26 symbols */ + /* *DO NOT* modify */ + +#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. pos */ + /* *DO NOT* modify */ + + /* mask for position */ +#define ONEMASK 0x80000000 /* mask for highest position */ + + /* masks for Levenhstein edit */ +#define OPER_IDT 0x00000000 /* identity */ +#define OPER_INS 0x40000000 /* insertion */ +#define OPER_DEL 0x80000000 /* deletion */ +#define OPER_SUB 0xc0000000 /* substitution */ + +#define OPER_SHFT 30 /* shift */ + + /* Levenhstein Opcodes */ +#define SOPER_IDT 0x0 /* identity */ +#define SOPER_INS 0x1 /* insertion */ +#define SOPER_DEL 0x2 /* deletion */ +#define SOPER_SUB 0x3 /* substitution */ + + /* Levenhstein Opcodes masks */ +#define OPERMASK 0xc0000000 /* mask for Opcodes */ +#define NOPERMASK 0x3fffffff /* negate of previous */ + + /* special chars in pattern */ +#define PATCHARS "[]!#" + + /* 26 letter alphabet */ + /* in alphabetical order */ + +#define ORD_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + + /* protein alphabet */ + +#define PROT_ALPHA "ACDEFGHIKLMNPQRSTVWY" + + /* dna/rna alphabet */ + +#define DNA_ALPHA "ABCDGHKMNRSTUVWXY" + + +/* ----------------------------------------------- */ +/* data structures */ +/* ----------------------------------------------- */ + + /* -------------------- */ +typedef enum { /* data encoding */ + /* -------------------- */ + alpha = 0, /* [A-Z] */ + dna, /* IUPAC DNA */ + protein /* IUPAC proteins */ +} CodType; + + /* -------------------- */ +typedef struct { /* sequence */ + /* -------------------- */ + char *name; /* sequence name */ + Int32 seqlen; /* sequence length */ + Int32 seqsiz; /* sequence buffer size */ + Int32 datsiz; /* data buffer size */ + Int32 circular; + UInt8 *data; /* data buffer */ + char *cseq; /* sequence buffer */ + StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */ + StackiPtr hiterr[MAX_PATTERN]; /* stack of errors */ +} Seq, *SeqPtr; + + /* -------------------- */ +typedef struct { /* pattern */ + /* -------------------- */ + int patlen; /* pattern length */ + int maxerr; /* max # of errors */ + char *cpat; /* pattern string */ + Int32 *patcode; /* encoded pattern */ + UInt32 *smat; /* S matrix */ + UInt32 omask; /* oblig. bits mask */ + Bool hasIndel; /* are indels allowed */ + Bool ok; /* is pattern ok */ +} Pattern, *PatternPtr; + +/* ----------------------------------------------- */ +/* macros */ +/* ----------------------------------------------- */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ)) +#define FREE(ptr) free((void *) ptr) +#endif + +/* ----------------------------------------------- */ +/* prototypes */ +/* ----------------------------------------------- */ + + /* apat_seq.c */ + +SeqPtr FreeSequence (SeqPtr pseq); +SeqPtr NewSequence (void); +int ReadNextSequence (SeqPtr pseq); +int WriteSequence (FILE *filou , SeqPtr pseq); + + /* apat_parse.c */ + +Int32 *GetCode (CodType ctype); +int CheckPattern (Pattern *ppat); +int EncodePattern (Pattern *ppat, CodType ctype); +int ReadPattern (Pattern *ppat); +void PrintDebugPattern (Pattern *ppat); + + /* apat_search.c */ + +int CreateS (Pattern *ppat, Int32 lalpha); +Int32 ManberNoErr (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 ManberSub (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 ManberIndel (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 ManberAll (Seq *pseq , Pattern *ppat, int patnum,int begin,int length); +Int32 NwsPatAlign (Seq *pseq , Pattern *ppat, Int32 nerr , + Int32 *reslen , Int32 *reserr); + + /* apat_sys.c */ + +float UserCpuTime (int reset); +float SysCpuTime (int reset); +char *StrCpuTime (int reset); +void Erreur (char *msg , int stat); +int AccessFile (char *path, char *mode); + diff --git a/src/libapat/apat_parse.c b/src/libapat/apat_parse.c new file mode 100644 index 0000000..43cda48 --- /dev/null +++ b/src/libapat/apat_parse.c @@ -0,0 +1,369 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: apat_parse.c */ +/* Purpose: Codage du pattern */ +/* History: */ +/* 00/07/94 : first version (stanford) */ +/* 00/11/94 : revised for DNA/PROTEIN */ +/* 30/12/94 : modified EncodePattern */ +/* for manber search */ +/* 14/05/99 : indels added */ +/* ==================================================== */ + +#include +#include +#include +#include + +#include "Gtypes.h" +#include "apat.h" + /* -------------------- */ + /* default char */ + /* encodings */ + /* -------------------- */ + +static Int32 sDftCode[] = { + +#include "CODES/dft_code.h" + +}; + /* -------------------- */ + /* char encodings */ + /* IUPAC */ + /* -------------------- */ + + /* IUPAC Proteins */ +static Int32 sProtCode[] = { + +#include "CODES/prot_code.h" + +}; + /* IUPAC Dna/Rna */ +static Int32 sDnaCode[] = { + +#include "CODES/dna_code.h" + +}; + + +/* -------------------------------------------- */ +/* internal replacement of gets */ +/* -------------------------------------------- */ +static char *sGets(char *buffer, int size) { + + char *ebuf; + + if (! fgets(buffer, size-1, stdin)) + return NULL; + + /* remove trailing line feed */ + + ebuf = buffer + strlen(buffer); + + while (--ebuf >= buffer) { + if ((*ebuf == '\n') || (*ebuf == '\r')) + *ebuf = '\000'; + else + break; + } + + return buffer; +} + +/* -------------------------------------------- */ +/* returns actual code associated to type */ +/* -------------------------------------------- */ + +Int32 *GetCode(CodType ctype) +{ + Int32 *code = sDftCode; + + switch (ctype) { + case dna : code = sDnaCode ; break; + case protein : code = sProtCode ; break; + default : code = sDftCode ; break; + } + + return code; +} + +/* -------------------------------------------- */ + +#define BAD_IF(tst) if (tst) return 0 + +int CheckPattern(Pattern *ppat) +{ + int lev; + char *pat; + + pat = ppat->cpat; + + BAD_IF (*pat == '#'); + + for (lev = 0; *pat ; pat++) + + switch (*pat) { + + case '[' : + BAD_IF (lev); + BAD_IF (*(pat+1) == ']'); + lev++; + break; + + case ']' : + lev--; + BAD_IF (lev); + break; + + case '!' : + BAD_IF (lev); + BAD_IF (! *(pat+1)); + BAD_IF (*(pat+1) == ']'); + break; + + case '#' : + BAD_IF (lev); + BAD_IF (*(pat-1) == '['); + break; + + default : + if (! isupper(*pat)) + return 0; + break; + } + + return (lev ? 0 : 1); +} + +#undef BAD_IF + + +/* -------------------------------------------- */ +static char *skipOblig(char *pat) +{ + return (*(pat+1) == '#' ? pat+1 : pat); +} + +/* -------------------------------------------- */ +static char *splitPattern(char *pat) +{ + switch (*pat) { + + case '[' : + for (; *pat; pat++) + if (*pat == ']') + return skipOblig(pat); + return NULL; + break; + + case '!' : + return splitPattern(pat+1); + break; + + } + + return skipOblig(pat); +} + +/* -------------------------------------------- */ +static Int32 valPattern(char *pat, Int32 *code) +{ + Int32 val; + + switch (*pat) { + + case '[' : + return valPattern(pat+1, code); + break; + + case '!' : + val = valPattern(pat+1, code); + return (~val & PATMASK); + break; + + default : + val = 0x0; + while (isupper(*pat)) { + val |= code[*pat - 'A']; + pat++; + } + return val; + } + + return 0x0; +} + +/* -------------------------------------------- */ +static Int32 obliBitPattern(char *pat) +{ + return (*(pat + strlen(pat) - 1) == '#' ? OBLIBIT : 0x0); +} + + +/* -------------------------------------------- */ +static int lenPattern(char *pat) +{ + int lpat; + + lpat = 0; + + while (*pat) { + + if (! (pat = splitPattern(pat))) + return 0; + + pat++; + + lpat++; + } + + return lpat; +} + +/* -------------------------------------------- */ +/* Interface */ +/* -------------------------------------------- */ + +/* -------------------------------------------- */ +/* encode un pattern */ +/* -------------------------------------------- */ +int EncodePattern(Pattern *ppat, CodType ctype) +{ + int pos, lpat; + Int32 *code; + char *pp, *pa, c; + + ppat->ok = Faux; + + code = GetCode(ctype); + + ppat->patlen = lpat = lenPattern(ppat->cpat); + + if (lpat <= 0) + return 0; + + if (! (ppat->patcode = NEWN(Int32, lpat))) + return 0; + + pa = pp = ppat->cpat; + + pos = 0; + + while (*pa) { + + pp = splitPattern(pa); + + c = *++pp; + + *pp = '\000'; + + ppat->patcode[pos++] = valPattern(pa, code) | obliBitPattern(pa); + + *pp = c; + + pa = pp; + } + + ppat->ok = Vrai; + + return lpat; +} + +/* -------------------------------------------- */ +/* remove blanks */ +/* -------------------------------------------- */ +static char *RemBlanks(char *s) +{ + char *sb, *sc; + + for (sb = sc = s ; *sb ; sb++) + if (! isspace(*sb)) + *sc++ = *sb; + + return s; +} + +/* -------------------------------------------- */ +/* count non blanks */ +/* -------------------------------------------- */ +static Int32 CountAlpha(char *s) +{ + Int32 n; + + for (n = 0 ; *s ; s++) + if (! isspace(*s)) + n++; + + return n; +} + + +/* -------------------------------------------- */ +/* lit un pattern */ +/* #mis */ +/* ligne starting with '/' are comments */ +/* -------------------------------------------- */ +int ReadPattern(Pattern *ppat) +{ + int val; + char *spac; + char buffer[BUFSIZ]; + + ppat->ok = Vrai; + + if (! sGets(buffer, sizeof(buffer))) + return 0; + + if (*buffer == '/') + return ReadPattern(ppat); + + if (! CountAlpha(buffer)) + return ReadPattern(ppat); + + for (spac = buffer ; *spac ; spac++) + if ((*spac == ' ') || (*spac == '\t')) + break; + + ppat->ok = Faux; + + if (! *spac) + return 0; + + if (sscanf(spac, "%d", &val) != 1) + return 0; + + ppat->hasIndel = (val < 0); + + ppat->maxerr = ((val >= 0) ? val : -val); + + *spac = '\000'; + + (void) RemBlanks(buffer); + + if ((ppat->cpat = NEWN(char, strlen(buffer)+1))) + strcpy(ppat->cpat, buffer); + + ppat->ok = (ppat->cpat != NULL); + + return (ppat->ok ? 1 : 0); +} + +/* -------------------------------------------- */ +/* ecrit un pattern - Debug - */ +/* -------------------------------------------- */ +void PrintDebugPattern(Pattern *ppat) +{ + int i; + + printf("Pattern : %s\n", ppat->cpat); + printf("Encoding : \n\t"); + + for (i = 0 ; i < ppat->patlen ; i++) { + printf("0x%8.8x ", ppat->patcode[i]); + if (i%4 == 3) + printf("\n\t"); + } + printf("\n"); +} + diff --git a/src/libapat/apat_search.c b/src/libapat/apat_search.c new file mode 100644 index 0000000..f0dd394 --- /dev/null +++ b/src/libapat/apat_search.c @@ -0,0 +1,339 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat_search.c */ +/* Purpose: recherche du pattern */ +/* algorithme de Baeza-Yates/Gonnet */ +/* Manber (agrep) */ +/* History: */ +/* 07/12/94 : first version */ +/* 28/12/94 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#if 0 +#ifndef THINK_C +#include +#endif +#endif + +#include +#include +#include + +#include "Gtypes.h" +#include "libstki.h" +#include "apat.h" + +#define POP PopiOut +#define PUSH PushiIn +#define TOPCURS CursiToTop +#define DOWNREAD ReadiDown + +#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + +/* -------------------------------------------- */ +/* Construction de la matrice S */ +/* -------------------------------------------- */ + +int CreateS(Pattern *ppat, Int32 lalpha) +{ + Int32 i, j, indx; + UInt32 pindx, amask, omask, *smat; + + ppat->ok = Faux; + + omask = 0x0L; + + if (! (smat = NEWN(UInt32, lalpha))) + return 0; + + for (i = 0 ; i < lalpha ; i++) + smat[i] = 0x0; + + for (i = ppat->patlen - 1, amask = 0x1L ; i >= 0 ; i--, amask <<= 1) { + + indx = ppat->patcode[i]; + + if (ppat->patcode[i] & OBLIBIT) + omask |= amask; + + for (j = 0, pindx = 0x1L ; j < lalpha ; j++, pindx <<= 1) + if (indx & pindx) + smat[j] |= amask; + } + + ppat->smat = smat; + + ppat->omask = omask; + + ppat->ok = Vrai; + + return 1; + +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* NoError */ +/* -------------------------------------------- */ +Int32 ManberNoErr(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + UInt32 pos; + UInt32 smask, r; + UInt8 *data; + StackiPtr *stkpos, *stkerr; + UInt32 end; + + end = begin + length; + end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular); + + + /* create local masks */ + + smask = r = 0x1L << ppat->patlen; + + /* init. scan */ + data = pseq->data + begin; + stkpos = pseq->hitpos + patnum; + stkerr = pseq->hiterr + patnum; + + /* loop on text data */ + + for (pos = begin ; pos < end ; pos++) { + + r = (r >> 1) & ppat->smat[*data++]; + + if (r & 0x1L) { + PUSH(stkpos, pos - ppat->patlen + 1); + PUSH(stkerr, 0); + } + + r |= smask; + } + + return (*stkpos)->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* Substitution only */ +/* */ +/* Note : r array is stored as : */ +/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */ +/* */ +/* -------------------------------------------- */ +Int32 ManberSub(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + int e, emax, found; + UInt32 pos; + UInt32 smask, cmask, sindx; + UInt32 *pr, r[2 * MAX_PAT_ERR + 2]; + UInt8 *data; + StackiPtr *stkpos, *stkerr; + UInt32 end; + + end = begin + length; + end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular); + + /* create local masks */ + emax = ppat->maxerr; + + r[0] = r[1] = 0x0; + + cmask = smask = 0x1L << ppat->patlen; + + for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) + *pr = cmask; + + cmask = ~ ppat->omask; + + /* init. scan */ + data = pseq->data + begin; + stkpos = pseq->hitpos + patnum; + stkerr = pseq->hiterr + patnum; + + /* loop on text data */ + + for (pos = begin ; pos < end ; pos++) { + + sindx = ppat->smat[*data++]; + + for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) { + + pr[2] = pr[3] | smask; + + pr[3] = ((pr[0] >> 1) & cmask) /* sub */ + | ((pr[2] >> 1) & sindx); /* ident */ + + if (pr[3] & 0x1L) { /* found */ + if (! found) { + PUSH(stkpos, pos - ppat->patlen + 1); + PUSH(stkerr, e); + } + found++; + } + } + } + + return (*stkpos)->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* Substitution + Indels */ +/* */ +/* Note : r array is stored as : */ +/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */ +/* */ +/* Warning: may return shifted pos. */ +/* */ +/* -------------------------------------------- */ +Int32 ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + int e, emax, found; + UInt32 pos; + UInt32 smask, cmask, sindx; + UInt32 *pr, r[2 * MAX_PAT_ERR + 2]; + UInt8 *data; + StackiPtr *stkpos, *stkerr; + UInt32 end; + + end = begin + length; + end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular); + + /* create local masks */ + emax = ppat->maxerr; + + r[0] = r[1] = 0x0; + + cmask = smask = 0x1L << ppat->patlen; + + for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) { + *pr = cmask; + cmask = (cmask >> 1) | smask; + } + + cmask = ~ ppat->omask; + + /* init. scan */ + data = pseq->data + begin; + stkpos = pseq->hitpos + patnum; + stkerr = pseq->hiterr + patnum; + + /* loop on text data */ + + for (pos = begin ; pos < end ; pos++) { + + sindx = ppat->smat[*data++]; + + for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) { + + pr[2] = pr[3] | smask; + + pr[3] = (( pr[0] /* ins */ + | (pr[0] >> 1) /* sub */ + | (pr[1] >> 1)) /* del */ + & cmask) + | ((pr[2] >> 1) & sindx); /* ident */ + + if (pr[3] & 0x1L) { /* found */ + if (! found) { + PUSH(stkpos, pos - ppat->patlen + 1); + PUSH(stkerr, e); + } + found++; + } + + } + } + + return (*stkpos)->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* API call to previous functions */ +/* -------------------------------------------- */ +Int32 ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) +{ + if (ppat->maxerr == 0) + return ManberNoErr(pseq, ppat, patnum, begin, length); + else if (ppat->hasIndel) + return ManberIndel(pseq, ppat, patnum, begin, length); + else + return ManberSub(pseq, ppat, patnum, begin, length); +} + + +/* -------------------------------------------- */ +/* Alignement NWS */ +/* pour edition des hits */ +/* (avec substitution obligatoire aux bords) */ +/* -------------------------------------------- */ + +Int32 NwsPatAlign(pseq, ppat, nerr, reslen, reserr) + Seq *pseq; + Pattern *ppat; + Int32 nerr, *reslen, *reserr; +{ + UInt8 *sseq, *px; + Int32 i, j, lseq, lpat, npos, dindel, dsub, + *pc, *pi, *pd, *ps; + UInt32 amask; + + static Int32 sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)]; + + lseq = pseq->seqlen; + + pc = sTab; /* |----|----| --> i */ + pi = pc - 1; /* | ps | pd | | */ + pd = pi - lseq; /* |----|----| | */ + ps = pd - 1; /* | pi | pc | v j */ + /* |---------| */ + + lseq = pseq->seqlen; + lpat = ppat->patlen; + + sseq = pseq->data - 1; + + amask = ONEMASK >> lpat; + + for (j = 0 ; j <= lpat ; j++) { + + for (i = 0 , px = sseq ; i <= lseq ; i++, px++) { + + if (i && j) { + dindel = MIN(*pi, *pd) + 1; + dsub = *ps + KRONECK(ppat->smat[*px], amask); + *pc = MIN(dindel, dsub); + } + else if (i) /* j == 0 */ + *pc = *pi + 1; + else if (j) /* i == 0 */ + *pc = *pd + 1; + else /* root */ + *pc = 0; + + pc++; + pi++; + pd++; + ps++; + } + + amask <<= 1; + } + + pc--; + + for (i = lseq, npos = 0 ; i >= 0 ; i--, pc--) { + if (*pc <= nerr) { + *reslen++ = i; + *reserr++ = *pc; + npos++; + } + } + + return npos; +} diff --git a/src/libapat/libstki.c b/src/libapat/libstki.c new file mode 100644 index 0000000..1ca9868 --- /dev/null +++ b/src/libapat/libstki.c @@ -0,0 +1,379 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.c */ +/* Purpose: A library to deal with 'stacks' of */ +/* integers */ +/* Note: 'stacks' are dynamic (i.e. size is */ +/* automatically readjusted when needed) */ +/* History: */ +/* 00/03/92 : first draft */ +/* 15/08/93 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#include +#include +#include + +#include "Gtypes.h" +#include "libstki.h" + + +/* ============================ */ +/* Constantes et Macros locales */ +/* ============================ */ + +#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1) + +#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1) + + +static Int16 sStkiLastError = kStkiNoErr; + +/* -------------------------------------------- */ +/* gestion des erreurs */ +/* get/reset erreur flag */ +/* */ +/* @function: StkiError */ +/* -------------------------------------------- */ + +Int16 StkiError(Bool reset) +{ + Int16 err; + + err = sStkiLastError; + + if (reset) + sStkiLastError = kStkiNoErr; + + return err; + +} /* end of StkiError */ + +/* -------------------------------------------- */ +/* creation d'un stack */ +/* */ +/* @function: NewStacki */ +/* -------------------------------------------- */ + +StackiPtr NewStacki(Int32 size) +{ + StackiPtr stki; + + if (! (stki = NEW(Stacki))) + return NULL; + + stki->size = size; + stki->top = 0; + stki->cursor = 0; + + if ( ! (stki->val = NEWN(Int32, size))) { + sStkiLastError = kStkiMemErr; + return FreeStacki(stki); + } + + return stki; + +} /* end of NewStacki */ + + +/* -------------------------------------------- */ +/* liberation d'un stack */ +/* */ +/* @function: FreeStacki */ +/* -------------------------------------------- */ + +StackiPtr FreeStacki(StackiPtr stki) +{ + if (stki) { + if (stki->val) + FREE(stki->val); + FREE(stki); + } + + return NULL; + +} /* end of FreeStacki */ + +/* -------------------------------------------- */ +/* creation d'un vecteur de stacks */ +/* */ +/* @function: NewStackiVector */ +/* -------------------------------------------- */ + +StackiHdle NewStackiVector(Int32 vectSize, Int32 stackSize) +{ + Int32 i; + StackiHdle stkh; + + if (! (stkh = NEWN(StackiPtr, vectSize))) { + sStkiLastError = kStkiMemErr; + return NULL; + } + + for (i = 0 ; i < vectSize ; i++) + if (! (stkh[i] = NewStacki(stackSize))) + return FreeStackiVector(stkh, i); + + return stkh; + +} /* end of NewStackiVector */ + + +/* -------------------------------------------- */ +/* liberation d'un vecteur de stacks */ +/* */ +/* @function: FreeStackiVector */ +/* -------------------------------------------- */ + +StackiHdle FreeStackiVector(StackiHdle stkh, Int32 vectSize) +{ + Int32 i; + + if (stkh) { + for (i = 0 ; i < vectSize ; i++) + (void) FreeStacki(stkh[i]); + FREE(stkh); + } + + return NULL; + +} /* end of FreeStackiVector */ + +/* -------------------------------------------- */ +/* resize d'un stack */ +/* */ +/* @function: ResizeStacki */ +/* -------------------------------------------- */ + +Int32 ResizeStacki(StackiHdle stkh, Int32 size) +{ + Int32 resize = 0; /* assume error */ + Int32 *val; + + if ((val = REALLOC(Int32, (*stkh)->val, size))) { + (*stkh)->size = resize = size; + (*stkh)->val = val; + } + + if (! resize) + sStkiLastError = kStkiMemErr; + + return resize; + +} /* end of ResizeStacki */ + +/* -------------------------------------------- */ +/* empilage(/lement) */ +/* */ +/* @function: PushiIn */ +/* -------------------------------------------- */ + +Bool PushiIn(StackiHdle stkh, Int32 val) +{ + if (((*stkh)->top >= (*stkh)->size) && (! ExpandStack(stkh))) + return Faux; + + (*stkh)->val[((*stkh)->top)++] = val; + + return Vrai; + +} /* end of PushiIn */ + +/* -------------------------------------------- */ +/* depilage(/lement) */ +/* */ +/* @function: PopiOut */ +/* -------------------------------------------- */ + +Bool PopiOut(StackiHdle stkh, Int32 *val) +{ + if ((*stkh)->top <= 0) + return Faux; + + *val = (*stkh)->val[--((*stkh)->top)]; + + if ( ((*stkh)->top < ((*stkh)->size >> 1)) + && ((*stkh)->top > kMinStackiSize)) + + (void) ShrinkStack(stkh); + + return Vrai; + +} /* end of PopiOut */ + +/* -------------------------------------------- */ +/* lecture descendante */ +/* */ +/* @function: ReadiDown */ +/* -------------------------------------------- */ + +Bool ReadiDown(StackiPtr stki, Int32 *val) +{ + if (stki->cursor <= 0) + return Faux; + + *val = stki->val[--(stki->cursor)]; + + return Vrai; + +} /* end of ReadiDown */ + +/* -------------------------------------------- */ +/* lecture ascendante */ +/* */ +/* @function: ReadiUp */ +/* -------------------------------------------- */ + +Bool ReadiUp(StackiPtr stki, Int32 *val) +{ + if (stki->cursor >= stki->top) + return Faux; + + *val = stki->val[(stki->cursor)++]; + + return Vrai; + +} /* end of ReadiUp */ + +/* -------------------------------------------- */ +/* remontee/descente du curseur */ +/* */ +/* @function: CursiToTop */ +/* @function: CursiToBottom */ +/* -------------------------------------------- */ + +void CursiToTop(StackiPtr stki) +{ + stki->cursor = stki->top; + +} /* end of CursiToTop */ + +void CursiToBottom(stki) + StackiPtr stki; +{ + stki->cursor = 0; + +} /* end of CursiToBottom */ + +/* -------------------------------------------- */ +/* echange des valeurs cursor <-> (top - 1) */ +/* */ +/* @function: CursiSwap */ +/* -------------------------------------------- */ + +void CursiSwap(StackiPtr stki) +{ + Int32 tmp; + + if ((stki->top <= 0) || (stki->cursor < 0)) + return; + + tmp = stki->val[stki->cursor]; + stki->val[stki->cursor] = stki->val[stki->top - 1]; + stki->val[stki->top - 1] = tmp; + +} /* end of CursiSwap */ + +/* -------------------------------------------- */ +/* Recherche d'une valeur en stack a partir du */ +/* curseur courant en descendant. */ +/* on laisse le curseur a l'endroit trouve */ +/* */ +/* @function: SearchDownStacki */ +/* -------------------------------------------- */ + +Bool SearchDownStacki(StackiPtr stki, Int32 sval) +{ + Int32 val; + Bool more; + + while ((more = ReadiDown(stki, &val))) + if (val == sval) + break; + + return more; + +} /* end of SearchDownStacki */ + +/* -------------------------------------------- */ +/* Recherche dichotomique d'une valeur en stack */ +/* le stack est suppose trie par valeurs */ +/* croissantes. */ +/* on place le curseur a l'endroit trouve */ +/* */ +/* @function: BinSearchStacki */ +/* -------------------------------------------- */ + +Bool BinSearchStacki(StackiPtr stki, Int32 sval) +{ + Int32 midd, low, high, span; + + low = 0; + high = stki->top - 1; + + while (high >= low) { + + midd = (high + low) / 2; + + span = stki->val[midd] - sval; + + if (span == 0) { + stki->cursor = midd; + return Vrai; + } + + if (span > 0) + high = midd - 1; + else + low = midd + 1; + } + + return Faux; + +} /* end of BinSearchStacki */ + +/* -------------------------------------------- */ +/* teste l'egalite *physique* de deux stacks */ +/* */ +/* @function: SameStacki */ +/* -------------------------------------------- */ + +Bool SameStacki(StackiPtr stki1, StackiPtr stki2) +{ + if (stki1->top != stki2->top) + return Faux; + + return ((memcmp(stki1->val, stki2->val, + stki1->top * sizeof(Int32)) == 0) ? Vrai : Faux); + +} /* end of SameStacki */ + + +/* -------------------------------------------- */ +/* inverse l'ordre des elements dans un stack */ +/* */ +/* @function: ReverseStacki */ +/* -------------------------------------------- */ + +Bool ReverseStacki(StackiPtr stki) +{ + Int32 *t, *b, swp; + + if (stki->top <= 0) + return Faux; + + b = stki->val; + t = b + stki->top - 1; + + while (t > b) { + swp = *t; + *t-- = *b; + *b++ = swp; + } + + return Vrai; + +} /* end of ReverseStacki */ + diff --git a/src/libapat/libstki.h b/src/libapat/libstki.h new file mode 100644 index 0000000..6331ae7 --- /dev/null +++ b/src/libapat/libstki.h @@ -0,0 +1,87 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.h */ +/* Purpose: library of dynamic stacks holding */ +/* integer values */ +/* History: */ +/* 00/03/92 : first draft */ +/* 07/07/93 : complete revision */ +/* 10/03/94 : added xxxVector funcs */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#ifndef _H_Gtypes +#include "Gtypes.h" +#endif + +#define _H_libstki + +/* ==================================================== */ +/* Constantes de dimensionnement */ +/* ==================================================== */ + +#ifndef kMinStackiSize +#define kMinStackiSize 2 /* taille mini stack */ +#endif + + +#define kStkiNoErr 0 /* ok */ +#define kStkiMemErr 1 /* not enough memory */ + +#define kStkiReset Vrai +#define kStkiGet Faux + +/* ==================================================== */ +/* Macros standards */ +/* ==================================================== */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ)) +#define FREE(ptr) free((Ptr) ptr) +#endif + + +/* ==================================================== */ +/* Types & Structures de donnees */ +/* ==================================================== */ + + /* -------------------- */ + /* structure : pile */ + /* -------------------- */ +typedef struct Stacki { + /* ---------------------*/ + Int32 size; /* stack size */ + Int32 top; /* current free pos. */ + Int32 cursor; /* current cursor */ + Int32 *val; /* values */ + /* ---------------------*/ +} Stacki, *StackiPtr, **StackiHdle; + + + +/* ==================================================== */ +/* Prototypes (generated by mproto) */ +/* ==================================================== */ + + /* libstki.c */ + +Int16 StkiError (Bool reset ); +StackiPtr NewStacki (Int32 size ); +StackiPtr FreeStacki (StackiPtr stki ); +StackiHdle NewStackiVector (Int32 vectSize, Int32 stackSize ); +StackiHdle FreeStackiVector (StackiHdle stkh, Int32 vectSize ); +Int32 ResizeStacki (StackiHdle stkh , Int32 size ); +Bool PushiIn (StackiHdle stkh , Int32 val ); +Bool PopiOut (StackiHdle stkh , Int32 *val ); +Bool ReadiDown (StackiPtr stki , Int32 *val ); +Bool ReadiUp (StackiPtr stki , Int32 *val ); +void CursiToTop (StackiPtr stki ); +void CursiToBottom (StackiPtr stki ); +void CursiSwap (StackiPtr stki ); +Bool SearchDownStacki (StackiPtr stki , Int32 sval ); +Bool BinSearchStacki (StackiPtr stki , Int32 sval ); +Bool SameStacki (StackiPtr stki1 , StackiPtr stki2 ); +Bool ReverseStacki (StackiPtr stki ); diff --git a/src/libecoPCR/Makefile b/src/libecoPCR/Makefile new file mode 100644 index 0000000..08f3745 --- /dev/null +++ b/src/libecoPCR/Makefile @@ -0,0 +1,31 @@ + +SOURCES = ecoapat.c \ + ecodna.c \ + ecoError.c \ + ecoIOUtils.c \ + ecoMalloc.c \ + ecorank.c \ + ecoseq.c \ + ecotax.c \ + ecofilter.c \ + econame.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libecoPCR.a +RANLIB= ranlib + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libecoPCR/ecoError.c b/src/libecoPCR/ecoError.c new file mode 100644 index 0000000..00bbfa2 --- /dev/null +++ b/src/libecoPCR/ecoError.c @@ -0,0 +1,26 @@ +#include "ecoPCR.h" +#include +#include + +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ +void ecoError(int32_t error, + const char* message, + const char * filename, + int linenumber) +{ + fprintf(stderr,"Error %d in file %s line %d : %s\n", + error, + filename, + linenumber, + message); + + abort(); +} diff --git a/src/libecoPCR/ecoIOUtils.c b/src/libecoPCR/ecoIOUtils.c new file mode 100644 index 0000000..8d7ce82 --- /dev/null +++ b/src/libecoPCR/ecoIOUtils.c @@ -0,0 +1,122 @@ +#include "ecoPCR.h" +#include +#include + +#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \ + (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF)) + + +int32_t is_big_endian() +{ + int32_t i=1; + + return (int32_t)((char*)&i)[0]; +} + + + + +int32_t swap_int32_t(int32_t i) +{ + return SWAPINT32(i); +} + + +/** + * Read part of the file + * @param *f the database + * @param recordSize the size to be read + * + * @return buffer + */ +void *read_ecorecord(FILE *f,int32_t *recordSize) +{ + static void *buffer =NULL; + int32_t buffersize=0; + int32_t read; + + if (!recordSize) + ECOERROR(ECO_ASSERT_ERROR, + "recordSize cannot be NULL"); + + read = fread(recordSize, + 1, + sizeof(int32_t), + f); + + if (feof(f)) + return NULL; + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *recordSize=swap_int32_t(*recordSize); + + if (buffersize < *recordSize) + { + if (buffer) + buffer = ECOREALLOC(buffer,*recordSize, + "Increase size of record buffer"); + else + buffer = ECOMALLOC(*recordSize, + "Allocate record buffer"); + } + + read = fread(buffer, + 1, + *recordSize, + f); + + if (read != *recordSize) + ECOERROR(ECO_IO_ERROR,"Reading record data error"); + + return buffer; +}; + + + + + +/** + * Open the database and check it's readable + * @param filename name of the database (.sdx, .rdx, .tbx) + * @param sequencecount buffer - pointer to variable storing the number of occurence + * @param abort_on_open_error boolean to define the behaviour in case of error + * while opening the database + * @return FILE type + **/ +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error) +{ + FILE *f; + int32_t read; + + f = fopen(filename,"rb"); + + if (!f) + { + if (abort_on_open_error) + ECOERROR(ECO_IO_ERROR,"Cannot open file"); + else + { + *sequencecount=0; + return NULL; + } + } + + read = fread(sequencecount, + 1, + sizeof(int32_t), + f); + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *sequencecount=swap_int32_t(*sequencecount); + + return f; +} + diff --git a/src/libecoPCR/ecoMalloc.c b/src/libecoPCR/ecoMalloc.c new file mode 100644 index 0000000..0ea8d3b --- /dev/null +++ b/src/libecoPCR/ecoMalloc.c @@ -0,0 +1,79 @@ +#include "ecoPCR.h" +#include + +static int eco_log_malloc = 0; + +void eco_trace_memory_allocation() +{ + eco_log_malloc=1; +} + +void eco_untrace_memory_allocation() +{ + eco_log_malloc=0; +} + + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line) +{ + void * chunk; + + chunk = calloc(1,chunksize); + + if (!chunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment located at %p of size %d is allocated (file : %s [%d])", + chunk, + chunksize, + filename, + line); + + return chunk; +} + +void *eco_realloc(void *chunk, + int32_t newsize, + const char *error_message, + const char *filename, + int32_t line) +{ + void *newchunk; + + newchunk = realloc(chunk,newsize); + + if (!newchunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + fprintf(stderr, + "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", + chunk, + newchunk, + newsize, + filename, + line); + + return newchunk; +} + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line) +{ + free(chunk); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment %p is released => %s (file : %s [%d])", + chunk, + error_message, + filename, + line); +} diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h new file mode 100644 index 0000000..3e13b18 --- /dev/null +++ b/src/libecoPCR/ecoPCR.h @@ -0,0 +1,269 @@ +#ifndef ECOPCR_H_ +#define ECOPCR_H_ + +#include +#include + +#ifndef H_apat +#include "../libapat/apat.h" +#endif + +/***************************************************** + * + * Data type declarations + * + *****************************************************/ + +/* + * + * Sequence types + * + */ + +typedef struct { + + int32_t taxid; + char AC[20]; + int32_t DE_length; + int32_t SQ_length; + int32_t CSQ_length; + + char data[1]; + +} ecoseqformat_t; + +typedef struct { + int32_t taxid; + int32_t SQ_length; + char *AC; + char *DE; + char *SQ; +} ecoseq_t; + +/* + * + * Taxonomy taxon types + * + */ + + +typedef struct { + int32_t taxid; + int32_t rank; + int32_t parent; + int32_t namelength; + char name[1]; + +} ecotxformat_t; + +typedef struct ecotxnode { + int32_t taxid; + int32_t rank; + struct ecotxnode *parent; + char *name; +} ecotx_t; + +typedef struct { + int32_t count; + ecotx_t taxon[1]; +} ecotxidx_t; + + +/* + * + * Taxonomy rank types + * + */ + +typedef struct { + int32_t count; + char* label[1]; +} ecorankidx_t; + +/* + * + * Taxonomy name types + * + */ + +typedef struct { + int32_t is_scientificname; + int32_t namelength; + int32_t classlength; + int32_t taxid; + char names[1]; +} econameformat_t; + + + typedef struct { + char *name; + char *classname; + int32_t is_scientificname; + struct ecotxnode *taxon; +} econame_t; + + +typedef struct { + int32_t count; + econame_t names[1]; +} econameidx_t; + + + typedef struct { + ecorankidx_t *ranks; + econameidx_t *names; + ecotxidx_t *taxons; +} ecotaxonomy_t; + + +/***************************************************** + * + * Function declarations + * + *****************************************************/ + +/* + * + * Low level system functions + * + */ + +int32_t is_big_endian(); +int32_t swap_int32_t(int32_t); + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + + +void *eco_realloc(void *chunk, + int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line); + +void eco_trace_memory_allocation(); +void eco_untrace_memory_allocation(); + +#define ECOMALLOC(size,error_message) \ + eco_malloc((size),(error_message),__FILE__,__LINE__) + +#define ECOREALLOC(chunk,size,error_message) \ + eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__) + +#define ECOFREE(chunk,error_message) \ + eco_free((chunk),(error_message),__FILE__,__LINE__) + + + + +/* + * + * Error managment + * + */ + + +void ecoError(int32_t,const char*,const char *,int); + +#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__) + +#define ECO_IO_ERROR (1) +#define ECO_MEM_ERROR (2) +#define ECO_ASSERT_ERROR (3) +#define ECO_NOTFOUND_ERROR (4) + + +/* + * + * Low level Disk access functions + * + */ + +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error); + +void *read_ecorecord(FILE *,int32_t *recordSize); + + + +/* + * Read function in internal binary format + */ + +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount); + +ecoseq_t *readnext_ecoseq(FILE *); + +ecorankidx_t *read_rankidx(const char *filename); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy); + + + + /** + * Read taxonomy data as formated by the ecoPCRFormat.py script. + * + * This function is normaly uses internaly by the read_taxonomy + * function and should not be called directly. + * + * @arg filename path to the *.tdx file of the reformated db + * + * @return pointer to a taxonomy index structure + */ + +ecotxidx_t *read_taxonomyidx(const char *filename); + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName); + +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid); + +int eco_isundertaxon(ecotx_t *taxon, int other_taxid); + +ecoseq_t *ecoseq_iterator(const char *prefix); + + + +ecoseq_t *new_ecoseq(); +int32_t delete_ecoseq(ecoseq_t *); +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid + ); + + +int32_t delete_taxon(ecotx_t *taxon); +int32_t delete_taxonomy(ecotxidx_t *index); + + +int32_t rank_index(const char* label,ecorankidx_t* ranks); + +int32_t delete_apatseq(SeqPtr pseq); +PatternPtr buildPattern(const char *pat, int32_t error_max); +PatternPtr complementPattern(PatternPtr pat); + +SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular); + +char *ecoComplementPattern(char *nucAcSeq); +char *ecoComplementSequence(char *nucAcSeq); +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end); + +ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); + +int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid); +int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid); + +#endif /*ECOPCR_H_*/ diff --git a/src/libecoPCR/ecoapat.c b/src/libecoPCR/ecoapat.c new file mode 100644 index 0000000..79a722e --- /dev/null +++ b/src/libecoPCR/ecoapat.c @@ -0,0 +1,199 @@ +#include "../libapat/libstki.h" +#include "../libapat/apat.h" + +#include "ecoPCR.h" + +#include + +static void EncodeSequence(SeqPtr seq); +static void UpperSequence(char *seq); + +/* -------------------------------------------- */ +/* uppercase sequence */ +/* -------------------------------------------- */ + +#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z')) +#define TO_UPPER(c) ((c) - 'a' + 'A') + +void UpperSequence(char *seq) +{ + char *cseq; + + for (cseq = seq ; *cseq ; cseq++) + if (IS_LOWER(*cseq)) + *cseq = TO_UPPER(*cseq); +} + +#undef IS_LOWER +#undef TO_UPPER + + + + +/* -------------------------------------------- */ +/* encode sequence */ +/* IS_UPPER is slightly faster than isupper */ +/* -------------------------------------------- */ + +#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) + + + +void EncodeSequence(SeqPtr seq) +{ + int i; + UInt8 *data; + char *cseq; + + data = seq->data; + cseq = seq->cseq; + + while (*cseq) { + + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + data++; + cseq++; + } + + for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++) + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + + for (i = 0 ; i < MAX_PATTERN ; i++) + seq->hitpos[i]->top = seq->hiterr[i]->top = 0; + +} + +#undef IS_UPPER + + +SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular) +{ + int i; + + if (!out) + { + out = ECOMALLOC(sizeof(Seq), + "Error in Allocation of a new Seq structure"); + + for (i = 0 ; i < MAX_PATTERN ; i++) + { + + if (! (out->hitpos[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation"); + + if (! (out->hiterr[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation"); + } + } + + + out->name = in->AC; + out->seqsiz = out->seqlen = in->SQ_length; + out->circular = circular; + + if (!out->data) + { + out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8), + "Error in Allocation of a new Seq data member"); + out->datsiz= out->seqlen+circular; + } + else if ((out->seqlen +circular) >= out->datsiz) + { + out->data = ECOREALLOC(out->data,(out->seqlen+circular), + "Error during Seq data buffer realloc"); + out->datsiz= out->seqlen+circular; + } + + out->cseq = in->SQ; + + EncodeSequence(out); + + return out; +} + +int32_t delete_apatseq(SeqPtr pseq) +{ + int i; + + if (pseq) { + + if (pseq->data) + ECOFREE(pseq->data,"Freeing sequence data buffer"); + + for (i = 0 ; i < MAX_PATTERN ; i++) { + if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]); + if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]); + } + + ECOFREE(pseq,"Freeing apat sequence structure"); + + return 0; + } + + return 1; +} + +PatternPtr buildPattern(const char *pat, int32_t error_max) +{ + PatternPtr pattern; + int32_t patlen; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= Faux; + pattern->maxerr = error_max; + patlen = strlen(pat); + + pattern->cpat = ECOMALLOC(sizeof(char)*patlen+1, + "Error in sequence pattern allocation"); + + strncpy(pattern->cpat,pat,patlen); + pattern->cpat[patlen]=0; + UpperSequence(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} + +PatternPtr complementPattern(PatternPtr pat) +{ + PatternPtr pattern; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= pat->hasIndel; + pattern->maxerr = pat->maxerr; + pattern->patlen = pat->patlen; + + pattern->cpat = ECOMALLOC(sizeof(char)*(strlen(pat->cpat)+1), + "Error in sequence pattern allocation"); + + strcpy(pattern->cpat,pat->cpat); + + ecoComplementPattern(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} diff --git a/src/libecoPCR/ecodna.c b/src/libecoPCR/ecodna.c new file mode 100644 index 0000000..7d29a0e --- /dev/null +++ b/src/libecoPCR/ecodna.c @@ -0,0 +1,153 @@ +#include +#include "ecoPCR.h" + +/* + * @doc: DNA alphabet (IUPAC) + */ +#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" + +/* + * @doc: complementary DNA alphabet (IUPAC) + */ +#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" + + +static char sNuc[] = LX_BIO_DNA_ALPHA; +static char sAnuc[] = LX_BIO_CDNA_ALPHA; + +static char LXBioBaseComplement(char nucAc); +static char *LXBioSeqComplement(char *nucAcSeq); +static char *reverseSequence(char *str,char isPattern); + + +/* ---------------------------- */ + +char LXBioBaseComplement(char nucAc) +{ + char *c; + + if ((c = strchr(sNuc, nucAc))) + return sAnuc[(c - sNuc)]; + else + return nucAc; +} + +/* ---------------------------- */ + +char *LXBioSeqComplement(char *nucAcSeq) +{ + char *s; + + for (s = nucAcSeq ; *s ; s++) + *s = LXBioBaseComplement(*s); + + return nucAcSeq; +} + + +char *reverseSequence(char *str,char isPattern) +{ + char *sb, *se, c; + + if (! str) + return str; + + sb = str; + se = str + strlen(str) - 1; + + while(sb <= se) { + c = *sb; + *sb++ = *se; + *se-- = c; + } + + sb = str; + se = str + strlen(str) - 1; + + if (isPattern) + for (;sb < se; sb++) + { + if (*sb=='#') + { + if (((se - sb) > 2) && (*(sb+2)=='!')) + { + *sb='!'; + sb+=2; + *sb='#'; + } + else + { + *sb=*(sb+1); + sb++; + *sb='#'; + } + } + else if (*sb=='!') + { + *sb=*(sb-1); + *(sb-1)='!'; + } + } + + return str; +} + +char *ecoComplementPattern(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),1); +} + +char *ecoComplementSequence(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),0); +} + + +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end) +{ + static char *buffer = NULL; + static int32_t buffSize= 0; + int32_t length; + + if (begin < end) + { + length = end - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + + strncpy(buffer,nucAcSeq + begin,length); + buffer[length]=0; + } + else + { + length = end + strlen(nucAcSeq) - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + strncpy(buffer,nucAcSeq+begin,length - end); + strncpy(buffer+(length-end),nucAcSeq ,end); + buffer[length]=0; + } + + return buffer; +} + diff --git a/src/libecoPCR/ecofilter.c b/src/libecoPCR/ecofilter.c new file mode 100644 index 0000000..8a7dbb1 --- /dev/null +++ b/src/libecoPCR/ecofilter.c @@ -0,0 +1,19 @@ +#include "ecoPCR.h" + +int eco_is_taxid_included( ecotaxonomy_t *taxonomy, + int32_t *restricted_taxid, + int32_t tab_len, + int32_t taxid) +{ + int i; + ecotx_t *taxon; + + taxon = eco_findtaxonbytaxid(taxonomy, taxid); + + for (i=0; i < tab_len; i++) + if ( (taxon->taxid == restricted_taxid[i]) || + (eco_isundertaxon(taxon, restricted_taxid[i])) ) + return 1; + + return 0; +} \ No newline at end of file diff --git a/src/libecoPCR/econame.c b/src/libecoPCR/econame.c new file mode 100644 index 0000000..835d79c --- /dev/null +++ b/src/libecoPCR/econame.c @@ -0,0 +1,61 @@ +#include "ecoPCR.h" +#include +#include + +static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy) +{ + + int32_t count; + FILE *f; + econameidx_t *indexname; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names"); + + indexname->count=count; + + for (i=0; i < count; i++){ + readnext_econame(f,(indexname->names)+i,taxonomy); + } + + return indexname; +} + +econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy) +{ + + econameformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->is_scientificname = swap_int32_t(raw->is_scientificname); + raw->namelength = swap_int32_t(raw->namelength); + raw->classlength = swap_int32_t(raw->classlength); + raw->taxid = swap_int32_t(raw->taxid); + } + + name->is_scientificname=raw->is_scientificname; + + name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name"); + strncpy(name->name,raw->names,raw->namelength); + name->name[raw->namelength]=0; + + name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname"); + strncpy(name->classname,(raw->names+raw->namelength),raw->classlength); + name->classname[raw->classlength]=0; + + name->taxon = taxonomy->taxons->taxon + raw->taxid; + + return name; +} + diff --git a/src/libecoPCR/ecorank.c b/src/libecoPCR/ecorank.c new file mode 100644 index 0000000..4796088 --- /dev/null +++ b/src/libecoPCR/ecorank.c @@ -0,0 +1,52 @@ +#include "ecoPCR.h" +#include +#include + +static int compareRankLabel(const void *label1, const void *label2); + +ecorankidx_t *read_rankidx(const char *filename) +{ + int32_t count; + FILE *f; + ecorankidx_t *index; + int32_t i; + int32_t rs; + char *buffer; + + f = open_ecorecorddb(filename,&count,1); + + index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1), + "Allocate rank index"); + + index->count=count; + + for (i=0; i < count; i++) + { + buffer = read_ecorecord(f,&rs); + index->label[i]=(char*) ECOMALLOC(rs+1, + "Allocate rank label"); + strncpy(index->label[i],buffer,rs); + } + + return index; +} + +int32_t rank_index(const char* label,ecorankidx_t* ranks) +{ + char **rep; + + rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel); + + if (rep) + return rep-ranks->label; + else + ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found"); + + return -1; +} + + +int compareRankLabel(const void *label1, const void *label2) +{ + return strcmp((const char*)label1,*(const char**)label2); +} diff --git a/src/libecoPCR/ecoseq.c b/src/libecoPCR/ecoseq.c new file mode 100644 index 0000000..2ec2614 --- /dev/null +++ b/src/libecoPCR/ecoseq.c @@ -0,0 +1,223 @@ +#include "ecoPCR.h" +#include +#include +#include +#include +#include + +static FILE *open_seqfile(const char *prefix,int32_t index); + + +ecoseq_t *new_ecoseq() +{ + void *tmp; + + tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure"); + + return tmp; +} + +int32_t delete_ecoseq(ecoseq_t * seq) +{ + + if (seq) + { + if (seq->AC) + ECOFREE(seq->AC,"Free sequence AC"); + + if (seq->DE) + ECOFREE(seq->DE,"Free sequence DE"); + + if (seq->SQ) + ECOFREE(seq->SQ,"Free sequence SQ"); + + ECOFREE(seq,"Free sequence structure"); + + return 0; + + } + + return 1; +} + +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid_idx + ) +{ + ecoseq_t *tmp; + int32_t lstr; + tmp = new_ecoseq(); + + tmp->taxid=taxid_idx; + + if (AC) + { + lstr =strlen(AC); + tmp->AC=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence accession"); + strcpy(tmp->AC,AC); + } + + if (DE) + { + lstr =strlen(DE); + tmp->DE=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence definition"); + strcpy(tmp->DE,DE); + } + + if (SQ) + { + lstr =strlen(SQ); + tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence data"); + strcpy(tmp->SQ,SQ); + } + return tmp; + +} + +/** + * ?? used ?? + **/ +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount) +{ + return open_ecorecorddb(filename,sequencecount,1); +} + +ecoseq_t *readnext_ecoseq(FILE *f) +{ + char *compressed=NULL; + + ecoseqformat_t *raw; + ecoseq_t *seq; + int32_t comp_status; + unsigned long int seqlength; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->CSQ_length = swap_int32_t(raw->CSQ_length); + raw->DE_length = swap_int32_t(raw->DE_length); + raw->SQ_length = swap_int32_t(raw->SQ_length); + raw->taxid = swap_int32_t(raw->taxid); + } + + seq = new_ecoseq(); + + seq->taxid = raw->taxid; + + seq->AC = ECOMALLOC(strlen(raw->AC) +1, + "Allocate Sequence Accesion number"); + strncpy(seq->AC,raw->AC,strlen(raw->AC)); + + + seq->DE = ECOMALLOC(raw->DE_length+1, + "Allocate Sequence definition"); + strncpy(seq->DE,raw->data,raw->DE_length); + + seqlength = seq->SQ_length = raw->SQ_length; + + compressed = raw->data + raw->DE_length; + + seq->SQ = ECOMALLOC(seqlength+1, + "Allocate sequence buffer"); + + comp_status = uncompress((unsigned char*)seq->SQ, + &seqlength, + (unsigned char*)compressed, + raw->CSQ_length); + + if (comp_status != Z_OK) + ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data"); + + return seq; +} + +/** + * Open the sequences database (.sdx file) + * @param prefix name of the database (radical without extension) + * @param index integer + * + * @return file object + */ +FILE *open_seqfile(const char *prefix,int32_t index) +{ + char filename_buffer[1024]; + int32_t filename_length; + FILE *input; + int32_t seqcount; + + filename_length = snprintf(filename_buffer, + 1023, + "%s_%03d.sdx", + prefix, + index); + + fprintf(stderr,"# Coucou %s\n",filename_buffer); + + + if (filename_length >= 1024) + ECOERROR(ECO_ASSERT_ERROR,"file name is too long"); + + filename_buffer[filename_length]=0; + + input=open_ecorecorddb(filename_buffer,&seqcount,0); + + if (input) + fprintf(stderr,"# Reading file %s containing %d sequences...\n", + filename_buffer, + seqcount); + + return input; +} + +ecoseq_t *ecoseq_iterator(const char *prefix) +{ + static FILE *current_seq_file= NULL; + static int32_t current_file_idx = 1; + static char current_prefix[1024]; + ecoseq_t *seq; + + if (prefix) + { + current_file_idx = 1; + + if (current_seq_file) + fclose(current_seq_file); + + strncpy(current_prefix,prefix,1023); + current_prefix[1024]=0; + + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + if (!current_seq_file) + return NULL; + + } + + seq = readnext_ecoseq(current_seq_file); + + if (!seq && feof(current_seq_file)) + { + current_file_idx++; + fclose(current_seq_file); + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + + if (current_seq_file) + seq = readnext_ecoseq(current_seq_file); + } + + return seq; +} \ No newline at end of file diff --git a/src/libecoPCR/ecotax.c b/src/libecoPCR/ecotax.c new file mode 100644 index 0000000..89ac82b --- /dev/null +++ b/src/libecoPCR/ecotax.c @@ -0,0 +1,329 @@ +#include "ecoPCR.h" +#include +#include +#include + +static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); + + /** + * Open the taxonomy database + * @param pointer to the database (.tdx file) + * @return a ecotxidx_t structure + */ +ecotxidx_t *read_taxonomyidx(const char *filename) +{ + int32_t count; + FILE *f; + ecotxidx_t *index; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1), + "Allocate taxonomy"); + + index->count=count; + for (i=0; i < count; i++){ + readnext_ecotaxon(f,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (int32_t)index->taxon[i].parent; + } + return index; +} + + +int32_t delete_taxonomy(ecotxidx_t *index) +{ + int32_t i; + + if (index) + { + for (i=0; i< index->count; i++) + if (index->taxon[i].name) + ECOFREE(index->taxon[i].name,"Free scientific name"); + + ECOFREE(index,"Free Taxonomy"); + + return 0; + } + + return 1; +} + + + +int32_t delete_taxon(ecotx_t *taxon) +{ + if (taxon) + { + if (taxon->name) + ECOFREE(taxon->name,"Free scientific name"); + + ECOFREE(taxon,"Free Taxon"); + + return 0; + } + + return 1; +} + + +/** + * Read the database for a given taxon a save the data + * into the taxon structure(if any found) + * @param *f pointer to FILE type returned by fopen + * @param *taxon pointer to the structure + * + * @return a ecotx_t structure if any taxon found else NULL + */ +ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) +{ + + ecotxformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->namelength = swap_int32_t(raw->namelength); + raw->parent = swap_int32_t(raw->parent); + raw->rank = swap_int32_t(raw->rank); + raw->taxid = swap_int32_t(raw->taxid); + } + + taxon->parent = (ecotx_t*)raw->parent; + taxon->taxid = raw->taxid; + taxon->rank = raw->rank; + + taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char), + "Allocate taxon scientific name"); + + strncpy(taxon->name,raw->name,raw->namelength); + + return taxon; +} + + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName) +{ + ecotaxonomy_t *tax; + char *filename; + int buffsize; + + tax = ECOMALLOC(sizeof(ecotaxonomy_t), + "Allocate taxonomy structure"); + + buffsize = strlen(prefix)+10; + + filename = ECOMALLOC(buffsize, + "Allocate filename"); + + snprintf(filename,buffsize,"%s.rdx",prefix); + + tax->ranks = read_rankidx(filename); + + snprintf(filename,buffsize,"%s.tdx",prefix); + + tax->taxons = read_taxonomyidx(filename); + + if (readAlternativeName) + { + snprintf(filename,buffsize,"%s.ndx",prefix); + tax->names=read_nameidx(filename,tax); + } + else + tax->names=NULL; + return tax; + +} + + + +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) +{ + if (taxonomy) + { + if (taxonomy->ranks) + ECOFREE(taxonomy->ranks,"Free rank index"); + + if (taxonomy->taxons) + ECOFREE(taxonomy->taxons,"Free taxon index"); + + ECOFREE(taxonomy,"Free taxonomy structure"); + + return 0; + } + + return 1; +} + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, + int32_t rankidx) +{ + ecotx_t *current_taxon; + ecotx_t *next_taxon; + + current_taxon = taxon; + next_taxon = current_taxon->parent; + + while ((current_taxon!=next_taxon) && // I' am the root node + (current_taxon->rank!=rankidx)) + { + current_taxon = next_taxon; + next_taxon = current_taxon->parent; + } + + if (current_taxon->rank==rankidx) + return current_taxon; + else + return NULL; +} + +/** + * Get back information concerning a taxon from a taxonomic id + * @param *taxonomy the taxonomy database + * @param taxid the taxonomic id + * + * @result a ecotx_t structure containing the taxonimic information + **/ +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, + int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t taxoncount; + int32_t i; + + taxoncount=taxonomy->taxons->count; + + for (current_taxon=taxonomy->taxons->taxon, + i=0; + i < taxoncount; + i++, + current_taxon++){ + if (current_taxon->taxid==taxid){ + return current_taxon; + } + } + + return (ecotx_t*)NULL; +} + +/** + * Find out if taxon is son of other taxon (identified by its taxid) + * @param *taxon son taxon + * @param parent_taxid taxonomic id of the other taxon + * + * @return 1 is the other taxid math a parent taxid, else 0 + **/ +int eco_isundertaxon(ecotx_t *taxon, + int other_taxid) +{ + ecotx_t *next_parent; + + next_parent = taxon->parent; + + while ( (other_taxid != next_parent->taxid) && + (strcmp(next_parent->name, "root")) ) + { + next_parent = next_parent->parent; + } + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + +ecotx_t *eco_getspecies(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("species",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getgenus(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("genus",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + + +ecotx_t *eco_getfamily(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("family",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("kingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("superkingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} \ No newline at end of file diff --git a/src/libtm/tm.c b/src/libtm/tm.c new file mode 100644 index 0000000..b143ea5 --- /dev/null +++ b/src/libtm/tm.c @@ -0,0 +1,31 @@ + +/** + * + * J Jr SantaLucia. + * A uniÞed view of polymer, dumbbell, and oligonucleotide + * dna nearest-neighbor thermodynamics. + * Proc Natl Acad Sci U S A, 95(4):1460Ð1465, 1998 Feb 17. + */ + + +//Nearest-neighbor sequence +//(5'-3'/5'-3') deltaH deltaS +// kcal/mol cal/(moláK) +//AA/TT -7.9 -22.2 +//AG/CT -7.8 -21.0 +//AT/AT -7.2 -20.4 +//AC/GT -8.4 -22.4 +//GA/TC -8.2 -22.2 +//GG/CC -8.0 -19.9 +//GC/GC -9.8 -24.4 +//TA/TA -7.2 -21.3 +//TG/CA -8.5 -22.7 +//CG/CG -10.6 -27.2 +//Terminal A-T base pair 2.3 4.1 +//Terminal G-C base pair 0.1 -2.8 + + +float nearestNeighborTm(const char *oligo,float probe,float target) +{ + +}