commit 9e33d6a180500c3f2a88c3a577463a7833f81a37 Author: Eric Coissac Date: Wed Jan 13 10:23:54 2016 +0100 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e4e41cd --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/man/ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..1dcdb34 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,26 @@ +Package: ROBIBarcodes +Type: Package +Title: Metabarcoding barcode database +Version: 0.1 +Date: 2013-10-24 +Author: LECA - Laboratoire d'ecologie alpine +Maintainer: LECA OBITools team +Description: More about what it does (maybe more than one line) +License: CeCILL v2.0 +LazyLoad: yes +RoxygenNote: 5.0.1 +Collate: + 'ROBIBarcodes.R' + 'xmlMods.R' + 'bibliography.R' + 'ecopcr.R' + 'emptydb.R' + 'logo.R' + 'mismatchplot.R' + 'piexy.R' + 'primer_table.R' + 'primers.R' + 'resolution.R' + 'taxonomy.R' + 'taxonomy_table.R' + 'xmlspare.R' diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..08c91fd --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,28 @@ +# Generated by roxygen2: do not edit by hand + +export(add.reference.barcodedb) +export(add.taxon.barcodedb) +export(addmodsnamespace) +export(bezier3) +export(bib2mods) +export(dna.shanon) +export(dnalogo) +export(dnalogoplot) +export(ecopcr.forward.frequencies) +export(ecopcr.forward.shanon) +export(ecopcr.reverse.frequencies) +export(ecopcr.reverse.shanon) +export(has.bibliography) +export(metabarcodedb) +export(metabarcodedb.taxonomy) +export(mismatchplot) +export(pie.xy) +export(plotDNAletter) +export(primers.data.frame) +export(read.ecopcr.result) +export(resolution) +export(taxonomy.data.frame) +export(whitepaper) +import(ROBITaxonomy) +import(XML) +useDynLib(ROBIBarcodes) diff --git a/R/ROBIBarcodes.R b/R/ROBIBarcodes.R new file mode 100644 index 0000000..4fa6e6d --- /dev/null +++ b/R/ROBIBarcodes.R @@ -0,0 +1,25 @@ +#' A package to manipulate DNA metabarcoding data. +#' +#' This package was written as a following of the OBITools. +#' +#' \tabular{ll}{ +#' Package: \tab ROBIBarcodes\cr +#' Type: \tab Package\cr +#' Version: \tab 0.1\cr +#' Date: \tab 2013-06-27\cr +#' License: \tab CeCILL 2.0\cr +#' LazyLoad: \tab yes\cr +#'} +#' +#' @name ROBIBarcodes-package +#' @aliases ROBIBarcodes +#' @docType package +#' @title A package to manipulate DNA metabarcoding marker database. +#' @author Frederic Boyer +#' @author Aurelie Bonin +#' @author Lucie Zinger +#' @author Eric Coissac +#' +#' @references http://metabarcoding.org/obitools +#' +NULL diff --git a/R/bibliography.R b/R/bibliography.R new file mode 100644 index 0000000..6de0e70 --- /dev/null +++ b/R/bibliography.R @@ -0,0 +1,45 @@ +#'@include xmlMods.R +#'@import XML +#' +NULL + +#' Tests if the metabarcode database has at least one bibliography reference +#' +#' @export +has.bibliography = function(barcodedb) { + length(getNodeSet(barcodedb, + path='/obi:obimetabarcodedb/obi:bibliography', + c(obi="http://metabarcoding.org/OBIMetabarcodes")))>0 +} + +#' @export +add.reference.barcodedb = function(barcodedb,bibfile,bibutils='bib2xml') { + + if (! has.bibliography(barcodedb)) { + # We create the bibliography node + + metabarcode = getNodeSet(barcodedb, + path='/obi:obimetabarcodedb', + c(obi="http://metabarcoding.org/OBIMetabarcodes"))[[1]] + + spare = sparexmltree() + + bibliography = getNodeSet(spare, + path='/obi:obimetabarcodedb/obi:bibliography', + c(obi="http://metabarcoding.org/OBIMetabarcodes"))[[1]] + + bibliography = xmlClone(bibliography) + + addChildren(metadata,bibliography, + at = NA) + + + } + + bibliography=getNodeSet(barcodedb, + path='/obi:obimetabarcodedb/obi:bibliography', + c(obi="http://metabarcoding.org/OBIMetabarcodes")) + + ref = bib2mods(bibfile,bibutils) + hidden=addChildren(bibliography[[1]],kids=ref) +} \ No newline at end of file diff --git a/R/ecopcr.R b/R/ecopcr.R new file mode 100644 index 0000000..874091e --- /dev/null +++ b/R/ecopcr.R @@ -0,0 +1,162 @@ +#'@include ROBIBarcodes.R +NULL + +# column 1 : accession number +# column 2 : sequence length +# column 3 : taxonomic id +# column 4 : rank +# column 5 : species taxonomic id +# column 6 : scientific name +# column 7 : genus taxonomic id +# column 8 : genus name +# column 9 : family taxonomic id +# column 10 : family name +# column 11 : super kingdom taxonomic id +# column 12 : super kingdom name +# column 13 : strand (direct or reverse) +# column 14 : first oligonucleotide +# column 15 : number of errors for the first strand +# column 16 : Tm for hybridization of primer 1 at this site +# column 17 : second oligonucleotide +# column 18 : number of errors for the second strand +# column 19 : Tm for hybridization of primer 1 at this site +# column 20 : amplification length +# column 21 : sequence +# column 22 : definition + +#' Read the result file produced by the ecoPCR program. +#' +#' @export +read.ecopcr.result = function(file) +{ + + split.line = function(line) { + l = strsplit(line,split=" +\\| +")[[1]] + l = c(l[1:21],paste(l[-c(1:21)],sep="|")) + return(l) + } + + if (missing(file) && !missing(text)) { + file <- textConnection(text) + on.exit(close(file)) + } + if (is.character(file)) { + file <- file(file, "rt") + on.exit(close(file)) + } + if (!inherits(file, "connection")) + stop("'file' must be a character string or connection") + if (!isOpen(file, "rt")) { + open(file, "rt") + on.exit(close(file)) + } + + line = readLines(file,1) + while (length(grep('^#',line))==1) { + line = readLines(file,1) + } + + pushBack(line,file) + + lines = lapply(readLines(file),split.line) + nlines = length(lines) + AC = sapply(1:nlines,function(x) lines[[x]][1]) + seq_length = as.integer(sapply(1:nlines,function(x) lines[[x]][2])) + taxid = as.integer(sapply(1:nlines,function(x) lines[[x]][3])) + rank = as.factor(sapply(1:nlines,function(x) lines[[x]][4])) + species = type.convert(sapply(1:nlines,function(x) lines[[x]][5]),na.string="###") + species_name = sapply(1:nlines,function(x) lines[[x]][6]) + genus = type.convert(sapply(1:nlines,function(x) lines[[x]][7]),na.string="###") + genus_name = sapply(1:nlines,function(x) lines[[x]][8]) + family = type.convert(sapply(1:nlines,function(x) lines[[x]][9]),na.string="###") + family_name = sapply(1:nlines,function(x) lines[[x]][10]) + superkingdom = type.convert(sapply(1:nlines,function(x) lines[[x]][11]),na.string="###") + superkingdom_name = sapply(1:nlines,function(x) lines[[x]][12]) + strand = as.factor(sapply(1:nlines,function(x) lines[[x]][13])) + forward_match = sapply(1:nlines,function(x) lines[[x]][14]) + forward_mismatch = as.integer(sapply(1:nlines,function(x) lines[[x]][15])) + forward_tm = as.double(sapply(1:nlines,function(x) lines[[x]][16])) + reverse_match = sapply(1:nlines,function(x) lines[[x]][17]) + reverse_mismatch = as.integer(sapply(1:nlines,function(x) lines[[x]][18])) + reverse_tm = as.double(sapply(1:nlines,function(x) lines[[x]][19])) + amplicon_length = as.integer(sapply(1:nlines,function(x) lines[[x]][20])) + sequence = sapply(1:nlines,function(x) lines[[x]][21]) + definition = sapply(1:nlines,function(x) lines[[x]][22]) + + eco = data.frame(AC,seq_length,taxid,rank, + species,species_name, + genus,genus_name, + family,family_name, + superkingdom,superkingdom_name, + strand, + forward_match,forward_mismatch,forward_tm, + reverse_match,reverse_mismatch,reverse_tm, + amplicon_length,sequence,definition + ) + + return(eco) +} + +ecopcr.frequencies = function(matches,group=NULL) { + compute = function(matches) { + w = as.matrix(do.call(rbind,strsplit(as.character(matches),''))) + d = dim(w) + w=factor(w,levels=c('A','C','G','T')) + dim(w)=d + w=t(w) + freq = mapply(function(x) table(w[x,]),1:d[2]) + freq = freq[c('A','C','G','T'),] + csum = colSums(freq) + freq = sweep(freq,2,csum,'/') + attr(freq,'count')=length(w) + return(freq) + } + if (is.null(group)) + return(compute(matches)) + else { + lmatches = aggregate(matches,by=list(group=as.factor(group)),as.character) + w = lmatches$x + names(w)=lmatches$group + lf = lapply(w,compute) + return(lf) + } +} + +#' @export +ecopcr.forward.frequencies = function(ecopcr,group=NULL) { + return(ecopcr.frequencies(ecopcr$forward_match,group)) +} + +#' @export +ecopcr.reverse.frequencies = function(ecopcr,group=NULL) { + return(ecopcr.frequencies(ecopcr$reverse_match,group)) +} + +#' @export +dna.shanon = function(freq,base=2) { + shanon = log(4)/log(base) - colSums(-freq *log(freq) / log(base),na.rm=TRUE) + return(sweep(freq,2,shanon,'*')) +} + + +ecopcr.shanon = function(matches,group=NULL,base=2) { + + if (is.null(group)) { + freq = ecopcr.frequencies(matches) + return(dna.shanon(freq)) + } + else { + lf = lapply(ecopcr.frequencies(matches,group),dna.shanon) + return(lf) + } +} + +#' @export +ecopcr.forward.shanon = function(ecopcr,group=NULL,base=2) { + return(ecopcr.shanon(ecopcr$forward_match,group,base)) +} + +#' @export +ecopcr.reverse.shanon = function(ecopcr,group=NULL,base=2) { + return(ecopcr.shanon(ecopcr$reverse_match,group,base)) +} diff --git a/R/emptydb.R b/R/emptydb.R new file mode 100644 index 0000000..77dd279 --- /dev/null +++ b/R/emptydb.R @@ -0,0 +1,18 @@ +#'@include ROBIBarcodes.R +#'@import XML +#' +NULL + +#' Creates a new empty metabarcode database. +#' +#' @export +metabarcodedb = function() { + emptyfile = paste(system.file("extdata", + package="ROBIBarcodes"), + 'empty.xml', + sep='/') + + empty = xmlParseDoc(emptyfile) + + return(empty) +} \ No newline at end of file diff --git a/R/logo.R b/R/logo.R new file mode 100644 index 0000000..94ae581 --- /dev/null +++ b/R/logo.R @@ -0,0 +1,488 @@ +#'@include ROBIBarcodes.R +NULL + +svg.A.path="m 53.417969,-13.28125 -29.394532,0 L 19.384766,0 0.48828125,0 27.490234,-72.900391 l 22.41211,0 L 76.904297,0 58.007812,0 53.417969,-13.28125 m -24.707032,-13.525391 19.970704,0 -9.960938,-29.003906 -10.009766,29.003906" +svg.C.path="M 66.992187,-4.0039062 C 63.541603,-2.2135395 59.944602,-0.86262935 56.201172,0.04882812 52.45763,0.9602855 48.551384,1.4160142 44.482422,1.4160156 32.340462,1.4160142 22.721331,-1.9693991 15.625,-8.7402344 8.5286373,-15.543604 4.9804638,-24.755835 4.9804687,-36.376953 4.9804638,-48.030551 8.5286373,-57.242781 15.625,-64.013672 c 7.096331,-6.803314 16.715462,-10.205004 28.857422,-10.205078 4.068962,7.4e-5 7.975208,0.455803 11.71875,1.367188 3.74343,0.91153 7.340431,2.26244 10.791015,4.052734 l 0,15.087891 c -3.483136,-2.376246 -6.917377,-4.117781 -10.302734,-5.22461 -3.38547,-1.106711 -6.949919,-1.660096 -10.693359,-1.660156 -6.705769,6e-5 -11.979201,2.148496 -15.820313,6.445312 -3.841172,4.296925 -5.761743,10.221398 -5.761719,17.773438 -2.4e-5,7.51956 1.920547,13.427757 5.761719,17.724609 3.841112,4.29689 9.114544,6.445325 15.820313,6.445313 3.74344,1.2e-5 7.307889,-0.553373 10.693359,-1.660157 3.385357,-1.106755 6.819598,-2.84829 10.302734,-5.224609 l 0,15.0878908" +svg.G.path="m 74.707031,-5.4199219 c -4.68757,2.278649 -9.554101,3.9876317 -14.599609,5.12695315 C 55.061794,0.84635332 49.853466,1.4160142 44.482422,1.4160156 32.340462,1.4160142 22.721331,-1.9693991 15.625,-8.7402344 8.5286373,-15.543604 4.9804638,-24.755835 4.9804687,-36.376953 c -4.9e-6,-11.751254 3.6132727,-20.996037 10.8398443,-27.734375 7.226539,-6.738211 17.122362,-10.107348 29.687499,-10.107422 4.850211,7.4e-5 9.488878,0.455803 13.916016,1.367188 4.459572,0.91153 8.658786,2.26244 12.597656,4.052734 l 0,15.087891 c -4.069078,-2.311142 -8.121808,-4.036401 -12.158203,-5.175782 -4.003962,-1.139263 -8.02414,-1.708924 -12.060547,-1.708984 -7.487019,6e-5 -13.265008,2.099668 -17.333984,6.298828 -4.036485,4.166717 -6.054712,10.140018 -6.054688,17.919922 -2.4e-5,7.714872 1.953099,13.671898 5.859375,17.871094 3.906216,4.199233 9.456341,6.29884 16.650391,6.298828 1.953076,1.2e-5 3.759715,-0.11392 5.419922,-0.341797 1.692654,-0.260404 3.206325,-0.651029 4.541016,-1.171875 l 0,-14.160156 -11.47461,0 0,-12.597657 29.296875,0 0,35.0585941" +svg.T.path="m 0.48828125,-72.900391 l 67.18749975,0 0,14.208985 -24.169922,0 0,58.691406 -18.798828,0 0,-58.691406 -24.21874975,0 0,-14.208985" +svg.R.path="m 35.888672,-40.576172 c 3.938762,4.1e-5 6.754515,-0.73238 8.447265,-2.197265 1.725215,-1.4648 2.587844,-3.873652 2.587891,-7.226563 -4.7e-5,-3.320259 -0.862676,-5.696559 -2.587891,-7.128906 -1.69275,-1.432233 -4.508503,-2.148378 -8.447265,-2.148438 l -7.910156,0 0,18.701172 7.910156,0 m -7.910156,12.988281 0,27.587891 -18.7988285,0 0,-72.900391 28.7109375,0 c 9.602817,7.3e-5 16.63406,1.6114 21.09375,4.833985 4.492124,3.222721 6.738216,8.317117 6.738281,15.283203 -6.5e-5,4.817756 -1.171939,8.77283 -3.515625,11.865234 -2.311258,3.092486 -5.810603,5.37113 -10.498047,6.835938 2.571561,0.585971 4.86648,1.920605 6.884766,4.003906 2.050721,2.050809 4.117776,5.175806 6.201172,9.375 L 75,0 54.980469,0 46.09375,-18.115234 c -1.790409,-3.645812 -3.613324,-6.136044 -5.46875,-7.470703 -1.822955,-1.334609 -4.264359,-2.001926 -7.324219,-2.001954 l -5.322265,0" +svg.Y.path="m -0.9765625,-72.900391 20.5566405,0 16.601563,25.976563 16.601562,-25.976563 20.605469,0 -27.783203,42.1875 0,30.712891 -18.798828,0 0,-30.712891 -27.7832035,-42.1875" +svg.M.path="m 9.1796875,-72.900391 23.9257815,0 16.601562,39.013672 16.699219,-39.013672 23.876953,0 0,72.900391 -17.773437,0 0,-53.320312 -16.796875,39.30664 -11.914063,0 -16.796875,-39.30664 0,53.320312 -17.8222655,0 0,-72.900391" +svg.K.path="m 9.1796875,-72.900391 18.7988285,0 0,26.611329 27.099609,-26.611329 21.826172,0 L 41.796875,-38.378906 80.517578,0 56.982422,0 27.978516,-28.710937 27.978516,0 9.1796875,0 l 0,-72.900391" +svg.W.path="m 2.9785156,-72.900391 18.0175784,0 12.597656,52.978516 12.5,-52.978516 18.115234,0 12.5,52.978516 12.597657,-52.978516 17.871089,0 L 89.990234,0 68.310547,0 55.078125,-55.419922 41.992187,0 20.3125,0 2.9785156,-72.900391" +svg.S.path="m 59.912109,-70.605469 0,15.429688 c -4.003962,-1.790308 -7.910208,-3.141218 -11.71875,-4.052735 -3.808638,-0.911398 -7.405639,-1.367127 -10.791015,-1.367187 -4.492221,6e-5 -7.81253,0.618549 -9.960938,1.855469 -2.148463,1.237036 -3.22268,3.157607 -3.222656,5.761718 -2.4e-5,1.953176 0.716121,3.483123 2.148437,4.589844 1.464816,1.074266 4.101533,2.002 7.910157,2.783203 l 8.007812,1.611328 c 8.105419,1.627647 13.867132,4.101603 17.285156,7.421875 3.417906,3.320346 5.126889,8.040393 5.126954,14.160157 -6.5e-5,8.040379 -2.392641,14.0299559 -7.177735,17.9687496 -4.752657,3.90625056 -12.02804,5.8593736 -21.826172,5.859375 C 31.070932,1.4160142 26.432265,0.97656152 21.777344,0.09765625 17.122379,-0.78124922 12.467435,-2.0833313 7.8125,-3.8085938 l 0,-15.8691402 c 4.654935,2.473975 9.147118,4.345718 13.476562,5.615234 4.361954,1.236992 8.561169,1.855481 12.597657,1.855469 4.101524,1.2e-5 7.242797,-0.683581 9.423828,-2.050782 2.180944,-1.367171 3.271438,-3.320294 3.271484,-5.859374 -4.6e-5,-2.278624 -0.748744,-4.036435 -2.246094,-5.273438 -1.464886,-1.236953 -4.410847,-2.343722 -8.83789,-3.320312 l -7.275391,-1.611329 c -7.291687,-1.562468 -12.630224,-4.0527 -16.015625,-7.470703 -3.3528732,-3.417927 -5.0293038,-8.024042 -5.0292966,-13.818359 -7.2e-6,-7.259056 2.3437405,-12.841733 7.0312496,-16.748047 4.687481,-3.906178 11.425756,-5.859301 20.214844,-5.859375 4.003868,7.4e-5 8.121702,0.309319 12.353516,0.927734 4.23172,0.586011 8.60997,1.481192 13.134765,2.685547" +svg.B.path="m 38.378906,-44.677734 c 2.962198,4.4e-5 5.20829,-0.650997 6.738281,-1.953125 1.529902,-1.302036 2.294875,-3.222607 2.294922,-5.761719 -4.7e-5,-2.506456 -0.76502,-4.410751 -2.294922,-5.712891 -1.529991,-1.334576 -3.776083,-2.001893 -6.738281,-2.001953 l -10.40039,0 0,15.429688 10.40039,0 m 0.634766,31.884765 c 3.775999,1.3e-5 6.608027,-0.797512 8.496094,-2.392578 1.920523,-1.595035 2.880809,-4.003887 2.880859,-7.226562 -5e-5,-3.157527 -0.94406,-5.517551 -2.832031,-7.080078 -1.888067,-1.595021 -4.736371,-2.392547 -8.544922,-2.392579 l -11.035156,0 0,19.091797 11.035156,0 m 17.480469,-26.220703 c 4.036397,1.171913 7.161394,3.336624 9.375,6.494141 2.213473,3.157581 3.320243,7.031275 3.320312,11.621094 -6.9e-5,7.031263 -2.376369,12.2721435 -7.128906,15.7226558 C 57.307885,-1.7252587 50.08133,0 40.380859,0 l -31.2011715,0 0,-72.900391 28.2226565,0 c 10.12365,7.3e-5 17.447862,1.53002 21.972656,4.589844 4.557228,3.059961 6.835871,7.959045 6.835937,14.697266 -6.6e-5,3.548227 -0.830143,6.575568 -2.490234,9.082031 -1.660218,2.474 -4.06907,4.313191 -7.226562,5.517578" +svg.D.path="m 27.978516,-58.691406 0,44.482422 6.738281,0 c 7.682249,1.4e-5 13.541618,-1.904281 17.578125,-5.712891 4.068954,-3.80857 6.103457,-9.342419 6.103515,-16.601562 -5.8e-5,-7.226519 -2.018285,-12.727816 -6.054687,-16.503907 -4.036507,-3.775985 -9.912152,-5.664004 -17.626953,-5.664062 l -6.738281,0 m -18.7988285,-14.208985 19.8242185,0 c 11.067669,7.3e-5 19.303337,0.797599 24.707031,2.392579 5.436139,1.562568 10.091083,4.231837 13.964844,8.007812 3.417898,3.28782 5.956958,7.080134 7.617188,11.376953 1.660079,4.296922 2.490156,9.163454 2.490234,14.59961 -7.8e-5,5.501333 -0.830155,10.416692 -2.490234,14.746093 -1.66023,4.296893 -4.19929,8.089207 -7.617188,11.376953 -3.906314,3.7760487 -8.593809,6.4615929 -14.0625,8.056641 C 48.144483,-0.78124922 39.941366,0 29.003906,0 l -19.8242185,0 0,-72.900391" +svg.H.path="m 9.1796875,-72.900391 18.7988285,0 0,27.783204 27.734375,0 0,-27.783204 18.798828,0 0,72.900391 -18.798828,0 0,-30.908203 -27.734375,0 0,30.908203 -18.7988285,0 0,-72.900391" +svg.V.path="m 0.48828125,-72.900391 18.89648475,0 19.335937,53.808594 19.287109,-53.808594 18.896485,0 L 49.902344,0 27.490234,0 0.48828125,-72.900391" +svg.N.path="m 9.1796875,-72.900391 20.9960935,0 26.513672,50 0,-50 17.822266,0 0,72.900391 -20.996094,0 -26.513672,-50 0,50 -17.8222655,0 0,-72.900391" + +svg.dash.path="m 5.4199219,-35.888672 l 30.6640621,0 0,14.208985 -30.6640621,0 0,-14.208985" + +#' Compute the cubic bezier function +#' +#' The \code{bezier3} function computes the point of the cubic bezier +#' curve linking the point P0 to P3 and using P1 and P2 as control points +#' +#' @param t the position on the curve estimated as a float between 0 the +#' starting point and 1 the ending point +#' +#' @param p0 a vector of numeric describing the coordinates of the p0 point, +#' the starting point of the curve. +#' +#' @param p1 a vector of numeric describing the coordinates of the p1 point, +#' the first control point. +#' +#' @param p2 a vector of numeric describing the coordinates of the p2 point, +#' the second control point. +#' +#' @param p3 a vector of numeric describing the coordinates of the p3 point, +#' the final point of the curve. +#' +#' @return a numric matrix containing the coordinates of the bezier curve/ +#' +#' @examples +#' +#' bezier3((1:10)/10,c(1,1),c(1,2),c(2,2),c(2,1)) +#' +#' @author Eric Coissac +#' @export +bezier3 = function(t,p0,p1,p2,p3) { + outer((1-t)^3,p0) + outer(t*(1-t)^2,3*p1) + outer(t^2*(1-t),3*p2) + outer(t^3,p3) +} + +lmin = function(l) min(sapply(l,min)) +lmax = function(l) max(sapply(l,max)) + +path.to.polygon = function(path,scalex=TRUE,scaley=TRUE) { + + x = strsplit(path," ")[[1]] + y = c() + for (c in x) { + if (length(grep(',',c))==0) + current=c + else { + y = c(y,current,c) + if (current=='m') + current='l' + if (current=='M') + current='L' + + } + } + + dim(y)=c(2,length(y)/2) + y=t(y) + operations = y[,1] + positions = do.call(rbind,strsplit(y[,2],",")) + positions = apply(positions,2,as.numeric) + + positions = data.frame(operations,x=positions[,1],y=positions[,2]) + + + relatives = positions$operations == tolower(positions$operations) + operations=toupper(operations) + + current.x=0 + current.y=0 + + n = dim(positions)[1] + + absolute.x=c() + absolute.y=c() + + remains=0 + + for (i in 1:n) { + + if (remains==0) { + if (operations[i]=='C') + remains=3 + else + remains=1 + } + if (relatives[i]) { + new.x = current.x + positions$x[i] + new.y = current.y + positions$y[i] + } + else { + new.x = positions$x[i] + new.y = positions$y[i] + } + + absolute.x = c(absolute.x,new.x) + absolute.y = c(absolute.y,new.y) + + remains=remains-1 + + if (remains==0) { + current.x=new.x + current.y=new.y + } + + } + + c = (1:length(operations))[operations=='C'] + + if (length(c)>0){ + p0=c[0:(length(c)/3-1)*3+1] + operations[p0+1]="X" + operations[p0+2]="X" + } + + allpath.x=list() + allpath.y=list() + path.x = c() + path.y = c() + for (i in 1:length(operations)) { + if (operations[i]=='M' & length(path.x)>0) { + allpath.x[[length(allpath.x)+1]]=path.x + allpath.y[[length(allpath.y)+1]]=path.y + path.x = c() + path.y = c() + } + + + if (operations[i]=='M' | operations[i]=='L') { + path.x = append(path.x,absolute.x[i]) + path.y = append(path.y,absolute.y[i]) + } + if (operations[i]=='C') { + b = bezier3((0:10)/10,c(absolute.x[i-1],absolute.y[i-1]), + c(absolute.x[i],absolute.y[i]), + c(absolute.x[i+1],absolute.y[i+1]), + c(absolute.x[i+2],absolute.y[i+2])) + path.x = c(path.x,b[-1,1]) + path.y = c(path.y,b[-1,2]) + } + } + + allpath.x[[length(allpath.x)+1]]=path.x + allpath.y[[length(allpath.y)+1]]=path.y + + allpath.y=lapply(allpath.y,"-") + + + if (scalex) { + xmin = lmin(allpath.x) + sx=lmax(allpath.x)-xmin + allpath.x=lapply(allpath.x,function(x) (x-xmin)/sx) + } + else + allpath.x=lapply(allpath.x,function(x) x/100) + + if (scaley) { + ymin = lmin(allpath.y) + sy=lmax(allpath.y)-ymin + allpath.y=lapply(allpath.y,function(x) (x-ymin)/sy) + } + else + allpath.y=lapply(allpath.y,function(x) x/100) + + o = order(-sapply(allpath.x,length)) + + return(list(x=allpath.x[o],y=allpath.y[o])) +} + +#' Draw an empy plot without axis +#' +#' The \code{whitepaper} function open a new plot of the given size where +#' you can add graphical elements. Coordinates on this plot range from +#' 0 to \code{width} and 0 to \code{height}. +#' +#' @param width a numeric value indicating the plot width +#' +#' @param height a numeric value indicating the plot height +#' +#' @examples +#' +#' # open a new empty plot +#' whitepaper(20,10) +#' +#' # add two point on this plot +#' points(c(10,15),c(3,8)) +#' +#' @author Eric Coissac +#' +#' @export +whitepaper= function(width,height,xmin=0,ymin=0,asp=NA) { + plot(c(xmin,xmin+width),c(ymin,ymin+height), + xlab="", + ylab="",xaxt="n",yaxt="n",type="n",asp=asp) +} + + +# +# We just prepare the polygon coordinates for all the 16 DNA letters +# + +letter.polygons = list(A=path.to.polygon(svg.A.path), + C=path.to.polygon(svg.C.path), + G=path.to.polygon(svg.G.path), + T=path.to.polygon(svg.T.path), + R=path.to.polygon(svg.R.path), + Y=path.to.polygon(svg.Y.path), + M=path.to.polygon(svg.M.path), + K=path.to.polygon(svg.K.path), + W=path.to.polygon(svg.W.path), + S=path.to.polygon(svg.S.path), + B=path.to.polygon(svg.B.path), + D=path.to.polygon(svg.D.path), + H=path.to.polygon(svg.H.path), + V=path.to.polygon(svg.V.path), + N=path.to.polygon(svg.N.path), + dash=path.to.polygon(svg.dash.path,scaley=FALSE) + ) + + +#' Draw a single DNA letter on a plot +#' +#' The function \code{plotDNAletter} draws a single DNA letter on an existing +#' plot. The alphabet is restricted to the IUPAC DNA characters plus the dash +#' '-' allowing to indicate gaps. +#' +#' @param x an value indicating the x coordinate for locating the letter +#' on the plot. +#' +#' @param y an value indicating the y coordinate for locating the letter +#' on the plot. +#' +#' @param cex the X character expension factor. By default a letter width is of +#' one unit in the user coordinate system. +#' +#' @param cey the Y character expension factor. By default a letter height is of +#' one unit in the user coordinate system. +#' +#' @param col the color used to fill the letter. +#' +#' @param background the background color of the letter. +#' +#' @param border the color of the border of the letter. +#' +#' @examples +#' +#' # open an empty plot +#' whitepaper(10,10) +#' +#' # plot some DNA letters +#' plotDNAletter(5,5,'A',col='green') +#' plotDNAletter(7,6,'C',cex=2,cey=1.5,col='blue') +#' plotDNAletter(2,3,'-') +#' plotDNAletter(2,7,'A',col='green',background="yellow",border="black") +#' +#' @seealso \code{\link{whitepaper}} +#' @author Eric Coissac +#' @export +plotDNAletter = function(x,y,c,cex=1,cey=1,col="black",background="white",border=col) { + if (cex > 0 & cey > 0){ + if (c=="-") + p=letter.polygons[['dash']] + else + p=letter.polygons[[c]] + + px = lapply(p$x,function(a) a*cex+x) + py = lapply(p$y,function(a) a*cey+y) + color=c(col,rep(background,length(px)-1)) + border=c(border,rep(background,length(px)-1)) + polygon(c(x,x,x+cex,x+cex),c(y,y+cey,y+cey,y),col=background,border=background) + mapply(polygon,px,py,col=color,border=border) + } +} + +#' Draw a DNA logo on a graph +#' +#' The function \code{dnalogo} draws a DNA logo on an already existing plot. +#' +#' @param data a matrix where each line represents a symbol and each column +#' represents a position. The values stored in the matrice indicate +#' the relative weight of a symbol at the considered position. +#' +#' @param x an value indicating the x coordinate for locating the logo +#' on the plot. +#' +#' @param y an value indicating the y coordinate for locating the logo +#' on the plot. +#' +#' @param width a value indicating the total width of the logo +#' +#' @param height a value indicating the total height of the logo +#' +#' @param col a named character vector (e.g \code{(A="purple",T="yellow")}) +#' or a matrix of the same size than data indicating the color +#' for each letter. +#' +#' @param cex a float between 0 and 1 indicating the relative width +#' of a letter column. +#' +#' +#' @examples +#' # Load the sample ecoPCR data file +#' data(GH.ecopcr) +#' +#' # create a blank plot +#' whitepaper(25,10) +#' +#' # computes the logo shape with the shanon formula +#' G.shanon = ecopcr.forward.shanon(GH.ecopcr) +#' +#' # plot the logo +#' dnalogo(G.shanon,2,6,width=20,height=2) +#' +#' # computes the logo shape with the shanon formula +#' # by grouping matches according to their mismatches +#' G.shanon.error = ecopcr.forward.shanon(GH.ecopcr, +#' group=GH.ecopcr$forward_mismatch>=1) +#' +#' # Display the structure +#' G.shanon.error +#' +#' # Plot the logo corresponding only to matches with errors +#' dnalogo(G.shanon.error$'TRUE',2,3,width=20,height=2) +#' +#' @seealso \code{\link{dnalogoplot}} +#' @author Eric Coissac +#' @keywords metabarcodes +#' +#' @export +dnalogo = function(data,x=0,y=0,width=NULL,height=NULL,col=NULL,cex=0.8) +{ + computey = function(p) { + o = draworder[,p] + x = c(0,cumsum(data[o,p])[2:length(o) - 1]) + names(x)=letters[o] + return(x[letters]) + } + + ddata = dim(data) + ncol = ddata[2] + nrow = ddata[1] + letters = row.names(data) + + + if (is.character(col) | is.null(col)) { + dnacol = c(A='green',C='blue',G='orange',T='red') + name.color = names(col) + dnacol[name.color]=col + dnacol=dnacol[letters] + dnacol=sapply(dnacol,function(x) do.call(rgb,as.list(col2rgb(x)/255))) + dnacol=matrix(rep(dnacol,ncol),nrow=nrow) + } + + draworder = apply(data,2,order) + ypos = sapply(1:ncol,computey) + xpos = matrix(rep(1:ncol,rep(nrow,ncol)),nrow=4) - 0.5 + + + if (! is.null(width)) { + actualwidth = ncol + 1 + xpos = xpos / actualwidth * width + cex = cex / actualwidth * width + } + + if (! is.null(height)) { + actualheight= max(colSums(data)) + ypos = ypos / actualheight * height + data = data / actualheight * height + } + + if (! is.null(x)) + xpos = xpos + x + + if (! is.null(y)) + ypos = ypos + y + + hide = mapply(plotDNAletter, + as.vector(xpos),as.vector(ypos), + rep(letters,ncol), + cex,as.vector(data), + as.vector(dnacol)) +} + +#' Plot a DNA logo +#' +#' The function \code{dnalogoplot} draws a DNA logo. +#' +#' @param data a matrix where each line represents a symbol and each column +#' represents a position. The values stored in the matrice indicate +#' the relative weight of a symbol at the considered position. +#' +#' @param col a named character vector (e.g \code{(A="purple",T="yellow")}) +#' or a matrix of the same size than data indicating the color +#' for each letter. +#' +#' @param primer the primer sequence. THe letters will be used to label the +#' X axis. +#' +#' @param xlab X axis label using font and character expansion +#' par("font.lab") and color par("col.lab") +#' +#' @param ylab Y axis label, same font attributes as xlab. +#' +#' @param main The main title (on top) using font and size (character expansion) +#' \code{par("font.main")} and color \code{par("col.main")}. +#' +#' @param sub Sub-title (at bottom) using font and size \code{par("font.sub")} +#' and color \code{par("col.sub")}. +#' +#' @param line specifying a value for line overrides the default placement of +#' labels, and places them this many lines outwards +#' from the plot edge. +#' +#' @param outer a logical value. If \code{TRUE}, the titles are placed in the outer +#' margins of the plot. +#' +#' @param cex a float between 0 and 1 indicating the relative width +#' of a letter column. +#' +#' @param cex.primer a float between 0 and 1 indicating the size +#' of the primer axis. +#' +#' @examples +#' # Load the sample ecoPCR data file +#' data(GH.ecopcr) +#' +#' # computes the logo shape with the shanon formula +#' G.shanon = ecopcr.forward.shanon(GH.ecopcr) +#' +#' par(mfrow=c(2,1)) +#' +#' # plot the logo +#' dnalogoplot(G.shanon,primer="GGGCAATCCTGAGCCAA", +#' xlab="Primer H",ylab='bits', +#' main="Primer conservation") +#' +#' # computes the logo shape with the shanon formula +#' # by grouping matches according to their mismatches +#' G.shanon.error = ecopcr.forward.shanon(GH.ecopcr, +#' group=GH.ecopcr$forward_mismatch>=1) +#' +#' # Display the structure +#' G.shanon.error +#' +#' # Plot the logo corresponding only to matches with errors +#' dnalogoplot(G.shanon.error$'TRUE',ylab='bits') +#' +#' @seealso \code{\link{dnalogo}} +#' @author Eric Coissac +#' @keywords metabarcodes +#' +#' @export +dnalogoplot = function(data,col=NULL,primer=NULL,cex=0.8,cex.lab=1.0,xlab=NULL,ylab=NULL,main=NULL,sub=NULL,line=NA,outer=FALSE) { + ddata = dim(data) + ncol = ddata[2] + nrow = ddata[1] + actualwidth = ncol + 1 + actualheight= max(colSums(data)) + + whitepaper(actualwidth,actualheight) + if (is.null(primer)) + labels= TRUE + else + labels = strsplit(primer,"")[[1]] + axis(1,at=1:ncol,labels=labels,cex.axis=cex.lab) + axis(2) + title(main=main,sub=sub,xlab=xlab,ylab=ylab,line=line,outer=outer) + dnalogo(data,col=col,cex=cex) +} + + diff --git a/R/mismatchplot.R b/R/mismatchplot.R new file mode 100644 index 0000000..9477c4a --- /dev/null +++ b/R/mismatchplot.R @@ -0,0 +1,106 @@ +#'@include ROBIBarcodes.R +#'@include logo.R +NULL + +#' Draw a scatter plot of the reverse mismatches as a function of forward mismatches. +#' +#' The \code{mismatchplot} function draws a scatter plot of the number of mismatches +#' observed in an ecoPCR result for the reverse primer as a function of the mismatches +#' for the reverse primer. Each point for a pair (forward_mismatch,reverse_mismatch) is +#' drawn as a circle having a surface proportional to the aboundance of this pair in the +#' results. If a grouping factor is specified, then the circle is replaced by a pie chart. +#' +#' @param ecopcr an ecoPCR result data.frame as returned by the \code{\link{read.ecopcr.result}} +#' function. +#' +#' @param group a factor decribing classes amongst the amplicon described in the ecoPCR +#' result +#' +#' @param col a vector describing the colored used for the bubble or the pie charts +#' +#' @param legend a character vector describing the legend for each modality of the +#' grouping factor. By default the factor levels are used for the legend +#' +#' @param legend.cex the expension factor for the legend text +#' +#' @param inset the distance to the margin of the legend box (see the \code{\link{legend}} +#' documentation) +#' +#' @examples +#' +#' # Load the ROBITools library +#' library(ROBITools) +#' +#' # Load the default taxonomy +#' taxo = default.taxonomy() +#' +#' # Load the sample ecoPCR data file +#' data(GH.ecopcr) +#' +#' # Computes classes associated to each taxid +#' orders = as.factor(taxonatrank(taxo,GH.ecopcr$taxid,'order',name=T)) +#' +#' # Plot the graph +#' mismatchplot(GH.ecopcr,group=orders) +#' +#' @seealso \code{\link{read.ecopcr.result}} +#' @author Eric Coissac +#' @export +mismatchplot = function(ecopcr,group=NULL, + col=NULL,legend=NULL, + legend.cex=0.7,inset=c(0.02,0.02)) { + + maxforward_error = max(ecopcr$forward_mismatch) + maxreverse_error = max(ecopcr$reverse_mismatch) + maxerror=max(maxforward_error,maxreverse_error) + + if (is.null(group)) + group=factor(rep("all",dim(ecopcr)[1])) + else + group=as.factor(group) + + if (is.null(legend)) + legend = levels(group) + + actualheight= maxerror + 1 + actualwidth = maxerror + 1 + + if (length(levels(group)) > 1) + actualwidth = actualwidth + 2 + + whitepaper(actualwidth,actualheight,xmin=-0.5,ymin=-0.5,asp=1) + + axis(1,at=0:maxerror, + labels=0:maxerror) + + axis(2,at=0:maxerror, + labels=0:maxerror) + + + data = aggregate(group,by=list(forward=ecopcr$forward_mismatch, + reverse=ecopcr$reverse_mismatch), + table) + + data <- data[rowSums(data[,c(-1,-2),drop=FALSE])>0, , drop=FALSE] + + if (is.null(col)) + col <- c("white", "lightblue", "mistyrose", "lightcyan", + "lavender", "cornsilk") + + + value=data[,c(-1,-2),drop=FALSE] + x = as.integer(data[,1]) + y = as.integer(data[,2]) + diam = sqrt(rowSums(value)) + radius = diam / max(diam) / 2 + + hide = mapply(pie.xy,x,y, + data=lapply(1:(dim(value)[1]),function(y) value[y,]), + radius=radius, + label="",MoreArgs=list(col=col)) + + + if (length(levels(group)) > 1) + legend('topright',legend=legend,fill=col, cex=legend.cex, inset=inset) + +} \ No newline at end of file diff --git a/R/piexy.R b/R/piexy.R new file mode 100644 index 0000000..ea423e5 --- /dev/null +++ b/R/piexy.R @@ -0,0 +1,68 @@ +#'@include ROBIBarcodes.R +NULL + +#' @export +pie.xy = function (x,y,data, labels = names(x), edges = 200, radius = 0.8, clockwise = FALSE, + init.angle = if (clockwise) 90 else 0, density = NULL, angle = 45, + col = NULL, border = NULL, lty = NULL, ...) +{ + if (!is.numeric(data) || any(is.na(data) | data < 0)) + stop("'data' values must be positive.") + if (is.null(labels)) + labels <- as.character(seq_along(data)) + else labels <- as.graphicsAnnot(labels) + data <- c(0, cumsum(data)/sum(data)) + dx <- diff(data) + nx <- length(dx) +# plot.new() +# pin <- par("pin") + xlim <- ylim <- c(-1, 1) +# if (pin[1L] > pin[2L]) +# xlim <- (pin[1L]/pin[2L]) * xlim +# else ylim <- (pin[2L]/pin[1L]) * ylim +# dev.hold() +# on.exit(dev.flush()) +# plot.window(xlim, ylim, "", asp = 1) + if (is.null(col)) + col <- if (is.null(density)) + c("white", "lightblue", "mistyrose", "lightcyan", + "lavender", "cornsilk") + else par("fg") + if (!is.null(col)) + col <- rep_len(col, nx) + if (!is.null(border)) + border <- rep_len(border, nx) + if (!is.null(lty)) + lty <- rep_len(lty, nx) + angle <- rep(angle, nx) + if (!is.null(density)) + density <- rep_len(density, nx) + twopi <- if (clockwise) + -2 * pi + else 2 * pi + t2xy <- function(t) { + t2p <- twopi * t + init.angle * pi/180 + list(x = radius * cos(t2p) , y = radius * sin(t2p)) + } + for (i in 1L:nx) { + n <- max(2, floor(edges * dx[i])) + P <- t2xy(seq.int(data[i], data[i + 1], length.out = n)) + if (nx>1) + polygon(c(P$x + x, x), c(P$y + y , y), + density = density[i], angle = angle[i], + border = border[i], col = col[i], lty = lty[i]) + else + polygon(P$x + x, P$y + y, + density = density[i], angle = angle[i], + border = border[i], col = col[i], lty = lty[i]) + P <- t2xy(mean(data[i + 0:1])) + lab <- as.character(labels[i]) + if (!is.na(lab) && nzchar(lab)) { + lines(c(1 , 1.05) * P$x + x, c(1, 1.05) * P$y + y) + text(1.1 * P$x + x , 1.1 * P$y + y, labels[i], xpd = TRUE, + adj = ifelse(P$x < 0, 1, 0), ...) + } + } +# title(main = main, ...) + invisible(NULL) +} \ No newline at end of file diff --git a/R/primer_table.R b/R/primer_table.R new file mode 100644 index 0000000..f2c7acc --- /dev/null +++ b/R/primer_table.R @@ -0,0 +1,57 @@ +#Commentaires lus par roxygen +#'@include ROBIBarcodes.R +#'@import XML +#' +NULL + +#NULL termine le commentaire pour roxygen + +extractPrimers <-function(primer){ + + id=xmlAttrs(primer)["ID"] + name=xmlValue(xmlChildren(xmlChildren(primer)$name)$text) + sequence=xmlValue(xmlChildren(xmlChildren(primer)$sequence)$text) + coding=as.logical(xmlValue(xmlChildren(xmlChildren(primer)$coding)$text)) + + p=list(id=id, name=name, sequence=sequence, coding=coding) + + return(p) +} + +#Export pour rendre publique la fonction + +#' Builds primer data frame from metabarcodedb +#' +#' The \code{primers.data.frame} function extracts all the primer information +#' from the \code{metabarcodedb} database. +#' +#' @param barcodedb a xml document containing a metabarcodedb. +#' +#' @return a \code{data.frame} describing primers. +#' +#' @examples +#' # load the XML library +#' library(XML) +#' +#' # load the example metabarcodedb database +#' db = xmlParseDoc(system.file("extdata/barcodedb.xml", package="ROBIBarcodes")) +#' +#' # extracts the primer table +#' primers.data.frame(db) +#' +#' @author Aurelie Bonin +#' @keywords metabarcodes +#' +#' @export +primers.data.frame <-function(barcodedb){ + p=getNodeSet(db, + path="/obi:obimetabarcodedb/obi:primers/obi:primer" , + namespaces=c(obi="http://metabarcoding.org/OBIMetabarcodes")) + + primerTable=as.data.frame(do.call(rbind,lapply(p,extractPrimers))) + + rownames(primerTable)=primerTable$id + primerTable=primerTable[,-1] + + return(primerTable) +} diff --git a/R/primers.R b/R/primers.R new file mode 100644 index 0000000..2a0b431 --- /dev/null +++ b/R/primers.R @@ -0,0 +1,18 @@ +#'@include xmlMods.R +#'@import XML +#' +NULL + + +add.primer.barcodedb = function(barcodedb, + name, + sequence, + coding, + documentation) { +' + Xxx + CGATCGATGCTAGCTAGCTGAT + false + ' + +} \ No newline at end of file diff --git a/R/resolution.R b/R/resolution.R new file mode 100644 index 0000000..ebc606d --- /dev/null +++ b/R/resolution.R @@ -0,0 +1,15 @@ +#'@import ROBITaxonomy +#'@include ROBIBarcodes.R +NULL + + +#'@export +resolution = function(taxonomy,ecopcr) { + l = aggregate(ecopcr$taxid, + by=list(barcode=ecopcr$sequence), + function(x) lowest.common.ancestor(taxo,x)) + r = taxonomicrank(taxo,l$x) + names(r)=as.character(l$barcode) + + return(r[as.character(ecopcr$sequence)]) +} \ No newline at end of file diff --git a/R/taxonomy.R b/R/taxonomy.R new file mode 100644 index 0000000..9c99e94 --- /dev/null +++ b/R/taxonomy.R @@ -0,0 +1,81 @@ +#'@include xmlMods.R +#'@import XML +#'@import ROBITaxonomy +#'@include ROBIBarcodes.R +NULL + + +taxon.data.frame = function(taxonomy,taxids,strict=TRUE,known.taxid=c()) { + taxids = as.integer(sub("TX.","",as.character(taxids))) + good.taxid = validate(taxonomy,taxids) + + if (strict & any(is.na(good.taxid))) + stop(sprintf("The following taxids are absent from the taxonomy : %s", + toString(taxids[is.na(good.taxid)]))) + + good.taxid = good.taxid[! is.na(good.taxid)] + all.path = path(taxonomy,good.taxid) + all.taxid = Reduce(union,all.path) + all.taxid = sort(union(all.taxid,known.taxid))[-1] + all.parent = sprintf("TX.%d",parent(taxonomy,all.taxid)) + all.rank = taxonomicrank(taxonomy,all.taxid) + all.scientificname = scientificname(taxonomy,all.taxid) + + all.id = sprintf("TX.%d",all.taxid) + + rep = data.frame(taxid=all.id, + name=all.scientificname, + rank=all.rank, + partof=all.parent, + stringsAsFactors=FALSE) + + return(rep) +} + +build.taxon.node = function(taxid,name,rank,partof) { + nodes = lapply(sprintf('\n%s%s%s', + taxid, + name, + rank, + partof), + xmlParseString) + + + return(nodes) + +} + +#'@export +add.taxon.barcodedb = function(barcodedb, + taxonomy, + taxids) { + + taxonomy.node = getNodeSet(barcodedb, + path='/obi:obimetabarcodedb/obi:taxonomy', + c(obi="http://metabarcoding.org/OBIMetabarcodes"))[[1]] + + known.taxid = as.character( + getNodeSet( + taxonomy.node, + path="./obi:taxon/@ID", + c(obi="http://metabarcoding.org/OBIMetabarcodes"))) + + known.taxid = as.integer(sub("TX.","",known.taxid)) + + taxon = taxon.data.frame(taxonomy,taxids,strict=TRUE,known.taxid) + + taxon.nodes = c(xmlChildren(taxonomy.node)$root, + build.taxon.node(taxon$taxid, + taxon$name, + taxon$rank, + taxon$partof)) + spare = sparexmltree() + new.taxonomy.node = getNodeSet(spare, + path='/obi:obimetabarcodedb/obi:taxonomy', + c(obi="http://metabarcoding.org/OBIMetabarcodes"))[[1]] + + replaceNodes(taxonomy.node,new.taxonomy.node) + + hidden = addChildren(new.taxonomy.node,kids=taxon.nodes,append=FALSE) +} + diff --git a/R/taxonomy_table.R b/R/taxonomy_table.R new file mode 100644 index 0000000..0fac141 --- /dev/null +++ b/R/taxonomy_table.R @@ -0,0 +1,97 @@ +#' @include ROBIBarcodes.R +#' @import ROBITaxonomy +#' @import XML +#' @useDynLib ROBIBarcodes +#' +NULL + + +extractTaxa <-function(taxon){ + + id=xmlAttrs(taxon)["ID"] + name=xmlValue(xmlChildren(xmlChildren(taxon)$name)$text) + rank=xmlValue(xmlChildren(xmlChildren(taxon)$rank)$text) + partof=xmlValue(xmlChildren(xmlChildren(taxon)$partof)$text) + + p=list(id=id, name=name, rank=rank, partof=partof) + + return(p) +} + +#' Builds taxa data frame from metabarcodedb +#' +#' The \code{taxonomy.data.frame} function extracts all the taxon information +#' from the \code{metabarcodedb} database. +#' +#' @param barcodedb a xml document containing a metabarcodedb. +#' +#' @return a \code{data.frame} describing taxa. +#' +#' @examples +#' # load the XML library +#' library(XML) +#' +#' # load the example metabarcodedb database +#' db = xmlParseDoc(system.file("extdata/barcodedb.xml", package="ROBIBarcodes")) +#' +#' # extracts the taxonomy table +#' taxonomy.data.frame(db) +#' +#' @author Eric Coissac +#' @keywords metabarcodes +#' +#' @export +taxonomy.data.frame = function(barcodedb) { + p=getNodeSet(db, + path="/obi:obimetabarcodedb/obi:taxonomy/obi:taxon" , + namespaces=c(obi="http://metabarcoding.org/OBIMetabarcodes")) + + taxonomyTable=as.data.frame(do.call(rbind,lapply(p,extractTaxa))) + + + rownames(taxonomyTable)=unlist(taxonomyTable$id) + taxonomyTable=taxonomyTable[,-1] + + taxonomyTable$name=unlist(taxonomyTable$name) + taxonomyTable$rank=unlist(taxonomyTable$rank) + taxonomyTable$partof=unlist(taxonomyTable$partof) + + return(taxonomyTable) + +} + +#' Builds a \code{taxonomy.obitools} from a metabarcodedb +#' +#' The \code{metabarcodedb.taxonomy} function extracts all the taxon information +#' from the \code{metabarcodedb} database and create a \code{taxonomy.obitools} +#' instance with them. +#' +#' @param barcodedb a xml document containing a metabarcodedb. +#' +#' @return a \code{taxonomy.obitools} instance. +#' +#' @examples +#' # load the XML library +#' library(XML) +#' +#' # load the example metabarcodedb database +#' db = xmlParseDoc(system.file("extdata/barcodedb.xml", package="ROBIBarcodes")) +#' +#' # extracts the taxonomy table +#' barcodetaxo = metabarcodedb.taxonomy(db) +#' +#' # Look for the Verbrata taxid +#' ecofind(barcodetaxo,"vertebrata") +#' +#' @author Eric Coissac +#' @keywords metabarcodes +#' +#' @export +metabarcodedb.taxonomy = function(barcodedb) { + + table = taxonomy.data.frame(barcodedb) + + t <- .Call('R_buildbarcodetaxo',table,TRUE,PACKAGE="ROBIBarcodes") + + return(ROBITools:::build.taxonomy.obitools(t,"barcodedb",getwd(),FALSE)) +} diff --git a/R/xmlMods.R b/R/xmlMods.R new file mode 100644 index 0000000..59ceab5 --- /dev/null +++ b/R/xmlMods.R @@ -0,0 +1,88 @@ +#'@include ROBIBarcodes.R +#'@import XML +#' +NULL + + +# +# Checks that bibutils is installed on the system +# +hasBibUtils = !system("bib2xml -h", + intern=FALSE, + ignore.stdout = TRUE, + ignore.stderr = TRUE) + +if (!hasBibUtils) { + message("\n============================================================\n", + "Bibutils are not installed on your system\n", + "or is not correctly setup\n", + "Consider to visit: http://sourceforge.net/projects/bibutils/\n", + "============================================================\n") +} + + +#'Qualify the elements of the `mods` elements with the \code{mods:} namespace. +#' +#' The \code{bibutils} programs generate XML file not qualified by a schema. +#' To respect the OBIBarcodes schema the mods elements must be qualified. +#' This function take a XML document produce by a \code{bibutils} program and +#' add the \code{mods:} namespace. +#' +#' @note This function modifies the document past in argument and returns +#' nothing +#' +#' @note This is an internal function and consequently has not to be called by +#' end users. +#' +#' @param modsdoc a XMLInternalDocument instance corresponding to a +#' modsCollectionDefinition element. +#' +#' @export +addmodsnamespace = function(modsdoc) { + + root = xmlRoot(modsdoc) + xmlNamespaces(root,set=TRUE)=c(mods="http://www.loc.gov/mods/v3") + hiden=xpathApply(modsdoc, + path='/.//*', + fun= function(n) xmlNamespace(n,set=TRUE)="mods") + +} + +patchmodsID = function(modsdoc) { + hiden= xpathApply(modsdoc,'/.//*[attribute::ID]', + function(n) xmlAttrs(n)=list(ID=paste('BI.', + toupper(gsub('[^A-Za-z0-9_]', + '_', + xmlAttrs(n)['ID'] + ) + ), + sep="" + )) + ) +} + +#'@export +bib2mods = function(bibfile,bibutils='bib2xml') { + + tmp=tempfile() + xmlerr=system(paste(bibutils,bibfile,'>',tmp,sep=' '), + intern=FALSE, + ignore.stderr=TRUE) + + if (xmlerr!=0) + stop(paste("Cannot run ",bibutils)) + xml = paste(tmp,collapse='\n') + xml = xmlParseDoc(tmp,asText=FALSE) + file.remove(tmp) + addmodsnamespace(xml) + patchmodsID(xml) + mods=getNodeSet(xml,'/.//mods:mods[attribute::ID]') + + return(mods) +} + +if (!hasBibUtils) { + bib2mods = function(bibfile,bibutils) { + stop("Bibutils not install visit: http://sourceforge.net/projects/bibutils/") + } +} \ No newline at end of file diff --git a/R/xmlspare.R b/R/xmlspare.R new file mode 100644 index 0000000..6597566 --- /dev/null +++ b/R/xmlspare.R @@ -0,0 +1,23 @@ +#'@include ROBIBarcodes.R +#'@import XML +#' +NULL + +.__spare_tree__ = NULL + +sparexmltree = function() { + + if (is.null(get(".__spare_tree__",envir = environment()))) { + + sparefile = paste(system.file("extdata", + package="ROBIBarcodes"), + 'spare.xml', + sep='/') + + spare = xmlParseDoc(sparefile) + + assign(".__spare_tree__",spare, envir=globalenv()) + } + + return(get(".__spare_tree__",envir = globalenv())) +} diff --git a/ROBIBarcodes.Rproj b/ROBIBarcodes.Rproj new file mode 100644 index 0000000..3e48f41 --- /dev/null +++ b/ROBIBarcodes.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: LATIN1 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageInstallArgs: --no-multiarch +PackageRoxygenize: rd,collate,namespace diff --git a/data/GH.ecopcr.rda b/data/GH.ecopcr.rda new file mode 100644 index 0000000..472386c Binary files /dev/null and b/data/GH.ecopcr.rda differ diff --git a/inst/extdata/OBIMetabarcodes.xsd b/inst/extdata/OBIMetabarcodes.xsd new file mode 100644 index 0000000..8d4d64a --- /dev/null +++ b/inst/extdata/OBIMetabarcodes.xsd @@ -0,0 +1,565 @@ + + + + + + + + + + + + + + + + + + + The documentation_t type is a string of characters allowing to document an instance in the database. + + + + + + + + + + The organelle_t type allows to specify a value + indicating on which organelle is located another element + + + + + + + Indicates that the marker corresponds to a locus + belonging the nuclear genome + + + + + + + Indicates that the marker corresponds to a locus + belonging the chloroplastic genome + + + + + + + Indicates that the marker corresponds to a locus + belonging the mitochondrial genome + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The iupc_t type is a string of characters symbolizing + nucleotides in the iupac system, and thus belonging to + the folowing list of letters: + A,C,G,T,U,R,Y,M,K,W,S,B,D,H,V,N. The type also contrains + the sequence to be written in upper cases. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The primers_t type gathers several (usually two) primer_t elements + + + + + + + + + + + This type describes a PCR primer. + It has a mandatory attribut "name" containing a unique identifier. + Primer is described by ... + The primer_t type has an attribute name specifying its name, + and includes a sequence element specifying its nucleotide sequence. + + + + + + + + + The nucleic sequence of the primer. The primer + length must be greater than 15bp + + + + + + + + + + + The name of the primer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/inst/extdata/barcodedb.xml b/inst/extdata/barcodedb.xml new file mode 100644 index 0000000..e397c27 --- /dev/null +++ b/inst/extdata/barcodedb.xml @@ -0,0 +1,730 @@ + + + + + + mRNA + + + Mitochondrial mRNA + LO.RRNA + + + Internal Transcribed Spacer (ITS) + + + 16S rRNA + LO.MRRNA + mitochondria + + + 12S rRNA + LO.MRRNA + mitochondria + + + P6-Loop trnL + chloroplast + + + ITS1 + LO.ITS + nucleus + + + + + + Root + no rank + EukaryotasuperkingdomTX.131567Embryophytano rankTX.131221Bryophytano rankTX.3193FungikingdomTX.33154Eumetazoano rankTX.33208AnnelidaphylumTX.193545OligochaetasubclassTX.42113HaplotaxidaorderTX.6381TubificinasuborderTX.6382EnchytraeidaefamilyTX.6383ArthropodaphylumTX.88770HexapodasuperclassTX.197562ColeopteraorderTX.33392Pterygotano rankTX.85512ChordataphylumTX.33511Vertebratano rankTX.89593GnathostomatasuperclassTX.7742Sarcopterygiino rankTX.117571Sauropsidano rankTX.32524Archosauriano rankTX.32561AvesclassTX.436492Tetrapodano rankTX.8287Amniotano rankTX.32523Sauriano rankTX.8457ViridiplantaekingdomTX.2759Opisthokontano rankTX.2759MetazoakingdomTX.33154Bilateriano rankTX.6072Coelomatano rankTX.33213Protostomiano rankTX.33316NeopterasubclassTX.7496EndopterygotainfraclassTX.33340Deuterostomiano rankTX.33316StreptophytaphylumTX.33090Clitellatano rankTX.6340InsectaclassTX.6960Dicondyliano rankTX.50557Panarthropodano rankTX.33317CraniatasubphylumTX.7711Teleostomino rankTX.7776Euteleostomino rankTX.117570Streptophytinano rankTX.35493cellular organismsno rankTX.1Annelida/Echiura/Pogonophora groupno rankTX.33317Pancrustaceano rankTX.197563Mandibulatano rankTX.6656Dinosauriano rankTX.8492Saurischiano rankTX.436486Theropodano rankTX.436489Coelurosauriano rankTX.436491 + + + G + GGGCAATCCTGAGCCAA + false + + + H + CCATTGAGTCTCTGCACCTATC + false + + + ITS5 + GGAAGTAAAAGTCGTAACAAGG + false + + + 5.8S_fungi + CAAGAGATCCGTTGTTGAAAGTT + false + + + bryo_P6F + GATTCAGGGAAACTTAGGTTG + false + + + bryo_P6R + CCATTGAGTCTCTGCACC + false + + + Ench_12Sa + GCTGCACTTTGACTTGAC + false + + + Ench_12Sc + AGCCTGTGTACTGCTGTC + false + + + Coleop_16Sc + TGCAAAGGTAGCATAATMATTAG + false + + + Coleop_16Sd + TCCATAGGGTCTTCTCGTC + false + + + Aves_12Sa + GATTAGATACCCCACTATGC + false + + + Aves_12Sc + GTTTTAAGCGTTTGTGCTCG + false + + + + + GH + LO.P6_LOOP + PR.G + PR.H + TX.35493 + + 8 + 150 + + + + + + + + Fungi_ITS1 + LO.ITS1 + PR.ITS5 + PR.58S_FUNGI + TX.4751 + + BI.BELLEMAIN_10_00 + BI.EPP_12_00 + + + Bryophytes_GH + LO.P6_LOOP + PR.BRYO_P6F + PR.BRYO_P6R + TX.3208 + + BI.EPP_12_00 + + + Enchytraeids_12S + LO.M12SRRNA + PR.ENCH_12SA + PR.ENCH_12SC + TX.6388 + + BI.EPP_12_00 + + + Coleopters_16S + LO.M16SRRNA + PR.COLEOP_16SC + PR.COLEOP_16SD + TX.7041 + + BI.EPP_12_00 + + + Birds_12S + LO.M12SRRNA + PR.AVES_12SA + PR.AVES_12SC + TX.8782 + + BI.EPP_12_00 + + + + + + Power and limitations of the chloroplast trnL (UAA) intron for plant DNA barcoding. + + + Pierre + Taberlet + + author + + + + Eric + Coissac + + author + + + + Francois + Pompanon + + author + + + + Ludovic + Gielly + + author + + + + Christian + Miquel + + author + + + + Alice + Valentini + + author + + + + Thierry + Vermat + + author + + + + Gerard + Corthier + + author + + + + Christian + Brochmann + + author + + + + Eske + Willerslev + + author + + + + 2007 + + text + journal article + + + Nucleic Acids Research + + + continuing + + periodical + academic journal + + Taberlet:07:00 + + 2007 + + 35 + + + 3 + + + e14 + + + + + + DNA from soil mirrors plant taxonomic and growth form diversity + + + N + G + Yoccoz + + author + + + + K + A + Bråthen + + author + + + + L + Gielly + + author + + + + J + Haile + + author + + + + M + E + Edwards + + author + + + + T + Goslar + + author + + + + H + Von Stedingk + + author + + + + A + K + Brysting + + author + + + + E + Coissac + + author + + + + F + Pompanon + + author + + + + J + H + Sønstebø + + author + + + + C + Miquel + + author + + + + A + Valentini + + author + + + + F + De Bello + + author + + + + J + Chave + + author + + + + W + Thuiller + + author + + + + P + Wincker + + author + + + + C + Cruaud + + author + + + + F + Gavory + + author + + + + M + Rasmussen + + author + + + + M + T + P + Gilbert + + author + + + + L + Orlando + + author + + + + C + Brochmann + + author + + + + E + Willerslev + + author + + + + P + Taberlet + + author + + + + 2012-Aug + + text + journal article + + + Mol Ecol + + + continuing + + periodical + academic journal + + Yoccoz:12:00 + + 2012-Aug + + 21 + + + 15 + + + 3647 + 55 + + + + + + New environmental metabarcodes for analysing soil DNA + potential for studying past and present ecosystems + + + Laura + S + Epp + + author + + + + Sanne + Boessenkool + + author + + + + Eva + P + Bellemain + + author + + + + James + Haile + + author + + + + Alfonso + Esposito + + author + + + + Tiayyba + Riaz + + author + + + + Christer + Erséus + + author + + + + Vladimir + I + Gusarov + + author + + + + Mary + E + Edwards + + author + + + + Arild + Johnsen + + author + + + + Hans + K + Stenøien + + author + + + + Kristian + Hassel + + author + + + + Håvard + Kauserud + + author + + + + Nigel + G + Yoccoz + + author + + + + Kari + Anne + Bråthen + + author + + + + Eske + Willerslev + + author + + + + Pierre + Taberlet + + author + + + + Eric + Coissac + + author + + + + Christian + Brochmann + + author + + + + 2012-Apr + + text + journal article + + + Molecular Ecology + + + continuing + + periodical + academic journal + + Epp:12:00 + + 2012-Apr + + 21 + + + 8 + + + 1821 + 1833 + + + + + + ITS as an environmental DNA barcode for fungi + an in silico approach reveals potential PCR biases + + + Eva + Bellemain + + author + + + + Tor + Carlsen + + author + + + + Christian + Brochmann + + author + + + + Eric + Coissac + + author + + + + Pierre + Taberlet + + author + + + + Håvard + Kauserud + + author + + + + 2010 + + text + journal article + + + BMC Microbiology + + + continuing + + periodical + academic journal + + Bellemain:10:00 + + 2010 + 10 + 189 + + + diff --git a/inst/extdata/empty.xml b/inst/extdata/empty.xml new file mode 100644 index 0000000..dba553f --- /dev/null +++ b/inst/extdata/empty.xml @@ -0,0 +1,22 @@ + + + + + + + + + + Root + no rank + + + + + + + \ No newline at end of file diff --git a/inst/extdata/mods-3-5.xsd b/inst/extdata/mods-3-5.xsd new file mode 100644 index 0000000..cdbfdae --- /dev/null +++ b/inst/extdata/mods-3-5.xsd @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/inst/extdata/spare.xml b/inst/extdata/spare.xml new file mode 100644 index 0000000..fa535c0 --- /dev/null +++ b/inst/extdata/spare.xml @@ -0,0 +1,24 @@ + + + + + + + + + + Root + no rank + + + + + + + + + \ No newline at end of file diff --git a/inst/extdata/taberlet2007.bib b/inst/extdata/taberlet2007.bib new file mode 100644 index 0000000..b59cc50 --- /dev/null +++ b/inst/extdata/taberlet2007.bib @@ -0,0 +1,9 @@ + +@article{Taberlet:07:00, + Author = {Taberlet, Pierre and Coissac, E and Pompanon, Francois and Gielly, Ludovic and Miquel, Christian and Valentini, Alice and Vermat, Thierry and Corthier, Gerard and Brochmann, Christian and Willerslev, Eske}, + Journal = {Nucleic Acids Res}, + Number = {3}, + Pages = {e14}, + Title = {Power and limitations of the chloroplast trnL (UAA) intron for plant DNA barcoding.}, + Volume = {35}, + Year = {2007}} diff --git a/inst/extdata/taberlet2007.xml b/inst/extdata/taberlet2007.xml new file mode 100644 index 0000000..fd5deb9 --- /dev/null +++ b/inst/extdata/taberlet2007.xml @@ -0,0 +1,100 @@ + + + + + Power and limitations of the chloroplast trnL (UAA) intron for plant DNA barcoding. + + + Pierre + Taberlet + + author + + + + E + Coissac + + author + + + + Francois + Pompanon + + author + + + + Ludovic + Gielly + + author + + + + Christian + Miquel + + author + + + + Alice + Valentini + + author + + + + Thierry + Vermat + + author + + + + Gerard + Corthier + + author + + + + Christian + Brochmann + + author + + + + Eske + Willerslev + + author + + + + 2007 + + text + journal article + + + Nucleic Acids Res + + + continuing + + periodical + academic journal + + Taberlet:07:00 + + 2007 + 35 + 3 + e14 + + + diff --git a/inst/extdata/yoccoz_2012.bib b/inst/extdata/yoccoz_2012.bib new file mode 100644 index 0000000..8f05e3e --- /dev/null +++ b/inst/extdata/yoccoz_2012.bib @@ -0,0 +1,10 @@ + +@article{Yoccoz:12:00, + Author = {Yoccoz, N G and Br{\aa}then, K A and Gielly, L and Haile, J and Edwards, M E and Goslar, T and Von Stedingk, H and Brysting, A K and Coissac, E and Pompanon, F and S{\o}nsteb{\o}, J H and Miquel, C and Valentini, A and De Bello, F and Chave, J and Thuiller, W and Wincker, P and Cruaud, C and Gavory, F and Rasmussen, M and Gilbert, M T P and Orlando, L and Brochmann, C and Willerslev, E and Taberlet, P}, + Journal = {Mol Ecol}, + Month = {Aug}, + Number = {15}, + Pages = {3647-55}, + Title = {DNA from soil mirrors plant taxonomic and growth form diversity}, + Volume = {21}, + Year = {2012}} diff --git a/src/ROBIBarcodes.so b/src/ROBIBarcodes.so new file mode 100755 index 0000000..eb6aee2 Binary files /dev/null and b/src/ROBIBarcodes.so differ diff --git a/src/ecoError.c b/src/ecoError.c new file mode 100644 index 0000000..00bbfa2 --- /dev/null +++ b/src/ecoError.c @@ -0,0 +1,26 @@ +#include "ecoPCR.h" +#include +#include + +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ +void ecoError(int32_t error, + const char* message, + const char * filename, + int linenumber) +{ + fprintf(stderr,"Error %d in file %s line %d : %s\n", + error, + filename, + linenumber, + message); + + abort(); +} diff --git a/src/ecoError.o b/src/ecoError.o new file mode 100644 index 0000000..19eb12a Binary files /dev/null and b/src/ecoError.o differ diff --git a/src/ecoIOUtils.c b/src/ecoIOUtils.c new file mode 100644 index 0000000..8d7ce82 --- /dev/null +++ b/src/ecoIOUtils.c @@ -0,0 +1,122 @@ +#include "ecoPCR.h" +#include +#include + +#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \ + (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF)) + + +int32_t is_big_endian() +{ + int32_t i=1; + + return (int32_t)((char*)&i)[0]; +} + + + + +int32_t swap_int32_t(int32_t i) +{ + return SWAPINT32(i); +} + + +/** + * Read part of the file + * @param *f the database + * @param recordSize the size to be read + * + * @return buffer + */ +void *read_ecorecord(FILE *f,int32_t *recordSize) +{ + static void *buffer =NULL; + int32_t buffersize=0; + int32_t read; + + if (!recordSize) + ECOERROR(ECO_ASSERT_ERROR, + "recordSize cannot be NULL"); + + read = fread(recordSize, + 1, + sizeof(int32_t), + f); + + if (feof(f)) + return NULL; + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *recordSize=swap_int32_t(*recordSize); + + if (buffersize < *recordSize) + { + if (buffer) + buffer = ECOREALLOC(buffer,*recordSize, + "Increase size of record buffer"); + else + buffer = ECOMALLOC(*recordSize, + "Allocate record buffer"); + } + + read = fread(buffer, + 1, + *recordSize, + f); + + if (read != *recordSize) + ECOERROR(ECO_IO_ERROR,"Reading record data error"); + + return buffer; +}; + + + + + +/** + * Open the database and check it's readable + * @param filename name of the database (.sdx, .rdx, .tbx) + * @param sequencecount buffer - pointer to variable storing the number of occurence + * @param abort_on_open_error boolean to define the behaviour in case of error + * while opening the database + * @return FILE type + **/ +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error) +{ + FILE *f; + int32_t read; + + f = fopen(filename,"rb"); + + if (!f) + { + if (abort_on_open_error) + ECOERROR(ECO_IO_ERROR,"Cannot open file"); + else + { + *sequencecount=0; + return NULL; + } + } + + read = fread(sequencecount, + 1, + sizeof(int32_t), + f); + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *sequencecount=swap_int32_t(*sequencecount); + + return f; +} + diff --git a/src/ecoIOUtils.o b/src/ecoIOUtils.o new file mode 100644 index 0000000..42057eb Binary files /dev/null and b/src/ecoIOUtils.o differ diff --git a/src/ecoMalloc.c b/src/ecoMalloc.c new file mode 100644 index 0000000..0ea8d3b --- /dev/null +++ b/src/ecoMalloc.c @@ -0,0 +1,79 @@ +#include "ecoPCR.h" +#include + +static int eco_log_malloc = 0; + +void eco_trace_memory_allocation() +{ + eco_log_malloc=1; +} + +void eco_untrace_memory_allocation() +{ + eco_log_malloc=0; +} + + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line) +{ + void * chunk; + + chunk = calloc(1,chunksize); + + if (!chunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment located at %p of size %d is allocated (file : %s [%d])", + chunk, + chunksize, + filename, + line); + + return chunk; +} + +void *eco_realloc(void *chunk, + int32_t newsize, + const char *error_message, + const char *filename, + int32_t line) +{ + void *newchunk; + + newchunk = realloc(chunk,newsize); + + if (!newchunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + fprintf(stderr, + "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", + chunk, + newchunk, + newsize, + filename, + line); + + return newchunk; +} + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line) +{ + free(chunk); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment %p is released => %s (file : %s [%d])", + chunk, + error_message, + filename, + line); +} diff --git a/src/ecoMalloc.o b/src/ecoMalloc.o new file mode 100644 index 0000000..ac07f66 Binary files /dev/null and b/src/ecoMalloc.o differ diff --git a/src/ecoPCR.h b/src/ecoPCR.h new file mode 100644 index 0000000..8084fb9 --- /dev/null +++ b/src/ecoPCR.h @@ -0,0 +1,283 @@ +#ifndef ECOPCR_H_ +#define ECOPCR_H_ + +#include +#include + +#include +#include +#include + + +//#ifndef H_apat +//#include "../libapat/apat.h" +//#endif + +/***************************************************** + * + * Data type declarations + * + *****************************************************/ + +/* + * + * Sequence types + * + */ + +typedef struct { + + int32_t taxid; + char AC[20]; + int32_t DE_length; + int32_t SQ_length; + int32_t CSQ_length; + + char data[1]; + +} ecoseqformat_t; + +typedef struct { + int32_t taxid; + int32_t SQ_length; + char *AC; + char *DE; + char *SQ; +} ecoseq_t; + +/* + * + * Taxonomy taxon types + * + */ + + +typedef struct { + int32_t taxid; + int32_t rank; + int32_t parent; + int32_t namelength; + char name[1]; + +} ecotxformat_t; + +typedef struct ecotxnode { + int32_t taxid; + int32_t rank; + int32_t farest; + struct ecotxnode *parent; + char *name; +} ecotx_t; + +typedef struct { + int32_t count; + int32_t maxtaxid; + int32_t buffersize; + ecotx_t taxon[1]; +} ecotxidx_t; + + +/* + * + * Taxonomy rank types + * + */ + +typedef struct { + int32_t count; + char* label[1]; +} ecorankidx_t; + +/* + * + * Taxonomy name types + * + */ + +typedef struct { + int32_t is_scientificname; + int32_t namelength; + int32_t classlength; + int32_t taxid; + char names[1]; +} econameformat_t; + + + typedef struct { + char *name; + char *classname; + int32_t is_scientificname; + struct ecotxnode *taxon; +} econame_t; + + +typedef struct { + int32_t count; + econame_t names[1]; +} econameidx_t; + + + typedef struct { + ecorankidx_t *ranks; + econameidx_t *names; + ecotxidx_t *taxons; +} ecotaxonomy_t; + + +/***************************************************** + * + * Function declarations + * + *****************************************************/ + +/* + * + * Low level system functions + * + */ + +int32_t is_big_endian(); +int32_t swap_int32_t(int32_t); + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + + +void *eco_realloc(void *chunk, + int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line); + +void eco_trace_memory_allocation(); +void eco_untrace_memory_allocation(); + +#define ECOMALLOC(size,error_message) \ + eco_malloc((size),(error_message),__FILE__,__LINE__) + +#define ECOREALLOC(chunk,size,error_message) \ + eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__) + +#define ECOFREE(chunk,error_message) \ + eco_free((chunk),(error_message),__FILE__,__LINE__) + + + + +/* + * + * Error managment + * + */ + + +void ecoError(int32_t,const char*,const char *,int); + +#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__) + +#define ECO_IO_ERROR (1) +#define ECO_MEM_ERROR (2) +#define ECO_ASSERT_ERROR (3) +#define ECO_NOTFOUND_ERROR (4) + + +/* + * + * Low level Disk access functions + * + */ + +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error); + +void *read_ecorecord(FILE *,int32_t *recordSize); + + + +/* + * Read function in internal binary format + */ + +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount); + +ecoseq_t *readnext_ecoseq(FILE *); + +ecorankidx_t *read_rankidx(const char *filename); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy); + + + + /** + * Read taxonomy data as formated by the ecoPCRFormat.py script. + * + * This function is normaly uses internaly by the read_taxonomy + * function and should not be called directly. + * + * @arg filename path to the *.tdx file of the reformated db + * + * @return pointer to a taxonomy index structure + */ + +ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2); + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName); + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx); + +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid); + +int eco_isundertaxon(ecotx_t *taxon, int other_taxid); + +ecoseq_t *ecoseq_iterator(const char *prefix); + + + +ecoseq_t *new_ecoseq(); +int32_t delete_ecoseq(ecoseq_t *); +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid + ); + + +int32_t delete_taxon(ecotx_t *taxon); +int32_t delete_taxonomy(ecotxidx_t *index); +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy); + + +int32_t rank_index(const char* label,ecorankidx_t* ranks); + +//int32_t delete_apatseq(SeqPtr pseq); +//PatternPtr buildPattern(const char *pat, int32_t error_max); +//PatternPtr complementPattern(PatternPtr pat); +// +//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular); + +//char *ecoComplementPattern(char *nucAcSeq); +//char *ecoComplementSequence(char *nucAcSeq); +//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end); + +ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); + +//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid); +//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid); + + +ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy); + +#endif /*ECOPCR_H_*/ diff --git a/src/econame.c b/src/econame.c new file mode 100644 index 0000000..5ef112a --- /dev/null +++ b/src/econame.c @@ -0,0 +1,64 @@ +#include "ecoPCR.h" +#include +#include + +static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy) +{ + + int32_t count; + FILE *f; + econameidx_t *indexname; + int32_t i; + + f = open_ecorecorddb(filename,&count,0); + + if (f==NULL) + return NULL; + + indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names"); + + indexname->count=count; + + for (i=0; i < count; i++){ + readnext_econame(f,(indexname->names)+i,taxonomy); + } + + return indexname; +} + +econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy) +{ + + econameformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->is_scientificname = swap_int32_t(raw->is_scientificname); + raw->namelength = swap_int32_t(raw->namelength); + raw->classlength = swap_int32_t(raw->classlength); + raw->taxid = swap_int32_t(raw->taxid); + } + + name->is_scientificname=raw->is_scientificname; + + name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name"); + strncpy(name->name,raw->names,raw->namelength); + name->name[raw->namelength]=0; + + name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname"); + strncpy(name->classname,(raw->names+raw->namelength),raw->classlength); + name->classname[raw->classlength]=0; + + name->taxon = taxonomy->taxons->taxon + raw->taxid; + + return name; +} + diff --git a/src/econame.o b/src/econame.o new file mode 100644 index 0000000..3cf56c9 Binary files /dev/null and b/src/econame.o differ diff --git a/src/ecorank.c b/src/ecorank.c new file mode 100644 index 0000000..27d9f40 --- /dev/null +++ b/src/ecorank.c @@ -0,0 +1,55 @@ +#include "ecoPCR.h" +#include +#include + +static int compareRankLabel(const void *label1, const void *label2); + +ecorankidx_t *read_rankidx(const char *filename) +{ + int32_t count; + FILE *f; + ecorankidx_t *index; + int32_t i; + int32_t rs; + char *buffer; + + f = open_ecorecorddb(filename,&count,0); + + if (f==NULL) + return NULL; + + index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1), + "Allocate rank index"); + + index->count=count; + + for (i=0; i < count; i++) + { + buffer = read_ecorecord(f,&rs); + index->label[i]=(char*) ECOMALLOC(rs+1, + "Allocate rank label"); + strncpy(index->label[i],buffer,rs); + } + + return index; +} + +int32_t rank_index(const char* label,ecorankidx_t* ranks) +{ + char **rep; + + rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel); + + if (rep) + return rep-ranks->label; +// else +// ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found"); + + return -1; +} + + +int compareRankLabel(const void *label1, const void *label2) +{ + return strcmp((const char*)label1,*(const char**)label2); +} diff --git a/src/ecorank.o b/src/ecorank.o new file mode 100644 index 0000000..791f0df Binary files /dev/null and b/src/ecorank.o differ diff --git a/src/ecotax.c b/src/ecotax.c new file mode 100644 index 0000000..52e79f9 --- /dev/null +++ b/src/ecotax.c @@ -0,0 +1,422 @@ +#include "ecoPCR.h" +#include +#include +#include + +#include + +#ifndef MAX +#define MAX(x,y) (((x)>(y)) ? (x):(y)) +#endif + +static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); + + /** + * Open the taxonomy database + * @param pointer to the database (.tdx file) + * @return a ecotxidx_t structure + */ +ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2) +{ + int32_t count; + int32_t count2; + FILE *f; + FILE *f2; + ecotxidx_t *index; + struct ecotxnode *t; + int32_t i; + int32_t j; + + f = open_ecorecorddb(filename,&count,0); + + if (f==NULL) return NULL; + + f2 = open_ecorecorddb(filename2,&count2,0); + + index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1), + "Allocate taxonomy"); + + index->count=count+count2; + index->buffersize = index->count; + + index->maxtaxid=0; + REprintf("Readind %d taxa...\n",count); + for (i=0; i < count; i++){ + readnext_ecotaxon(f,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent; + index->taxon[i].parent->farest=0; + if (index->taxon[i].taxid > index->maxtaxid) + index->maxtaxid=index->taxon[i].taxid; + } + + + if (count2>0) + REprintf("Readind %d local taxa...\n",count2); + else + REprintf("No local taxon\n"); + + count = index->count; + + for (; i < count; i++){ + readnext_ecotaxon(f2,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent; + index->taxon[i].parent->farest=0; + if (index->taxon[i].taxid > index->maxtaxid) + index->maxtaxid=index->taxon[i].taxid; + } + + REprintf("Computing longest branches...\n",count); + + for (i=0; i < count; i++){ + t=index->taxon+i; + if (t->farest==-1) + { + t->farest=0; + while(t->parent != t) + { + j = t->farest + 1; + if (j > t->parent->farest) + { + t->parent->farest = j; + t=t->parent; + } + else + t=index->taxon; + } + } + } + + return index; +} + + +int32_t delete_taxonomy(ecotxidx_t *index) +{ + int32_t i; + + if (index) + { + for (i=0; i< index->count; i++) + if (index->taxon[i].name) + ECOFREE(index->taxon[i].name,"Free scientific name"); + + ECOFREE(index,"Free Taxonomy"); + + return 0; + } + + return 1; +} + + + +int32_t delete_taxon(ecotx_t *taxon) +{ + if (taxon) + { + if (taxon->name) + ECOFREE(taxon->name,"Free scientific name"); + + ECOFREE(taxon,"Free Taxon"); + + return 0; + } + + return 1; +} + + +/** + * Read the database for a given taxon a save the data + * into the taxon structure(if any found) + * @param *f pointer to FILE type returned by fopen + * @param *taxon pointer to the structure + * + * @return a ecotx_t structure if any taxon found else NULL + */ +ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) +{ + + ecotxformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->namelength = swap_int32_t(raw->namelength); + raw->parent = swap_int32_t(raw->parent); + raw->rank = swap_int32_t(raw->rank); + raw->taxid = swap_int32_t(raw->taxid); + } + + taxon->parent = (ecotx_t*)((size_t)raw->parent); + taxon->taxid = raw->taxid; + taxon->rank = raw->rank; + taxon->farest = -1; + + taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char), + "Allocate taxon scientific name"); + + strncpy(taxon->name,raw->name,raw->namelength); + + return taxon; +} + + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName) +{ + ecotaxonomy_t *tax; + char *filename; + char *filename2; + int buffsize; + + tax = ECOMALLOC(sizeof(ecotaxonomy_t), + "Allocate taxonomy structure"); + + tax->ranks =NULL; + tax->taxons=NULL; + tax->names =NULL; + + buffsize = strlen(prefix)+10; + + filename = ECOMALLOC(buffsize, + "Allocate filename"); + filename2= ECOMALLOC(buffsize, + "Allocate filename"); + + snprintf(filename,buffsize,"%s.rdx",prefix); + + tax->ranks = read_rankidx(filename); + + if (tax->ranks == NULL) + { + ECOFREE(filename,"Desallocate filename 1"); + ECOFREE(filename2,"Desallocate filename 2"); + + delete_ecotaxonomy(tax); + return NULL; + } + + snprintf(filename,buffsize,"%s.tdx",prefix); + snprintf(filename2,buffsize,"%s.ldx",prefix); + + tax->taxons = read_taxonomyidx(filename,filename2); + + if (tax->taxons == NULL) + { + ECOFREE(filename,"Desallocate filename 1"); + ECOFREE(filename,"Desallocate filename 2"); + + delete_ecotaxonomy(tax); + return NULL; + } + + if (readAlternativeName) + { + snprintf(filename,buffsize,"%s.ndx",prefix); + tax->names=read_nameidx(filename,tax); + } + else + tax->names=NULL; + + ECOFREE(filename,"Desallocate filename 1"); + ECOFREE(filename2,"Desallocate filename 2"); + + return tax; + +} + + + +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) +{ + if (taxonomy) + { + if (taxonomy->ranks) + ECOFREE(taxonomy->ranks,"Free rank index"); + + if (taxonomy->names) + ECOFREE(taxonomy->names,"Free names index"); + + if (taxonomy->taxons) + ECOFREE(taxonomy->taxons,"Free taxon index"); + + ECOFREE(taxonomy,"Free taxonomy structure"); + + return 0; + } + + return 1; +} + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, + int32_t rankidx) +{ + ecotx_t *current_taxon; + ecotx_t *next_taxon; + + current_taxon = taxon; + next_taxon = current_taxon->parent; + + while ((current_taxon!=next_taxon) && // I' am the root node + (current_taxon->rank!=rankidx)) + { + current_taxon = next_taxon; + next_taxon = current_taxon->parent; + } + + if (current_taxon->rank==rankidx) + return current_taxon; + else + return NULL; +} + +/** + * Get back information concerning a taxon from a taxonomic id + * @param *taxonomy the taxonomy database + * @param taxid the taxonomic id + * + * @result a ecotx_t structure containing the taxonimic information + **/ +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, + int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t taxoncount; + int32_t i; + + taxoncount=taxonomy->taxons->count; + + for (current_taxon=taxonomy->taxons->taxon, + i=0; + i < taxoncount; + i++, + current_taxon++){ + if (current_taxon->taxid==taxid){ + return current_taxon; + } + } + + return (ecotx_t*)NULL; +} + +/** + * Find out if taxon is son of other taxon (identified by its taxid) + * @param *taxon son taxon + * @param parent_taxid taxonomic id of the other taxon + * + * @return 1 is the other taxid math a parent taxid, else 0 + **/ +int eco_isundertaxon(ecotx_t *taxon, + int other_taxid) +{ + ecotx_t *next_parent; + + next_parent = taxon->parent; + + while ( (other_taxid != next_parent->taxid) && + (strcmp(next_parent->name, "root")) ) + { + next_parent = next_parent->parent; + } + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + +ecotx_t *eco_getspecies(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("species",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getgenus(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("genus",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + + +ecotx_t *eco_getfamily(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("family",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("kingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("superkingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} diff --git a/src/ecotax.o b/src/ecotax.o new file mode 100644 index 0000000..06dd353 Binary files /dev/null and b/src/ecotax.o differ diff --git a/src/robitax.h b/src/robitax.h new file mode 100644 index 0000000..52dec45 --- /dev/null +++ b/src/robitax.h @@ -0,0 +1,6 @@ +#include "ecoPCR.h" + + +ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy); +SEXP R_delete_taxonomy(SEXP Rtaxonomy); + diff --git a/src/taxonomy.c b/src/taxonomy.c new file mode 100644 index 0000000..cb77d74 --- /dev/null +++ b/src/taxonomy.c @@ -0,0 +1,199 @@ +/* + * taxonomy.c + * + * Created on: 17 janv. 2013 + * Author: coissac + */ + +#include "robitax.h" + + +static ecorankidx_t *new_bdbecorankidx(void); +static ecotxidx_t *new_bdbecotxidx(int nbtaxa, int rootrank); + +static ecorankidx_t *new_bdbecorankidx(void) +{ + ecorankidx_t *index; + int i; + size_t size; + char* rank[]={"class", "family", "forma", + "genus", "infraclass", "infraorder", + "kingdom", "motu", "no rank", + "order", "parvorder", "phylum", + "species", "species group", "species subgroup", + "subclass", "subfamily", "subgenus", + "subkingdom", "suborder", "subphylum", + "subspecies", "subtribe", "superclass", + "superfamily","superkingdom", "superorder", + "superphylum","tribe", "varietas"}; + + + + index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (29), + "Allocate rank index"); + + index->count=30; + for (i=0; i < index->count; i++) + { + size = strlen(rank[i]); + index->label[i]=(char*) ECOMALLOC(size+1, + "Allocate rank label"); + strcpy(index->label[i],rank[i]); + } + + return index; +} + +static ecotxidx_t *new_bdbecotxidx(int nbtaxa, int rootrank) { + + ecotxidx_t *index; + int rootnamelen = 4; // "rootname="root" + + index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (nbtaxa), + "Allocate taxonomy"); + + //initialize the taxonomy with the root taxon + index->count=nbtaxa+1; + index->maxtaxid=1; + index->buffersize=nbtaxa+1; + + // Create the root taxon + index->taxon[0].taxid=1; + index->taxon[0].rank=rootrank; + index->taxon[0].farest=0; + index->taxon[0].parent=(ecotx_t*)1; + + index->taxon[0].name= (char*) ECOMALLOC(sizeof(char) * rootnamelen +1, + "Allocate taxonomy root name"); + + strcpy(index->taxon[0].name, "root"); + + return index; + +} + +static int cmptaxid(const void* t1, const void* t2) { + int et1 = ((ecotx_t*)t1)->taxid; + int et2 = ((ecotx_t*)t2)->taxid; + + if (et1 < et2) + return -1; + + if (et2 < et1) + return 1; + + return 0; +} + +static int findparentcmp(const void* p, const void* t) { + int parent = (int)p; + int et = ((ecotx_t*)t)->taxid; + + if (parent < et) + return -1; + + if (et < parent) + return 1; + + return 0; +} + +SEXP R_buildbarcodetaxo(SEXP rdata) { + SEXP names; + SEXP taxids; + SEXP ranks; + SEXP partofs; + SEXP Rtax; + + int nbtaxa; + int rootrank; + int i; + + int taxid; + char *name; + int rank; + int partof; + + ecotxidx_t* parent; + + taxids = getAttrib(rdata, R_RowNamesSymbol); + names = VECTOR_ELT(rdata, 0); + ranks = VECTOR_ELT(rdata, 1); + partofs= VECTOR_ELT(rdata, 2); + + nbtaxa = GET_LENGTH(taxids); + + ecotaxonomy_t *tax; + + tax = ECOMALLOC(sizeof(ecotaxonomy_t), + "Allocate taxonomy structure"); + + tax->ranks =new_bdbecorankidx(); + + rootrank = rank_index("no rank",tax->ranks); + + tax->taxons=new_bdbecotxidx(nbtaxa,rootrank); + tax->names =NULL; + + + for (i=0; i< nbtaxa; i++) { + taxid = atol(CHAR(STRING_ELT(taxids, i)) + 3); + name = (char*) CHAR(STRING_ELT(names, i)); + rank = rank_index(CHAR(STRING_ELT(ranks, i)),tax->ranks); + partof = atol(CHAR(STRING_ELT(partofs, i)) + 3); + + if (taxid > tax->taxons->maxtaxid) + tax->taxons->maxtaxid=taxid; + + tax->taxons->taxon[i+1].taxid=taxid; + tax->taxons->taxon[i+1].rank=rootrank; + tax->taxons->taxon[i+1].farest=0; + tax->taxons->taxon[i+1].parent=(ecotx_t*)((size_t)partof); + + tax->taxons->taxon[i+1].name= (char*) ECOMALLOC(sizeof(char) * strlen(name) +1, + "Allocate taxonomy root name"); + + strcpy(tax->taxons->taxon[i+1].name, name); + } + + qsort((void*)tax->taxons->taxon,nbtaxa+1,sizeof(ecotx_t),cmptaxid); + + for (i=0; i< nbtaxa+1; i++) { + parent = (ecotxidx_t*) bsearch((void*)(tax->taxons->taxon[i].parent), + (void*)(tax->taxons->taxon), + nbtaxa+1, + sizeof(ecotx_t), + findparentcmp); + + + if (parent==NULL) + error("Error during taxonomy indexing"); + + tax->taxons->taxon[i].parent=(struct ecotxnode *)parent; + } + + Rtax = PROTECT(R_MakeExternalPtr(tax, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue)); + R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE); + + UNPROTECT(1); + + return Rtax; + +} + + +SEXP R_delete_taxonomy(SEXP Rtaxonomy) +{ + ecotaxonomy_t *ptax; + SEXP pointer; + + ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy); + + (void) delete_ecotaxonomy(ptax); + + // Clear the external pointer + R_ClearExternalPtr(Rtaxonomy); + + return R_NilValue; + +} diff --git a/src/taxonomy.o b/src/taxonomy.o new file mode 100644 index 0000000..ca31918 Binary files /dev/null and b/src/taxonomy.o differ