From 8c7017a99d0d42e8ebcc7110372bc4999806b163 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 7 Apr 2026 08:36:50 +0200 Subject: [PATCH] :arrow_up: version bump to v4.4.30 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update obioptions.Version from "Release 4.4.29" to "Release 4.4.30" - Update version.txt from 4.4.29 → 4.4.30 (automated by Makefile) --- .gitignore | 2 +- autodoc/cmd/obicomplement.md | 300 +++++++ autodoc/cmd/obiconsensus.md | 188 +++++ autodoc/cmd/obiconvert.md | 179 ++++ autodoc/cmd/obicount.md | 190 +++++ autodoc/cmd/obicsv.md | 315 +++++++ autodoc/cmd/obidemerge.md | 321 +++++++ autodoc/cmd/obidistribute.md | 296 +++++++ autodoc/cmd/obigrep.md | 326 ++++++++ autodoc/cmd/obijoin.md | 257 ++++++ autodoc/cmd/obimicrosat.md | 205 +++++ autodoc/cmd/obiscript.md | 384 +++++++++ autodoc/cmd/obisummary.md | 271 ++++++ autodoc/cmd/obiuniq.md | 347 ++++++++ autodoc/docmd/pkg.md | 48 ++ autodoc/docmd/pkg/obialign/alignment.md | 22 + autodoc/docmd/pkg/obialign/backtracking.md | 30 + autodoc/docmd/pkg/obialign/dnamatrix.md | 26 + autodoc/docmd/pkg/obialign/fastlcs.md | 23 + autodoc/docmd/pkg/obialign/fastlcsegf.md | 42 + autodoc/docmd/pkg/obialign/fourbitsencode.md | 15 + autodoc/docmd/pkg/obialign/is_d0_or_d1.md | 19 + autodoc/docmd/pkg/obialign/locatepattern.md | 29 + autodoc/docmd/pkg/obialign/pairedendalign.md | 37 + autodoc/docmd/pkg/obialign/readalign.md | 58 ++ autodoc/docmd/pkg/obiapat/pattern.md | 25 + autodoc/docmd/pkg/obiapat/pattern_test.md | 32 + autodoc/docmd/pkg/obiapat/pcr.md | 27 + autodoc/docmd/pkg/obiapat/predicat.md | 23 + autodoc/docmd/pkg/obichunk/chunk.md | 15 + autodoc/docmd/pkg/obichunk/chunk_on_disk.md | 18 + .../docmd/pkg/obichunk/chunks_on_memory.md | 21 + autodoc/docmd/pkg/obichunk/options.md | 26 + autodoc/docmd/pkg/obichunk/subchunks.md | 29 + autodoc/docmd/pkg/obichunk/unique.md | 45 + autodoc/docmd/pkg/obicorazick/worker.md | 28 + autodoc/docmd/pkg/obidefault/batch.md | 34 + 
autodoc/docmd/pkg/obidefault/compressed.md | 35 + autodoc/docmd/pkg/obidefault/logger.md | 38 + autodoc/docmd/pkg/obidefault/progressbar.md | 33 + autodoc/docmd/pkg/obidefault/quality.md | 26 + autodoc/docmd/pkg/obidefault/taxonomy.md | 21 + autodoc/docmd/pkg/obidefault/workers.md | 35 + autodoc/docmd/pkg/obidist/dist_matrix.md | 28 + autodoc/docmd/pkg/obidist/dist_matrix_test.md | 28 + .../pkg/obiformats/batch_of_files_reader.md | 43 + .../docmd/pkg/obiformats/batch_reader_type.md | 36 + autodoc/docmd/pkg/obiformats/csv_read.md | 30 + autodoc/docmd/pkg/obiformats/csv_writer.md | 22 + autodoc/docmd/pkg/obiformats/csviterator.md | 24 + .../docmd/pkg/obiformats/csvtaxdump_read.md | 27 + autodoc/docmd/pkg/obiformats/dispatcher.md | 14 + autodoc/docmd/pkg/obiformats/ecopcr_read.md | 29 + autodoc/docmd/pkg/obiformats/embl_read.md | 17 + autodoc/docmd/pkg/obiformats/empty_file.md | 22 + autodoc/docmd/pkg/obiformats/fastaseq_read.md | 34 + autodoc/docmd/pkg/obiformats/fastqseq_read.md | 41 + .../pkg/obiformats/fastqseq_write_generic.md | 11 + .../docmd/pkg/obiformats/fastseq_header.md | 27 + .../docmd/pkg/obiformats/fastseq_interface.md | 28 + .../pkg/obiformats/fastseq_json_header.md | 21 + .../pkg/obiformats/fastseq_obi_header.md | 31 + autodoc/docmd/pkg/obiformats/fastseq_read.md | 26 + .../pkg/obiformats/fastseq_write_fasta.md | 35 + .../pkg/obiformats/fastseq_write_fastq.md | 35 + .../obiformats/fastseq_write_with_index.md | 19 + .../docmd/pkg/obiformats/file_chunk_read.md | 25 + .../docmd/pkg/obiformats/file_chunk_write.md | 26 + autodoc/docmd/pkg/obiformats/genbank_read.md | 34 + autodoc/docmd/pkg/obiformats/json_writer.md | 27 + .../docmd/pkg/obiformats/ncbitaxdump_read.md | 17 + .../pkg/obiformats/ncbitaxdump_readtar.md | 31 + autodoc/docmd/pkg/obiformats/newick_write.md | 31 + .../docmd/pkg/obiformats/ngsfilter_read.md | 47 ++ autodoc/docmd/pkg/obiformats/options.md | 14 + autodoc/docmd/pkg/obiformats/rope_scanner.md | 27 + 
autodoc/docmd/pkg/obiformats/taxonomy_read.md | 34 + .../docmd/pkg/obiformats/universal_read.md | 26 + .../docmd/pkg/obiformats/universal_write.md | 29 + autodoc/docmd/pkg/obifp/uint128.md | 13 + autodoc/docmd/pkg/obifp/uint128_test.md | 17 + autodoc/docmd/pkg/obifp/uint256.md | 30 + autodoc/docmd/pkg/obifp/uint64.md | 34 + autodoc/docmd/pkg/obifp/unint.md | 32 + autodoc/docmd/pkg/obigraph/graph.md | 30 + autodoc/docmd/pkg/obigraph/graphbuffer.md | 14 + autodoc/docmd/pkg/obiiter/batch.md | 29 + autodoc/docmd/pkg/obiiter/batchiterator.md | 47 ++ autodoc/docmd/pkg/obiiter/distribute.md | 32 + autodoc/docmd/pkg/obiiter/extract_taxonomy.md | 24 + autodoc/docmd/pkg/obiiter/fragment.md | 28 + autodoc/docmd/pkg/obiiter/limitmemory.md | 29 + autodoc/docmd/pkg/obiiter/merge.md | 19 + autodoc/docmd/pkg/obiiter/numbering.md | 35 + autodoc/docmd/pkg/obiiter/paired.md | 17 + autodoc/docmd/pkg/obiiter/pipe.md | 17 + autodoc/docmd/pkg/obiiter/sequence_workers.md | 28 + autodoc/docmd/pkg/obiiter/speed.md | 31 + autodoc/docmd/pkg/obiiter/workers.md | 20 + autodoc/docmd/pkg/obiitercsv/csv.md | 33 + autodoc/docmd/pkg/obikmer/counting.md | 36 + autodoc/docmd/pkg/obikmer/debruijn.md | 44 + autodoc/docmd/pkg/obikmer/encodefourmer.md | 35 + autodoc/docmd/pkg/obikmer/encodekmer.md | 39 + autodoc/docmd/pkg/obikmer/encodekmer_test.md | 36 + autodoc/docmd/pkg/obikmer/entropy.md | 31 + autodoc/docmd/pkg/obikmer/kdi_merge.md | 37 + autodoc/docmd/pkg/obikmer/kdi_merge_test.md | 27 + autodoc/docmd/pkg/obikmer/kdi_reader.md | 27 + autodoc/docmd/pkg/obikmer/kdi_test.md | 34 + autodoc/docmd/pkg/obikmer/kdi_writer.md | 38 + autodoc/docmd/pkg/obikmer/kdx.md | 29 + autodoc/docmd/pkg/obikmer/kmer_match.md | 14 + autodoc/docmd/pkg/obikmer/kmer_set_builder.md | 49 ++ .../pkg/obikmer/kmer_set_builder_test.md | 44 + autodoc/docmd/pkg/obikmer/kmer_set_disk.md | 44 + .../docmd/pkg/obikmer/kmer_set_disk_ops.md | 26 + .../pkg/obikmer/kmer_set_disk_ops_test.md | 28 + autodoc/docmd/pkg/obikmer/kmermap.md | 37 + 
autodoc/docmd/pkg/obikmer/minimizer_utils.md | 27 + autodoc/docmd/pkg/obikmer/skm_reader.md | 24 + autodoc/docmd/pkg/obikmer/skm_test.md | 23 + autodoc/docmd/pkg/obikmer/skm_writer.md | 24 + autodoc/docmd/pkg/obikmer/spectrum.md | 35 + autodoc/docmd/pkg/obikmer/superkmer.md | 48 ++ autodoc/docmd/pkg/obikmer/superkmer_iter.md | 32 + .../docmd/pkg/obikmer/superkmer_iter_test.md | 39 + autodoc/docmd/pkg/obikmer/varint.md | 33 + autodoc/docmd/pkg/obikmer/varint_test.md | 37 + autodoc/docmd/pkg/obilog/warning.md | 30 + autodoc/docmd/pkg/obilua/lua.md | 33 + autodoc/docmd/pkg/obilua/lua_obicontext.md | 29 + .../docmd/pkg/obilua/lua_push_interface.md | 31 + autodoc/docmd/pkg/obilua/lua_table.md | 28 + autodoc/docmd/pkg/obilua/mutex.md | 30 + autodoc/docmd/pkg/obilua/obilib.md | 30 + autodoc/docmd/pkg/obilua/obiseq.md | 34 + autodoc/docmd/pkg/obilua/obiseqslice.md | 31 + autodoc/docmd/pkg/obilua/obitaxon.md | 30 + autodoc/docmd/pkg/obilua/obitaxonomy.md | 29 + autodoc/docmd/pkg/obingslibrary/marker.md | 40 + autodoc/docmd/pkg/obingslibrary/match.md | 32 + autodoc/docmd/pkg/obingslibrary/multimatch.md | 43 + autodoc/docmd/pkg/obingslibrary/ngslibrary.md | 17 + autodoc/docmd/pkg/obingslibrary/worker.md | 31 + autodoc/docmd/pkg/obioptions/options.md | 47 ++ autodoc/docmd/pkg/obioptions/subcommand.md | 23 + autodoc/docmd/pkg/obioptions/version.md | 35 + autodoc/docmd/pkg/obiphylo/tree.md | 30 + autodoc/docmd/pkg/obiseq/attributes.md | 22 + autodoc/docmd/pkg/obiseq/biosequence.md | 41 + autodoc/docmd/pkg/obiseq/biosequence_test.md | 35 + autodoc/docmd/pkg/obiseq/biosequenceslice.md | 37 + autodoc/docmd/pkg/obiseq/class.md | 32 + autodoc/docmd/pkg/obiseq/compare.md | 20 + autodoc/docmd/pkg/obiseq/eval.md | 28 + autodoc/docmd/pkg/obiseq/iupac_nog.md | 27 + autodoc/docmd/pkg/obiseq/join.md | 35 + autodoc/docmd/pkg/obiseq/kmers.md | 20 + autodoc/docmd/pkg/obiseq/language.md | 41 + autodoc/docmd/pkg/obiseq/merge.md | 39 + autodoc/docmd/pkg/obiseq/paired_reads.md | 19 + 
autodoc/docmd/pkg/obiseq/pool.md | 34 + autodoc/docmd/pkg/obiseq/predicate.md | 33 + autodoc/docmd/pkg/obiseq/revcomp.md | 35 + autodoc/docmd/pkg/obiseq/revcomp_test.md | 19 + autodoc/docmd/pkg/obiseq/subseq.md | 13 + autodoc/docmd/pkg/obiseq/subseq_test.md | 29 + .../docmd/pkg/obiseq/taxonomy_classifier.md | 26 + autodoc/docmd/pkg/obiseq/taxonomy_lca.md | 22 + autodoc/docmd/pkg/obiseq/taxonomy_methods.md | 41 + .../docmd/pkg/obiseq/taxonomy_predicate.md | 20 + autodoc/docmd/pkg/obiseq/taxonomy_workers.md | 22 + autodoc/docmd/pkg/obiseq/worker.md | 18 + autodoc/docmd/pkg/obistats/algo.md | 20 + autodoc/docmd/pkg/obistats/beta.md | 33 + autodoc/docmd/pkg/obistats/betabinom.md | 39 + autodoc/docmd/pkg/obistats/data.md | 31 + autodoc/docmd/pkg/obistats/delta.md | 25 + autodoc/docmd/pkg/obistats/kmeans.md | 34 + autodoc/docmd/pkg/obistats/kolmogorovbeta.md | 26 + autodoc/docmd/pkg/obistats/mannwhitney.md | 37 + autodoc/docmd/pkg/obistats/mathx.md | 27 + autodoc/docmd/pkg/obistats/minmax.md | 29 + autodoc/docmd/pkg/obistats/normaldist.md | 30 + autodoc/docmd/pkg/obistats/random.md | 31 + autodoc/docmd/pkg/obistats/sample.md | 22 + autodoc/docmd/pkg/obistats/scaler.md | 23 + autodoc/docmd/pkg/obistats/sort.md | 24 + autodoc/docmd/pkg/obistats/stats.md | 25 + autodoc/docmd/pkg/obistats/table.md | 31 + autodoc/docmd/pkg/obistats/tdist.md | 30 + autodoc/docmd/pkg/obistats/ttest.md | 37 + autodoc/docmd/pkg/obistats/udist.md | 39 + autodoc/docmd/pkg/obistats/utils.md | 21 + autodoc/docmd/pkg/obisuffix/suffix_array.md | 23 + autodoc/docmd/pkg/obitable/table.md | 22 + autodoc/docmd/pkg/obitax/default_taxonomy.md | 30 + autodoc/docmd/pkg/obitax/filter_on_name.md | 28 + autodoc/docmd/pkg/obitax/filter_on_rank.md | 12 + .../docmd/pkg/obitax/filter_on_subclade_of.md | 31 + autodoc/docmd/pkg/obitax/inner.md | 40 + autodoc/docmd/pkg/obitax/issuubcladeof.md | 19 + autodoc/docmd/pkg/obitax/iterator.md | 31 + autodoc/docmd/pkg/obitax/lca.md | 31 + 
autodoc/docmd/pkg/obitax/string_parser.md | 41 + autodoc/docmd/pkg/obitax/taxid.md | 19 + autodoc/docmd/pkg/obitax/taxon.md | 29 + autodoc/docmd/pkg/obitax/taxonnode.md | 36 + autodoc/docmd/pkg/obitax/taxonomy.md | 18 + autodoc/docmd/pkg/obitax/taxonset.md | 24 + autodoc/docmd/pkg/obitax/taxonslice.md | 25 + .../pkg/obitools/obiannotate/obiannotate.md | 31 + .../docmd/pkg/obitools/obiannotate/options.md | 37 + .../docmd/pkg/obitools/obiclean/chimera.md | 30 + autodoc/docmd/pkg/obitools/obiclean/graph.md | 41 + .../docmd/pkg/obitools/obiclean/obiclean.md | 48 ++ .../docmd/pkg/obitools/obiclean/options.md | 27 + .../pkg/obitools/obicleandb/obicleandb.md | 45 + .../docmd/pkg/obitools/obicleandb/options.md | 18 + .../docmd/pkg/obitools/obiclust/obiclust.md | 40 + .../docmd/pkg/obitools/obiclust/options.md | 31 + .../pkg/obitools/obiconsensus/obiconsensus.md | 45 + .../pkg/obitools/obiconsensus/options.md | 26 + .../docmd/pkg/obitools/obiconvert/options.md | 36 + .../obitools/obiconvert/sequence_reader.md | 24 + .../obitools/obiconvert/sequence_writer.md | 30 + .../docmd/pkg/obitools/obicount/options.md | 25 + .../docmd/pkg/obitools/obicsv/csvoption.md | 33 + autodoc/docmd/pkg/obitools/obicsv/obicsv.md | 23 + autodoc/docmd/pkg/obitools/obicsv/options.md | 28 + autodoc/docmd/pkg/obitools/obicsv/sequence.md | 27 + autodoc/docmd/pkg/obitools/obicsv/writer.md | 21 + .../docmd/pkg/obitools/obidemerge/demerge.md | 22 + .../docmd/pkg/obitools/obidemerge/options.md | 18 + .../pkg/obitools/obidistribute/distribute.md | 26 + .../pkg/obitools/obidistribute/options.md | 33 + autodoc/docmd/pkg/obitools/obigrep/grep.md | 26 + autodoc/docmd/pkg/obitools/obigrep/options.md | 48 ++ autodoc/docmd/pkg/obitools/obijoin/join.md | 22 + autodoc/docmd/pkg/obitools/obijoin/options.md | 42 + autodoc/docmd/pkg/obitools/obik/cp.md | 29 + autodoc/docmd/pkg/obitools/obik/filter.md | 34 + autodoc/docmd/pkg/obitools/obik/index.md | 39 + autodoc/docmd/pkg/obitools/obik/lowmask.md | 29 + 
autodoc/docmd/pkg/obitools/obik/ls.md | 33 + autodoc/docmd/pkg/obitools/obik/match.md | 44 + autodoc/docmd/pkg/obitools/obik/mv.md | 29 + autodoc/docmd/pkg/obitools/obik/obik.md | 32 + autodoc/docmd/pkg/obitools/obik/options.md | 42 + autodoc/docmd/pkg/obitools/obik/rm.md | 27 + autodoc/docmd/pkg/obitools/obik/spectrum.md | 28 + autodoc/docmd/pkg/obitools/obik/summary.md | 32 + autodoc/docmd/pkg/obitools/obik/super.md | 25 + .../pkg/obitools/obikmersim/obikmersim.md | 35 + .../docmd/pkg/obitools/obikmersim/options.md | 18 + .../pkg/obitools/obilandmark/obilandmark.md | 28 + .../docmd/pkg/obitools/obilandmark/options.md | 21 + .../pkg/obitools/obilandmark/taxostat.md | 39 + .../docmd/pkg/obitools/obimatrix/obimatrix.md | 17 + .../docmd/pkg/obitools/obimatrix/options.md | 32 + .../pkg/obitools/obimicrosat/microsat.md | 51 ++ .../docmd/pkg/obitools/obimicrosat/options.md | 29 + .../pkg/obitools/obimultiplex/demultiplex.md | 23 + .../pkg/obitools/obimultiplex/options.md | 35 + .../docmd/pkg/obitools/obipairing/options.md | 25 + .../docmd/pkg/obitools/obipairing/pairing.md | 39 + autodoc/docmd/pkg/obitools/obipcr/options.md | 24 + autodoc/docmd/pkg/obitools/obipcr/pcr.md | 40 + .../pkg/obitools/obirefidx/famlilyindexing.md | 37 + .../pkg/obitools/obirefidx/geomindexing.md | 25 + .../docmd/pkg/obitools/obirefidx/obirefidx.md | 37 + .../docmd/pkg/obitools/obirefidx/options.md | 18 + .../docmd/pkg/obitools/obiscript/obiscript.md | 25 + .../docmd/pkg/obitools/obiscript/options.md | 43 + .../docmd/pkg/obitools/obisplit/obisplit.md | 30 + .../docmd/pkg/obitools/obisplit/options.md | 16 + .../pkg/obitools/obisummary/obisummary.md | 47 ++ .../docmd/pkg/obitools/obisummary/options.md | 26 + .../docmd/pkg/obitools/obitag/obigeomtag.md | 36 + autodoc/docmd/pkg/obitools/obitag/obitag.md | 41 + autodoc/docmd/pkg/obitools/obitag/options.md | 33 + .../docmd/pkg/obitools/obitagpcr/options.md | 29 + .../docmd/pkg/obitools/obitagpcr/pcrtag.md | 23 + 
.../pkg/obitools/obitaxonomy/obitaxonomy.md | 11 + .../docmd/pkg/obitools/obitaxonomy/options.md | 31 + autodoc/docmd/pkg/obitools/obiuniq/options.md | 27 + autodoc/docmd/pkg/obitools/obiuniq/unique.md | 17 + autodoc/docmd/pkg/obiutils/abs.md | 41 + autodoc/docmd/pkg/obiutils/abs_test.md | 23 + autodoc/docmd/pkg/obiutils/array.md | 23 + autodoc/docmd/pkg/obiutils/array_test.md | 15 + autodoc/docmd/pkg/obiutils/bytes.md | 27 + autodoc/docmd/pkg/obiutils/bytes_test.md | 20 + autodoc/docmd/pkg/obiutils/cast_interface.md | 15 + autodoc/docmd/pkg/obiutils/counter.md | 25 + autodoc/docmd/pkg/obiutils/download.md | 37 + autodoc/docmd/pkg/obiutils/goutils.md | 16 + autodoc/docmd/pkg/obiutils/gzipfile.md | 37 + autodoc/docmd/pkg/obiutils/memsize.md | 15 + autodoc/docmd/pkg/obiutils/mimetypes.md | 32 + autodoc/docmd/pkg/obiutils/minmax.md | 36 + autodoc/docmd/pkg/obiutils/minmultiset.md | 35 + autodoc/docmd/pkg/obiutils/path.md | 16 + autodoc/docmd/pkg/obiutils/path_test.md | 19 + autodoc/docmd/pkg/obiutils/pipe.md | 36 + autodoc/docmd/pkg/obiutils/ranks.md | 35 + autodoc/docmd/pkg/obiutils/set.md | 34 + autodoc/docmd/pkg/obiutils/set_test.md | 36 + autodoc/docmd/pkg/obiutils/slices.md | 22 + autodoc/docmd/pkg/obiutils/strings.md | 33 + autodoc/docmd/pkg/obiutils/tar.md | 24 + autodoc/docmd/pkg/obiutils/unsafe.md | 25 + autodoc/docmd/pkg/obiutils/xopen.md | 38 + autodoc/docmd/pkg/obiutils/xopen_test.md | 19 + autodoc/docmd/pkg_obialign.md | 76 ++ autodoc/docmd/pkg_obiapat.md | 88 ++ autodoc/docmd/pkg_obichunk.md | 103 +++ autodoc/docmd/pkg_obicorazick.md | 70 ++ autodoc/docmd/pkg_obidefault.md | 56 ++ autodoc/docmd/pkg_obidist.md | 52 ++ autodoc/docmd/pkg_obiformats.md | 121 +++ autodoc/docmd/pkg_obifp.md | 82 ++ autodoc/docmd/pkg_obigraph.md | 87 ++ autodoc/docmd/pkg_obiiter.md | 79 ++ autodoc/docmd/pkg_obiitercsv.md | 61 ++ autodoc/docmd/pkg_obikmer.md | 101 +++ autodoc/docmd/pkg_obilog.md | 30 + autodoc/docmd/pkg_obilua.md | 64 ++ autodoc/docmd/pkg_obingslibrary.md | 77 
++ autodoc/docmd/pkg_obioptions.md | 75 ++ autodoc/docmd/pkg_obiphylo.md | 61 ++ autodoc/docmd/pkg_obiseq.md | 70 ++ autodoc/docmd/pkg_obistats.md | 126 +++ autodoc/docmd/pkg_obisuffix.md | 57 ++ autodoc/docmd/pkg_obitable.md | 39 + autodoc/docmd/pkg_obitax.md | 76 ++ autodoc/docmd/pkg_obitools_obiannotate.md | 42 + autodoc/docmd/pkg_obitools_obiclean.md | 86 ++ autodoc/docmd/pkg_obitools_obicleandb.md | 54 ++ autodoc/docmd/pkg_obitools_obiclust.md | 76 ++ autodoc/docmd/pkg_obitools_obiconsensus.md | 49 ++ autodoc/docmd/pkg_obitools_obiconvert.md | 62 ++ autodoc/docmd/pkg_obitools_obicount.md | 55 ++ autodoc/docmd/pkg_obitools_obicsv.md | 55 ++ autodoc/docmd/pkg_obitools_obidemerge.md | 53 ++ autodoc/docmd/pkg_obitools_obidistribute.md | 58 ++ autodoc/docmd/pkg_obitools_obigrep.md | 54 ++ autodoc/docmd/pkg_obitools_obijoin.md | 62 ++ autodoc/docmd/pkg_obitools_obik.md | 113 +++ autodoc/docmd/pkg_obitools_obikmersim.md | 107 +++ autodoc/docmd/pkg_obitools_obilandmark.md | 49 ++ autodoc/docmd/pkg_obitools_obimatrix.md | 40 + autodoc/docmd/pkg_obitools_obimicrosat.md | 54 ++ autodoc/docmd/pkg_obitools_obimultiplex.md | 54 ++ autodoc/docmd/pkg_obitools_obipairing.md | 62 ++ autodoc/docmd/pkg_obitools_obipcr.md | 42 + autodoc/docmd/pkg_obitools_obirefidx.md | 120 +++ autodoc/docmd/pkg_obitools_obiscript.md | 50 ++ autodoc/docmd/pkg_obitools_obisplit.md | 48 ++ autodoc/docmd/pkg_obitools_obisummary.md | 64 ++ autodoc/docmd/pkg_obitools_obitag.md | 82 ++ autodoc/docmd/pkg_obitools_obitagpcr.md | 59 ++ autodoc/docmd/pkg_obitools_obitaxonomy.md | 51 ++ autodoc/docmd/pkg_obitools_obiuniq.md | 36 + autodoc/docmd/pkg_obiutils.md | 61 ++ autodoc/examples/obiconvert/output.json | 23 + autodoc/examples/obicount/out_default.txt | 7 + autodoc/examples/obicount/out_fastq_reads.txt | 4 + autodoc/examples/obicount/out_symbols.txt | 5 + autodoc/examples/obicount/out_variants.txt | 5 + autodoc/examples/obicsv/output6.csv.gz | Bin 0 -> 85 bytes autodoc/examples/obiscript/annotate.lua | 9 
+ autodoc/examples/obiscript/enrich.lua | 5 + autodoc/examples/obiscript/enriched.json | 38 + autodoc/examples/obiscript/my_script.lua | 17 + autodoc/examples/obiscript/process_pairs.lua | 4 + autodoc/examples/obisummary/out_json.json | 17 + autodoc/prompt_doc.md | 60 ++ autodoc/prompt_examples.md | 407 +++++++++ autodoc/prompt_full.md | 791 ++++++++++++++++++ autodoc/prompt_hugo.md | 414 +++++++++ autodoc/prompt_v2.md | 230 +++++ entities.json | 10 + obitests/obitools/.DS_Store | Bin 6148 -> 8196 bytes pkg/obilua/luahttp.go | 65 ++ pkg/obilua/obilib.go | 1 + prompt_documentation_globale.md | 415 +++++---- scripts/find_setattribute.go | 222 +++++ scripts/find_setattribute.sh | 36 + setattribute_refs.json | 308 +++++++ x | 19 + 392 files changed, 18875 insertions(+), 141 deletions(-) create mode 100644 autodoc/cmd/obicomplement.md create mode 100644 autodoc/cmd/obiconsensus.md create mode 100644 autodoc/cmd/obiconvert.md create mode 100644 autodoc/cmd/obicount.md create mode 100644 autodoc/cmd/obicsv.md create mode 100644 autodoc/cmd/obidemerge.md create mode 100644 autodoc/cmd/obidistribute.md create mode 100644 autodoc/cmd/obigrep.md create mode 100644 autodoc/cmd/obijoin.md create mode 100644 autodoc/cmd/obimicrosat.md create mode 100644 autodoc/cmd/obiscript.md create mode 100644 autodoc/cmd/obisummary.md create mode 100644 autodoc/cmd/obiuniq.md create mode 100644 autodoc/docmd/pkg.md create mode 100644 autodoc/docmd/pkg/obialign/alignment.md create mode 100644 autodoc/docmd/pkg/obialign/backtracking.md create mode 100644 autodoc/docmd/pkg/obialign/dnamatrix.md create mode 100644 autodoc/docmd/pkg/obialign/fastlcs.md create mode 100644 autodoc/docmd/pkg/obialign/fastlcsegf.md create mode 100644 autodoc/docmd/pkg/obialign/fourbitsencode.md create mode 100644 autodoc/docmd/pkg/obialign/is_d0_or_d1.md create mode 100644 autodoc/docmd/pkg/obialign/locatepattern.md create mode 100644 autodoc/docmd/pkg/obialign/pairedendalign.md create mode 100644 
autodoc/docmd/pkg/obialign/readalign.md create mode 100644 autodoc/docmd/pkg/obiapat/pattern.md create mode 100644 autodoc/docmd/pkg/obiapat/pattern_test.md create mode 100644 autodoc/docmd/pkg/obiapat/pcr.md create mode 100644 autodoc/docmd/pkg/obiapat/predicat.md create mode 100644 autodoc/docmd/pkg/obichunk/chunk.md create mode 100644 autodoc/docmd/pkg/obichunk/chunk_on_disk.md create mode 100644 autodoc/docmd/pkg/obichunk/chunks_on_memory.md create mode 100644 autodoc/docmd/pkg/obichunk/options.md create mode 100644 autodoc/docmd/pkg/obichunk/subchunks.md create mode 100644 autodoc/docmd/pkg/obichunk/unique.md create mode 100644 autodoc/docmd/pkg/obicorazick/worker.md create mode 100644 autodoc/docmd/pkg/obidefault/batch.md create mode 100644 autodoc/docmd/pkg/obidefault/compressed.md create mode 100644 autodoc/docmd/pkg/obidefault/logger.md create mode 100644 autodoc/docmd/pkg/obidefault/progressbar.md create mode 100644 autodoc/docmd/pkg/obidefault/quality.md create mode 100644 autodoc/docmd/pkg/obidefault/taxonomy.md create mode 100644 autodoc/docmd/pkg/obidefault/workers.md create mode 100644 autodoc/docmd/pkg/obidist/dist_matrix.md create mode 100644 autodoc/docmd/pkg/obidist/dist_matrix_test.md create mode 100644 autodoc/docmd/pkg/obiformats/batch_of_files_reader.md create mode 100644 autodoc/docmd/pkg/obiformats/batch_reader_type.md create mode 100644 autodoc/docmd/pkg/obiformats/csv_read.md create mode 100644 autodoc/docmd/pkg/obiformats/csv_writer.md create mode 100644 autodoc/docmd/pkg/obiformats/csviterator.md create mode 100644 autodoc/docmd/pkg/obiformats/csvtaxdump_read.md create mode 100644 autodoc/docmd/pkg/obiformats/dispatcher.md create mode 100644 autodoc/docmd/pkg/obiformats/ecopcr_read.md create mode 100644 autodoc/docmd/pkg/obiformats/embl_read.md create mode 100644 autodoc/docmd/pkg/obiformats/empty_file.md create mode 100644 autodoc/docmd/pkg/obiformats/fastaseq_read.md create mode 100644 autodoc/docmd/pkg/obiformats/fastqseq_read.md 
create mode 100644 autodoc/docmd/pkg/obiformats/fastqseq_write_generic.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_header.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_interface.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_json_header.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_obi_header.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_read.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_write_fasta.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_write_fastq.md create mode 100644 autodoc/docmd/pkg/obiformats/fastseq_write_with_index.md create mode 100644 autodoc/docmd/pkg/obiformats/file_chunk_read.md create mode 100644 autodoc/docmd/pkg/obiformats/file_chunk_write.md create mode 100644 autodoc/docmd/pkg/obiformats/genbank_read.md create mode 100644 autodoc/docmd/pkg/obiformats/json_writer.md create mode 100644 autodoc/docmd/pkg/obiformats/ncbitaxdump_read.md create mode 100644 autodoc/docmd/pkg/obiformats/ncbitaxdump_readtar.md create mode 100644 autodoc/docmd/pkg/obiformats/newick_write.md create mode 100644 autodoc/docmd/pkg/obiformats/ngsfilter_read.md create mode 100644 autodoc/docmd/pkg/obiformats/options.md create mode 100644 autodoc/docmd/pkg/obiformats/rope_scanner.md create mode 100644 autodoc/docmd/pkg/obiformats/taxonomy_read.md create mode 100644 autodoc/docmd/pkg/obiformats/universal_read.md create mode 100644 autodoc/docmd/pkg/obiformats/universal_write.md create mode 100644 autodoc/docmd/pkg/obifp/uint128.md create mode 100644 autodoc/docmd/pkg/obifp/uint128_test.md create mode 100644 autodoc/docmd/pkg/obifp/uint256.md create mode 100644 autodoc/docmd/pkg/obifp/uint64.md create mode 100644 autodoc/docmd/pkg/obifp/unint.md create mode 100644 autodoc/docmd/pkg/obigraph/graph.md create mode 100644 autodoc/docmd/pkg/obigraph/graphbuffer.md create mode 100644 autodoc/docmd/pkg/obiiter/batch.md create mode 100644 autodoc/docmd/pkg/obiiter/batchiterator.md create mode 
100644 autodoc/docmd/pkg/obiiter/distribute.md create mode 100644 autodoc/docmd/pkg/obiiter/extract_taxonomy.md create mode 100644 autodoc/docmd/pkg/obiiter/fragment.md create mode 100644 autodoc/docmd/pkg/obiiter/limitmemory.md create mode 100644 autodoc/docmd/pkg/obiiter/merge.md create mode 100644 autodoc/docmd/pkg/obiiter/numbering.md create mode 100644 autodoc/docmd/pkg/obiiter/paired.md create mode 100644 autodoc/docmd/pkg/obiiter/pipe.md create mode 100644 autodoc/docmd/pkg/obiiter/sequence_workers.md create mode 100644 autodoc/docmd/pkg/obiiter/speed.md create mode 100644 autodoc/docmd/pkg/obiiter/workers.md create mode 100644 autodoc/docmd/pkg/obiitercsv/csv.md create mode 100644 autodoc/docmd/pkg/obikmer/counting.md create mode 100644 autodoc/docmd/pkg/obikmer/debruijn.md create mode 100644 autodoc/docmd/pkg/obikmer/encodefourmer.md create mode 100644 autodoc/docmd/pkg/obikmer/encodekmer.md create mode 100644 autodoc/docmd/pkg/obikmer/encodekmer_test.md create mode 100644 autodoc/docmd/pkg/obikmer/entropy.md create mode 100644 autodoc/docmd/pkg/obikmer/kdi_merge.md create mode 100644 autodoc/docmd/pkg/obikmer/kdi_merge_test.md create mode 100644 autodoc/docmd/pkg/obikmer/kdi_reader.md create mode 100644 autodoc/docmd/pkg/obikmer/kdi_test.md create mode 100644 autodoc/docmd/pkg/obikmer/kdi_writer.md create mode 100644 autodoc/docmd/pkg/obikmer/kdx.md create mode 100644 autodoc/docmd/pkg/obikmer/kmer_match.md create mode 100644 autodoc/docmd/pkg/obikmer/kmer_set_builder.md create mode 100644 autodoc/docmd/pkg/obikmer/kmer_set_builder_test.md create mode 100644 autodoc/docmd/pkg/obikmer/kmer_set_disk.md create mode 100644 autodoc/docmd/pkg/obikmer/kmer_set_disk_ops.md create mode 100644 autodoc/docmd/pkg/obikmer/kmer_set_disk_ops_test.md create mode 100644 autodoc/docmd/pkg/obikmer/kmermap.md create mode 100644 autodoc/docmd/pkg/obikmer/minimizer_utils.md create mode 100644 autodoc/docmd/pkg/obikmer/skm_reader.md create mode 100644 
autodoc/docmd/pkg/obikmer/skm_test.md create mode 100644 autodoc/docmd/pkg/obikmer/skm_writer.md create mode 100644 autodoc/docmd/pkg/obikmer/spectrum.md create mode 100644 autodoc/docmd/pkg/obikmer/superkmer.md create mode 100644 autodoc/docmd/pkg/obikmer/superkmer_iter.md create mode 100644 autodoc/docmd/pkg/obikmer/superkmer_iter_test.md create mode 100644 autodoc/docmd/pkg/obikmer/varint.md create mode 100644 autodoc/docmd/pkg/obikmer/varint_test.md create mode 100644 autodoc/docmd/pkg/obilog/warning.md create mode 100644 autodoc/docmd/pkg/obilua/lua.md create mode 100644 autodoc/docmd/pkg/obilua/lua_obicontext.md create mode 100644 autodoc/docmd/pkg/obilua/lua_push_interface.md create mode 100644 autodoc/docmd/pkg/obilua/lua_table.md create mode 100644 autodoc/docmd/pkg/obilua/mutex.md create mode 100644 autodoc/docmd/pkg/obilua/obilib.md create mode 100644 autodoc/docmd/pkg/obilua/obiseq.md create mode 100644 autodoc/docmd/pkg/obilua/obiseqslice.md create mode 100644 autodoc/docmd/pkg/obilua/obitaxon.md create mode 100644 autodoc/docmd/pkg/obilua/obitaxonomy.md create mode 100644 autodoc/docmd/pkg/obingslibrary/marker.md create mode 100644 autodoc/docmd/pkg/obingslibrary/match.md create mode 100644 autodoc/docmd/pkg/obingslibrary/multimatch.md create mode 100644 autodoc/docmd/pkg/obingslibrary/ngslibrary.md create mode 100644 autodoc/docmd/pkg/obingslibrary/worker.md create mode 100644 autodoc/docmd/pkg/obioptions/options.md create mode 100644 autodoc/docmd/pkg/obioptions/subcommand.md create mode 100644 autodoc/docmd/pkg/obioptions/version.md create mode 100644 autodoc/docmd/pkg/obiphylo/tree.md create mode 100644 autodoc/docmd/pkg/obiseq/attributes.md create mode 100644 autodoc/docmd/pkg/obiseq/biosequence.md create mode 100644 autodoc/docmd/pkg/obiseq/biosequence_test.md create mode 100644 autodoc/docmd/pkg/obiseq/biosequenceslice.md create mode 100644 autodoc/docmd/pkg/obiseq/class.md create mode 100644 autodoc/docmd/pkg/obiseq/compare.md create mode 
100644 autodoc/docmd/pkg/obiseq/eval.md create mode 100644 autodoc/docmd/pkg/obiseq/iupac_nog.md create mode 100644 autodoc/docmd/pkg/obiseq/join.md create mode 100644 autodoc/docmd/pkg/obiseq/kmers.md create mode 100644 autodoc/docmd/pkg/obiseq/language.md create mode 100644 autodoc/docmd/pkg/obiseq/merge.md create mode 100644 autodoc/docmd/pkg/obiseq/paired_reads.md create mode 100644 autodoc/docmd/pkg/obiseq/pool.md create mode 100644 autodoc/docmd/pkg/obiseq/predicate.md create mode 100644 autodoc/docmd/pkg/obiseq/revcomp.md create mode 100644 autodoc/docmd/pkg/obiseq/revcomp_test.md create mode 100644 autodoc/docmd/pkg/obiseq/subseq.md create mode 100644 autodoc/docmd/pkg/obiseq/subseq_test.md create mode 100644 autodoc/docmd/pkg/obiseq/taxonomy_classifier.md create mode 100644 autodoc/docmd/pkg/obiseq/taxonomy_lca.md create mode 100644 autodoc/docmd/pkg/obiseq/taxonomy_methods.md create mode 100644 autodoc/docmd/pkg/obiseq/taxonomy_predicate.md create mode 100644 autodoc/docmd/pkg/obiseq/taxonomy_workers.md create mode 100644 autodoc/docmd/pkg/obiseq/worker.md create mode 100644 autodoc/docmd/pkg/obistats/algo.md create mode 100644 autodoc/docmd/pkg/obistats/beta.md create mode 100644 autodoc/docmd/pkg/obistats/betabinom.md create mode 100644 autodoc/docmd/pkg/obistats/data.md create mode 100644 autodoc/docmd/pkg/obistats/delta.md create mode 100644 autodoc/docmd/pkg/obistats/kmeans.md create mode 100644 autodoc/docmd/pkg/obistats/kolmogorovbeta.md create mode 100644 autodoc/docmd/pkg/obistats/mannwhitney.md create mode 100644 autodoc/docmd/pkg/obistats/mathx.md create mode 100644 autodoc/docmd/pkg/obistats/minmax.md create mode 100644 autodoc/docmd/pkg/obistats/normaldist.md create mode 100644 autodoc/docmd/pkg/obistats/random.md create mode 100644 autodoc/docmd/pkg/obistats/sample.md create mode 100644 autodoc/docmd/pkg/obistats/scaler.md create mode 100644 autodoc/docmd/pkg/obistats/sort.md create mode 100644 autodoc/docmd/pkg/obistats/stats.md create mode 
100644 autodoc/docmd/pkg/obistats/table.md create mode 100644 autodoc/docmd/pkg/obistats/tdist.md create mode 100644 autodoc/docmd/pkg/obistats/ttest.md create mode 100644 autodoc/docmd/pkg/obistats/udist.md create mode 100644 autodoc/docmd/pkg/obistats/utils.md create mode 100644 autodoc/docmd/pkg/obisuffix/suffix_array.md create mode 100644 autodoc/docmd/pkg/obitable/table.md create mode 100644 autodoc/docmd/pkg/obitax/default_taxonomy.md create mode 100644 autodoc/docmd/pkg/obitax/filter_on_name.md create mode 100644 autodoc/docmd/pkg/obitax/filter_on_rank.md create mode 100644 autodoc/docmd/pkg/obitax/filter_on_subclade_of.md create mode 100644 autodoc/docmd/pkg/obitax/inner.md create mode 100644 autodoc/docmd/pkg/obitax/issuubcladeof.md create mode 100644 autodoc/docmd/pkg/obitax/iterator.md create mode 100644 autodoc/docmd/pkg/obitax/lca.md create mode 100644 autodoc/docmd/pkg/obitax/string_parser.md create mode 100644 autodoc/docmd/pkg/obitax/taxid.md create mode 100644 autodoc/docmd/pkg/obitax/taxon.md create mode 100644 autodoc/docmd/pkg/obitax/taxonnode.md create mode 100644 autodoc/docmd/pkg/obitax/taxonomy.md create mode 100644 autodoc/docmd/pkg/obitax/taxonset.md create mode 100644 autodoc/docmd/pkg/obitax/taxonslice.md create mode 100644 autodoc/docmd/pkg/obitools/obiannotate/obiannotate.md create mode 100644 autodoc/docmd/pkg/obitools/obiannotate/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiclean/chimera.md create mode 100644 autodoc/docmd/pkg/obitools/obiclean/graph.md create mode 100644 autodoc/docmd/pkg/obitools/obiclean/obiclean.md create mode 100644 autodoc/docmd/pkg/obitools/obiclean/options.md create mode 100644 autodoc/docmd/pkg/obitools/obicleandb/obicleandb.md create mode 100644 autodoc/docmd/pkg/obitools/obicleandb/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiclust/obiclust.md create mode 100644 autodoc/docmd/pkg/obitools/obiclust/options.md create mode 100644 
autodoc/docmd/pkg/obitools/obiconsensus/obiconsensus.md create mode 100644 autodoc/docmd/pkg/obitools/obiconsensus/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiconvert/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiconvert/sequence_reader.md create mode 100644 autodoc/docmd/pkg/obitools/obiconvert/sequence_writer.md create mode 100644 autodoc/docmd/pkg/obitools/obicount/options.md create mode 100644 autodoc/docmd/pkg/obitools/obicsv/csvoption.md create mode 100644 autodoc/docmd/pkg/obitools/obicsv/obicsv.md create mode 100644 autodoc/docmd/pkg/obitools/obicsv/options.md create mode 100644 autodoc/docmd/pkg/obitools/obicsv/sequence.md create mode 100644 autodoc/docmd/pkg/obitools/obicsv/writer.md create mode 100644 autodoc/docmd/pkg/obitools/obidemerge/demerge.md create mode 100644 autodoc/docmd/pkg/obitools/obidemerge/options.md create mode 100644 autodoc/docmd/pkg/obitools/obidistribute/distribute.md create mode 100644 autodoc/docmd/pkg/obitools/obidistribute/options.md create mode 100644 autodoc/docmd/pkg/obitools/obigrep/grep.md create mode 100644 autodoc/docmd/pkg/obitools/obigrep/options.md create mode 100644 autodoc/docmd/pkg/obitools/obijoin/join.md create mode 100644 autodoc/docmd/pkg/obitools/obijoin/options.md create mode 100644 autodoc/docmd/pkg/obitools/obik/cp.md create mode 100644 autodoc/docmd/pkg/obitools/obik/filter.md create mode 100644 autodoc/docmd/pkg/obitools/obik/index.md create mode 100644 autodoc/docmd/pkg/obitools/obik/lowmask.md create mode 100644 autodoc/docmd/pkg/obitools/obik/ls.md create mode 100644 autodoc/docmd/pkg/obitools/obik/match.md create mode 100644 autodoc/docmd/pkg/obitools/obik/mv.md create mode 100644 autodoc/docmd/pkg/obitools/obik/obik.md create mode 100644 autodoc/docmd/pkg/obitools/obik/options.md create mode 100644 autodoc/docmd/pkg/obitools/obik/rm.md create mode 100644 autodoc/docmd/pkg/obitools/obik/spectrum.md create mode 100644 autodoc/docmd/pkg/obitools/obik/summary.md create mode 
100644 autodoc/docmd/pkg/obitools/obik/super.md create mode 100644 autodoc/docmd/pkg/obitools/obikmersim/obikmersim.md create mode 100644 autodoc/docmd/pkg/obitools/obikmersim/options.md create mode 100644 autodoc/docmd/pkg/obitools/obilandmark/obilandmark.md create mode 100644 autodoc/docmd/pkg/obitools/obilandmark/options.md create mode 100644 autodoc/docmd/pkg/obitools/obilandmark/taxostat.md create mode 100644 autodoc/docmd/pkg/obitools/obimatrix/obimatrix.md create mode 100644 autodoc/docmd/pkg/obitools/obimatrix/options.md create mode 100644 autodoc/docmd/pkg/obitools/obimicrosat/microsat.md create mode 100644 autodoc/docmd/pkg/obitools/obimicrosat/options.md create mode 100644 autodoc/docmd/pkg/obitools/obimultiplex/demultiplex.md create mode 100644 autodoc/docmd/pkg/obitools/obimultiplex/options.md create mode 100644 autodoc/docmd/pkg/obitools/obipairing/options.md create mode 100644 autodoc/docmd/pkg/obitools/obipairing/pairing.md create mode 100644 autodoc/docmd/pkg/obitools/obipcr/options.md create mode 100644 autodoc/docmd/pkg/obitools/obipcr/pcr.md create mode 100644 autodoc/docmd/pkg/obitools/obirefidx/famlilyindexing.md create mode 100644 autodoc/docmd/pkg/obitools/obirefidx/geomindexing.md create mode 100644 autodoc/docmd/pkg/obitools/obirefidx/obirefidx.md create mode 100644 autodoc/docmd/pkg/obitools/obirefidx/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiscript/obiscript.md create mode 100644 autodoc/docmd/pkg/obitools/obiscript/options.md create mode 100644 autodoc/docmd/pkg/obitools/obisplit/obisplit.md create mode 100644 autodoc/docmd/pkg/obitools/obisplit/options.md create mode 100644 autodoc/docmd/pkg/obitools/obisummary/obisummary.md create mode 100644 autodoc/docmd/pkg/obitools/obisummary/options.md create mode 100644 autodoc/docmd/pkg/obitools/obitag/obigeomtag.md create mode 100644 autodoc/docmd/pkg/obitools/obitag/obitag.md create mode 100644 autodoc/docmd/pkg/obitools/obitag/options.md create mode 100644 
autodoc/docmd/pkg/obitools/obitagpcr/options.md create mode 100644 autodoc/docmd/pkg/obitools/obitagpcr/pcrtag.md create mode 100644 autodoc/docmd/pkg/obitools/obitaxonomy/obitaxonomy.md create mode 100644 autodoc/docmd/pkg/obitools/obitaxonomy/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiuniq/options.md create mode 100644 autodoc/docmd/pkg/obitools/obiuniq/unique.md create mode 100644 autodoc/docmd/pkg/obiutils/abs.md create mode 100644 autodoc/docmd/pkg/obiutils/abs_test.md create mode 100644 autodoc/docmd/pkg/obiutils/array.md create mode 100644 autodoc/docmd/pkg/obiutils/array_test.md create mode 100644 autodoc/docmd/pkg/obiutils/bytes.md create mode 100644 autodoc/docmd/pkg/obiutils/bytes_test.md create mode 100644 autodoc/docmd/pkg/obiutils/cast_interface.md create mode 100644 autodoc/docmd/pkg/obiutils/counter.md create mode 100644 autodoc/docmd/pkg/obiutils/download.md create mode 100644 autodoc/docmd/pkg/obiutils/goutils.md create mode 100644 autodoc/docmd/pkg/obiutils/gzipfile.md create mode 100644 autodoc/docmd/pkg/obiutils/memsize.md create mode 100644 autodoc/docmd/pkg/obiutils/mimetypes.md create mode 100644 autodoc/docmd/pkg/obiutils/minmax.md create mode 100644 autodoc/docmd/pkg/obiutils/minmultiset.md create mode 100644 autodoc/docmd/pkg/obiutils/path.md create mode 100644 autodoc/docmd/pkg/obiutils/path_test.md create mode 100644 autodoc/docmd/pkg/obiutils/pipe.md create mode 100644 autodoc/docmd/pkg/obiutils/ranks.md create mode 100644 autodoc/docmd/pkg/obiutils/set.md create mode 100644 autodoc/docmd/pkg/obiutils/set_test.md create mode 100644 autodoc/docmd/pkg/obiutils/slices.md create mode 100644 autodoc/docmd/pkg/obiutils/strings.md create mode 100644 autodoc/docmd/pkg/obiutils/tar.md create mode 100644 autodoc/docmd/pkg/obiutils/unsafe.md create mode 100644 autodoc/docmd/pkg/obiutils/xopen.md create mode 100644 autodoc/docmd/pkg/obiutils/xopen_test.md create mode 100644 autodoc/docmd/pkg_obialign.md create mode 100644 
autodoc/docmd/pkg_obiapat.md create mode 100644 autodoc/docmd/pkg_obichunk.md create mode 100644 autodoc/docmd/pkg_obicorazick.md create mode 100644 autodoc/docmd/pkg_obidefault.md create mode 100644 autodoc/docmd/pkg_obidist.md create mode 100644 autodoc/docmd/pkg_obiformats.md create mode 100644 autodoc/docmd/pkg_obifp.md create mode 100644 autodoc/docmd/pkg_obigraph.md create mode 100644 autodoc/docmd/pkg_obiiter.md create mode 100644 autodoc/docmd/pkg_obiitercsv.md create mode 100644 autodoc/docmd/pkg_obikmer.md create mode 100644 autodoc/docmd/pkg_obilog.md create mode 100644 autodoc/docmd/pkg_obilua.md create mode 100644 autodoc/docmd/pkg_obingslibrary.md create mode 100644 autodoc/docmd/pkg_obioptions.md create mode 100644 autodoc/docmd/pkg_obiphylo.md create mode 100644 autodoc/docmd/pkg_obiseq.md create mode 100644 autodoc/docmd/pkg_obistats.md create mode 100644 autodoc/docmd/pkg_obisuffix.md create mode 100644 autodoc/docmd/pkg_obitable.md create mode 100644 autodoc/docmd/pkg_obitax.md create mode 100644 autodoc/docmd/pkg_obitools_obiannotate.md create mode 100644 autodoc/docmd/pkg_obitools_obiclean.md create mode 100644 autodoc/docmd/pkg_obitools_obicleandb.md create mode 100644 autodoc/docmd/pkg_obitools_obiclust.md create mode 100644 autodoc/docmd/pkg_obitools_obiconsensus.md create mode 100644 autodoc/docmd/pkg_obitools_obiconvert.md create mode 100644 autodoc/docmd/pkg_obitools_obicount.md create mode 100644 autodoc/docmd/pkg_obitools_obicsv.md create mode 100644 autodoc/docmd/pkg_obitools_obidemerge.md create mode 100644 autodoc/docmd/pkg_obitools_obidistribute.md create mode 100644 autodoc/docmd/pkg_obitools_obigrep.md create mode 100644 autodoc/docmd/pkg_obitools_obijoin.md create mode 100644 autodoc/docmd/pkg_obitools_obik.md create mode 100644 autodoc/docmd/pkg_obitools_obikmersim.md create mode 100644 autodoc/docmd/pkg_obitools_obilandmark.md create mode 100644 autodoc/docmd/pkg_obitools_obimatrix.md create mode 100644 
autodoc/docmd/pkg_obitools_obimicrosat.md create mode 100644 autodoc/docmd/pkg_obitools_obimultiplex.md create mode 100644 autodoc/docmd/pkg_obitools_obipairing.md create mode 100644 autodoc/docmd/pkg_obitools_obipcr.md create mode 100644 autodoc/docmd/pkg_obitools_obirefidx.md create mode 100644 autodoc/docmd/pkg_obitools_obiscript.md create mode 100644 autodoc/docmd/pkg_obitools_obisplit.md create mode 100644 autodoc/docmd/pkg_obitools_obisummary.md create mode 100644 autodoc/docmd/pkg_obitools_obitag.md create mode 100644 autodoc/docmd/pkg_obitools_obitagpcr.md create mode 100644 autodoc/docmd/pkg_obitools_obitaxonomy.md create mode 100644 autodoc/docmd/pkg_obitools_obiuniq.md create mode 100644 autodoc/docmd/pkg_obiutils.md create mode 100644 autodoc/examples/obiconvert/output.json create mode 100644 autodoc/examples/obicount/out_default.txt create mode 100644 autodoc/examples/obicount/out_fastq_reads.txt create mode 100644 autodoc/examples/obicount/out_symbols.txt create mode 100644 autodoc/examples/obicount/out_variants.txt create mode 100644 autodoc/examples/obicsv/output6.csv.gz create mode 100644 autodoc/examples/obiscript/annotate.lua create mode 100644 autodoc/examples/obiscript/enrich.lua create mode 100644 autodoc/examples/obiscript/enriched.json create mode 100644 autodoc/examples/obiscript/my_script.lua create mode 100644 autodoc/examples/obiscript/process_pairs.lua create mode 100644 autodoc/examples/obisummary/out_json.json create mode 100644 autodoc/prompt_doc.md create mode 100644 autodoc/prompt_examples.md create mode 100644 autodoc/prompt_full.md create mode 100644 autodoc/prompt_hugo.md create mode 100644 autodoc/prompt_v2.md create mode 100644 entities.json create mode 100644 pkg/obilua/luahttp.go create mode 100644 scripts/find_setattribute.go create mode 100755 scripts/find_setattribute.sh create mode 100644 setattribute_refs.json create mode 100644 x diff --git a/.gitignore b/.gitignore index 2b0487c..4404995 100644 --- a/.gitignore +++ 
b/.gitignore @@ -23,7 +23,7 @@ xx /.vscode /build /bugs - +autodoc /ncbitaxo !/obitests/** diff --git a/autodoc/cmd/obicomplement.md b/autodoc/cmd/obicomplement.md new file mode 100644 index 0000000..1df7f6c --- /dev/null +++ b/autodoc/cmd/obicomplement.md @@ -0,0 +1,300 @@ +# NAME + +obicomplement — reverse complement of sequences + +--- + +# SYNOPSIS + +``` +obicomplement [--batch-mem ] [--batch-size ] + [--batch-size-max ] [--compress|-Z] [--csv] [--debug] + [--ecopcr] [--embl] [--fail-on-taxonomy] [--fasta] + [--fasta-output] [--fastq] [--fastq-output] [--genbank] + [--help|-h|-?] [--input-OBI-header] [--input-json-header] + [--json-output] [--max-cpu ] [--no-order] + [--no-progressbar] [--out|-o ] + [--output-OBI-header|-O] [--output-json-header] + [--paired-with ] [--raw-taxid] [--silent-warning] + [--skip-empty] [--solexa] [--taxonomy|-t ] [--u-to-t] + [--update-taxid] [--with-leaves] [] +``` + +--- + +# DESCRIPTION + +`obicomplement` computes the reverse complement of every sequence in the +input. For each input sequence, the nucleotides are first reversed, then +each base is replaced by its Watson–Crick complement (A↔T, C↔G), yielding +the strand that would pair with the original sequence read in the opposite +direction. + +When quality scores are present (FASTQ data), they are reversed in the same +order as the sequence so that each quality value remains associated with its +corresponding base. Ambiguous IUPAC characters (e.g. `N`, `R`, `Y`) are +handled correctly and preserved in the output. + +This operation is commonly needed when sequences have been sequenced on the +wrong strand, when a primer is designed on the reverse strand, or when +preparing sequences for strand-aware downstream analyses. + +The command reads from standard input or from one or more files, processes +sequences in parallel, and writes the result to standard output or to the +file specified with `--out`. 
+ +--- + +# INPUT + +`obicomplement` accepts biological sequence data in FASTA, FASTQ, EMBL, +GenBank, ecoPCR output, and CSV formats. When no format flag is given, the +format is inferred automatically from the file contents or extension. + +Input is read from standard input when no filename argument is provided, or +from one or more files passed as positional arguments. Gzip-compressed files +are handled transparently. + +Paired-end data can be provided with `--paired-with`, which specifies the +file containing the second mate. Both mates are reverse-complemented and +written to separate output files. + +--- + +# OUTPUT + +The output is a sequence file in which every sequence is the reverse +complement of the corresponding input sequence. The output format matches +the input by default (FASTA if no quality data, FASTQ if quality data are +present), and can be overridden with `--fasta-output`, `--fastq-output`, or +`--json-output`. + +All annotations (attributes stored in the sequence header) are preserved +unchanged. Quality scores, when present, are reversed to stay aligned with +their bases. + +## Observed output example + +``` +>seq001 {"definition":"basic DNA sequence"} +cgatcgatcgatcgatcgat +>seq002 {"definition":"GC-rich sequence"} +gcgcgcgcgcgcgcgcgcgc +>seq003 {"definition":"AT-rich sequence"} +atatatatatatatatatat +>seq004 {"definition":"palindromic sequence"} +aattccggaattccggaatt +>seq005 {"definition":"mixed sequence"} +agctagcatgcatagccgat +``` + +--- + +# OPTIONS + +## Input format + +**`--fasta`** +: Default: false. Force parsing of input as FASTA format. + +**`--fastq`** +: Default: false. Force parsing of input as FASTQ format. + +**`--embl`** +: Default: false. Force parsing of input as EMBL flatfile format. + +**`--genbank`** +: Default: false. Force parsing of input as GenBank flatfile format. + +**`--ecopcr`** +: Default: false. Force parsing of input as ecoPCR output format. + +**`--csv`** +: Default: false. 
Force parsing of input as CSV format. + +**`--solexa`** +: Default: false. Decode quality scores using the Solexa/Illumina pre-1.3 + convention instead of the standard Phred+33 encoding. + +**`--input-OBI-header`** +: Default: false. Interpret FASTA/FASTQ header annotations using the OBI + key=value format. + +**`--input-json-header`** +: Default: false. Interpret FASTA/FASTQ header annotations using JSON + format. + +**`--no-order`** +: Default: false. When several input files are given, declare that no + ordering relationship exists among them, allowing the reader to interleave + records freely. + +**`--paired-with `** +: Default: none. File containing the paired (R2) reads. When set, + `obicomplement` processes both mates and writes them to separate output + files. + +## Sequence preprocessing + +**`--u-to-t`** +: Default: false. Convert Uracil (U) to Thymine (T) before computing the + reverse complement. Useful when processing RNA sequences that must be + treated as DNA. + +**`--skip-empty`** +: Default: false. Discard sequences of length zero from the output. + +## Output format + +**`--fasta-output`** +: Default: false. Write output in FASTA format regardless of whether quality + scores are present. + +**`--fastq-output`** +: Default: false. Write output in FASTQ format (requires quality data). + +**`--json-output`** +: Default: false. Write output in JSON format. + +**`--out|-o `** +: Default: `-` (standard output). File used to save the output. + +**`--output-OBI-header|-O`** +: Default: false. Write FASTA/FASTQ header annotations in OBI key=value + format. + +**`--output-json-header`** +: Default: false. Write FASTA/FASTQ header annotations in JSON format. + +**`--compress|-Z`** +: Default: false. Compress the output with gzip. + +## Taxonomy + +**`--taxonomy|-t `** +: Default: none. Path to a taxonomy database. Required only when the input + sequences carry taxid annotations that need to be validated or updated. 
+ +**`--fail-on-taxonomy`** +: Default: false. Cause `obicomplement` to exit with an error if a taxid + referenced in the data is not a currently valid node in the loaded + taxonomy. + +**`--update-taxid`** +: Default: false. Automatically replace taxids that have been declared + merged into a newer node by the taxonomy database. + +**`--raw-taxid`** +: Default: false. Print taxids without appending the taxon name and rank. + +**`--with-leaves`** +: Default: false. When the taxonomy is extracted from the sequence file, + attach sequences as leaves of their taxid node. + +## Performance and diagnostics + +**`--max-cpu `** +: Default: 16 (env: `OBIMAXCPU`). Number of parallel threads used to + process sequences. + +**`--batch-size `** +: Default: 1 (env: `OBIBATCHSIZE`). Minimum number of sequences per + processing batch. + +**`--batch-size-max `** +: Default: 2000 (env: `OBIBATCHSIZEMAX`). Maximum number of sequences per + processing batch. + +**`--batch-mem `** +: Default: `128M` (env: `OBIBATCHMEM`). Maximum memory allocated per batch + (e.g. `128K`, `64M`, `1G`). Set to `0` to disable the memory limit. + +**`--no-progressbar`** +: Default: false. Disable the progress bar printed to stderr. + +**`--silent-warning`** +: Default: false (env: `OBIWARNING`). Suppress warning messages. + +**`--debug`** +: Default: false (env: `OBIDEBUG`). Enable debug logging. + +--- + +# EXAMPLES + +```bash +# Reverse complement all sequences in a FASTA file +obicomplement sequences.fasta > out_default.fasta +``` + +**Expected output:** 5 sequences written to `out_default.fasta`. + +```bash +# Reverse complement a FASTQ file, preserving quality scores +obicomplement reads.fastq --fastq-output --out out_fastq.fastq +``` + +**Expected output:** 5 sequences written to `out_fastq.fastq`. 
+ +```bash +# Convert RNA sequences to their reverse complement DNA strand +obicomplement --u-to-t rna_sequences.fasta > out_rna_rc.fasta +``` + +**Expected output:** 3 sequences written to `out_rna_rc.fasta`. + +```bash +# Reverse complement paired-end reads into two separate output files +obicomplement R1.fastq --paired-with R2.fastq --out out_paired.fastq +``` + +**Expected output:** 3 sequences written to `out_paired_R1.fastq` and 3 sequences to `out_paired_R2.fastq`. + +```bash +# Reverse complement and compress output, skipping any empty sequences +obicomplement --skip-empty --compress sequences.fasta --out out_compressed.fasta.gz +``` + +**Expected output:** 5 sequences written to `out_compressed.fasta.gz` (gzip-compressed FASTA). + +```bash +# Reverse complement with OBI-format header output +obicomplement --output-OBI-header sequences.fasta --out out_obi.fasta +``` + +**Expected output:** 5 sequences written to `out_obi.fasta`. + +```bash +# Reverse complement with explicit JSON-format header output +obicomplement --output-json-header sequences.fasta --out out_jsonheader.fasta +``` + +**Expected output:** 5 sequences written to `out_jsonheader.fasta`. + +```bash +# Reverse complement and write full JSON output format +obicomplement --json-output sequences.fasta --out out_json.json +``` + +**Expected output:** 5 sequences written to `out_json.json`. + +--- + +# SEE ALSO + +- `obiconvert` — format conversion and sequence filtering pipeline +- `obipairing` — paired-end read merging (uses reverse complement internally) +- `obigrep` — sequence filtering and selection + +--- + +# NOTES + +Quality scores (Phred-scaled) are reversed in lock-step with the sequence +so that positional quality information remains valid after the reverse +complement operation. This is essential for downstream tools that rely on +per-base quality for alignment or variant calling. 
+ +Ambiguous IUPAC characters and gap symbols (`-`) are handled gracefully: +standard ambiguous bases are complemented according to IUPAC rules, while +gap and missing-data symbols are preserved unchanged. diff --git a/autodoc/cmd/obiconsensus.md b/autodoc/cmd/obiconsensus.md new file mode 100644 index 0000000..c73a7bc --- /dev/null +++ b/autodoc/cmd/obiconsensus.md @@ -0,0 +1,188 @@ +# obiconsensus(1) — OBITools4 Manual + +## NAME + +`obiconsensus` — denoise Oxford Nanopore Technology (ONT) reads by building consensus sequences + +## SYNOPSIS + +``` +obiconsensus [OPTIONS] [FILE...] +``` + +## DESCRIPTION + +`obiconsensus` is designed to correct sequencing errors in long reads produced by Oxford Nanopore Technology (ONT) sequencers. Because ONT reads have a relatively high error rate compared to short-read technologies, sequences originating from the same biological molecule can differ slightly from one another. `obiconsensus` groups these related reads and builds a single, more reliable consensus sequence for each group. + +The tool works by constructing a *difference graph*: each unique read is represented as a node, and two nodes are connected if their sequences differ by at most a small number of positions (controlled by `--distance`). Within each sample, clusters of closely related reads are identified, and a consensus is assembled from the cluster members using a *de Bruijn graph* approach. The result is a set of high-quality representative sequences, one per cluster. + +Two denoising strategies are available: + +- **Standard mode** (default): identifies hub nodes (likely true sequences) in the difference graph and builds a consensus from each hub and its immediate neighbours. +- **Clustering mode** (`--cluster`): groups reads around local abundance maxima and builds a consensus from each neighbourhood. + +Sequences are read from one or more files, or from standard input when no file is given. 
Results are written to standard output or to a file specified with `--out`. + +The tool processes data on a per-sample basis. Sample identity is taken from a sequence annotation attribute (default: `sample`). Each sample's reads are denoised independently. + +## INPUT FORMATS + +`obiconsensus` recognises the following input formats automatically. A specific format can be forced with the corresponding flag: + +| Flag | Format | +|------|--------| +| `--fasta` | FASTA | +| `--fastq` | FASTQ | +| `--embl` | EMBL flat file | +| `--genbank` | GenBank flat file | +| `--ecopcr` | ecoPCR output | +| `--csv` | CSV tabular format | + +Header annotation styles can be selected with `--input-OBI-header` (OBITools format) or `--input-json-header` (JSON format). + +## OUTPUT FORMATS + +By default, the output format matches the input format (FASTQ when quality scores are present, FASTA otherwise). The format can be forced: + +- `--fasta-output` — write FASTA +- `--fastq-output` — write FASTQ +- `--json-output` — write JSON +- `--output-OBI-header` / `-O` — annotate FASTA/FASTQ title lines in OBITools format +- `--output-json-header` — annotate FASTA/FASTQ title lines in JSON format +- `--compress` / `-Z` — compress output with gzip + +Use `--out FILE` / `-o FILE` to write results to a file instead of standard output. + +## DENOISING OPTIONS + +`--distance INT`, `-d INT` +: Maximum number of differences allowed between two reads for them to be considered related and placed in the same cluster. Default: 1. A value of 1 means reads differing by a single nucleotide substitution are grouped together. + +`--cluster`, `-C` +: Switch to clustering mode. Instead of identifying hub sequences, reads are grouped around local abundance maxima. This mode may produce fewer but more representative consensus sequences. + +`--kmer-size SIZE` +: Size of the short words (k-mers) used when building the de Bruijn graph for consensus assembly. 
The default value of `-1` means the size is estimated automatically from the data. Manual adjustment is rarely needed. + +`--no-singleton` +: Discard any read (or cluster) that occurs only once across the dataset. Singleton sequences are often the result of sequencing errors and carry little biological signal. + +`--low-coverage FLOAT` +: Discard any sample whose sequence coverage falls below this threshold. Default: 0 (no filtering). Useful for removing poorly sequenced samples. + +`--sample ATTRIBUTE`, `-s ATTRIBUTE` +: Name of the sequence annotation attribute that identifies the sample of origin. Default: `sample`. Each unique value of this attribute is treated as an independent sample during denoising. + +## OUTPUT ANNOTATION OPTIONS + +`--unique`, `-U` +: After denoising, dereplicate the output sequences (equivalent to running `obiuniq`). Identical consensus sequences across samples are merged into a single record carrying abundance information. + +`--save-graph DIRECTORY` +: Save the difference graphs built during denoising to the specified directory. Each graph is written in GraphML format, one file per sample. Useful for inspecting the clustering structure. + +`--save-ratio FILE` +: Save a table of abundance ratios on graph edges to the specified CSV file. Each row describes the relative abundance of a read compared to its neighbours. Useful for quality control and parameter tuning. + +## PERFORMANCE OPTIONS + +`--max-cpu INT` +: Number of parallel threads to use for computation. Default: all available processors (up to 16). Reducing this value limits memory and CPU usage. + +`--batch-size INT` +: Minimum number of sequences processed together in a single batch. Default: 1. + +`--batch-size-max INT` +: Maximum number of sequences in a single batch. Default: 2000. + +`--batch-mem STRING` +: Maximum memory allocated per batch (e.g., `128M`, `1G`). Default: `128M`. Set to `0` to disable the memory limit. + +`--no-progressbar` +: Disable the progress bar. 
+ +`--no-order` +: When reading from multiple files, indicate that there is no meaningful order among them. This can improve performance for large multi-file inputs. + +## OTHER OPTIONS + +`--u-to-t` +: Convert uracil (U) to thymine (T) in all input sequences. Use this option when working with RNA data stored in a DNA context. + +`--skip-empty` +: Remove sequences of length zero from the output. + +`--solexa` +: Interpret quality scores using the Solexa encoding rather than the standard Phred encoding. + +`--silent-warning` +: Suppress warning messages. + +`--debug` +: Enable detailed logging for troubleshooting. + +`--version` +: Print the version number and exit. + +`--help`, `-h` +: Display a brief help message and exit. + +## OUTPUT ATTRIBUTES + +Each output consensus sequence carries several annotation attributes describing how it was built: + +| Attribute | Description | +|-----------|-------------| +| `consensus` | Boolean flag: `true` if the sequence is a true consensus, `false` if it was kept unchanged (e.g., isolated singleton) | +| `merged_sample` | Map of sample names to read counts contributing to this consensus | +| `count` | Total number of reads merged into this consensus across all samples | +| `kmer_size` | Size of the k-mers used to build the de Bruijn graph for this consensus | +| `seq_length` | Length of the consensus sequence | + +## EXAMPLES + +**Basic denoising of a FASTQ file:** + +```sh +obiconsensus reads.fastq > denoised.fastq +``` + +**Increase the allowed distance between reads to 2:** + +```sh +obiconsensus --distance 2 reads.fastq > denoised.fastq +``` + +**Use clustering mode and remove singletons:** + +```sh +obiconsensus --cluster --no-singleton reads.fastq > denoised.fastq +``` + +**Denoise, then dereplicate the output:** + +```sh +obiconsensus --unique reads.fastq > denoised_uniq.fastq +``` + +**Save denoising graphs for inspection:** + +```sh +obiconsensus --save-graph ./graphs reads.fastq > denoised.fastq +``` + +**Specify the 
sample annotation attribute:** + +```sh +obiconsensus --sample library reads.fastq > denoised.fastq +``` + +## SEE ALSO + +`obiuniq`(1), `obiclean`(1), `obigrep`(1), `obiconvert`(1) + +## NOTES + +`obiconsensus` was designed primarily for Oxford Nanopore Technology amplicon data, where individual reads of the same molecule may carry different sequencing errors. For short-read Illumina data, `obiclean` may be more appropriate. + +The automatic k-mer size selection (`--kmer-size -1`) works well in most cases. If the consensus assembly fails for a group (e.g., due to circular structures in the de Bruijn graph), the k-mer size is progressively increased until the assembly succeeds or a fallback strategy is used. diff --git a/autodoc/cmd/obiconvert.md b/autodoc/cmd/obiconvert.md new file mode 100644 index 0000000..457c9bf --- /dev/null +++ b/autodoc/cmd/obiconvert.md @@ -0,0 +1,179 @@ +# NAME + +obiconvert — conversion of sequence files to various formats + +--- + +# SYNOPSIS + +``` +obiconvert [--batch-mem ] [--batch-size ] + [--batch-size-max ] [--compress|-Z] [--csv] [--debug] + [--ecopcr] [--embl] [--fail-on-taxonomy] [--fasta] + [--fasta-output] [--fastq] [--fastq-output] [--genbank] + [--help|-h|-?] [--input-OBI-header] [--input-json-header] + [--json-output] [--max-cpu ] [--no-order] [--no-progressbar] + [--out|-o ] [--output-OBI-header|-O] + [--output-json-header] [--paired-with ] [--pprof] + [--pprof-goroutine ] [--pprof-mutex ] [--raw-taxid] + [--silent-warning] [--skip-empty] [--solexa] + [--taxonomy|-t ] [--u-to-t] [--update-taxid] [--version] + [--with-leaves] [] +``` + +--- + +# DESCRIPTION + +obiconvert is a versatile command-line tool that converts biological sequence data between multiple standard bioinformatics formats. It enables biologists to process large datasets by reading from one format and writing to another, with support for quality scores, taxonomic annotations, and various input/output combinations. 
The tool is optimized for high-performance processing with configurable batching, parallel execution, and memory management. + +Biologists use obiconvert to standardize sequence data for compatibility with different bioinformatics tools, extract quality information from FASTQ files into more readable formats, or convert between FASTA and FASTQ when working with DNA/RNA sequences that have associated quality data. The tool automatically detects input formats and intelligently selects output formats based on data presence (e.g., FASTQ when quality scores exist, FASTA otherwise). To force a specific output format regardless of input content, use the explicit output flags (`--fasta-output`, `--fastq-output`, `--json-output`). + +--- + +# INPUT + +obiconvert accepts input in multiple biological sequence formats: + +- **FASTA**: Standard text-based format with `>` headers and sequence data +- **FASTQ**: Text-based format with per-base quality scores (default when both sequence and quality data present) +- **GenBank**: Comprehensive biological record format with annotations +- **EMBL**: EMBL flatfile format for sequence and feature information +- **ecoPCR**: Specialized output format from ecoPCR amplification tools +- **CSV**: Tabular sequence data with configurable delimiters + +Input is provided as positional arguments (file paths or `-` for stdin). The tool automatically detects the input format based on file content and can handle multiple files in sequence. When paired-end sequencing is used, the `--paired-with` option specifies the mate read file. 
+ +--- + +# OUTPUT + +obiconvert produces sequence data in several output formats depending on input content and user selection: + +- **FASTA**: Text format with sequence only (use `--fasta-output` to force) +- **FASTQ**: Format including quality scores (default when quality data present; use `--fastq-output` to force) +- **JSON**: Structured output with all sequence metadata and attributes (use `--json-output`) + +The tool preserves all sequence annotations (taxonomic information, custom attributes) during conversion. When converting to FASTA/FASTQ formats, title line annotations can be formatted as OBI structured data or JSON using the `--output-OBI-header`/`--output-json-header` flags. Sequences of zero length can be suppressed with `--skip-empty`. + +## Observed output example + +``` +>seq001 {"definition":"DNA sequence with quality scores for FASTQ to FASTA conversion"} +atcgatcgatcgatcgatcgatcgatcgatcgatcgatcg +>seq002 {"definition":"Second sequence with moderate quality scores"} +gctagctagctagctagctagctagctagctagctagct +>seq003 {"definition":"Third sequence with high quality scores"} +ttaaccggttaaccggttaaccggttaaccggttaaccg +>seq004 {"definition":"Fourth sequence with variable quality scores"} +acgtacgtacgtacgtacgtacgtacgtacgtacgtacg +``` + +--- + +# OPTIONS + +## Input Format Options +- **--fasta**: Read data following the fasta format. (default: false) +- **--fastq**: Read data following the fastq format. (default: false) +- **--genbank**: Read data following the Genbank flatfile format. (default: false) +- **--embl**: Read data following the EMBL flatfile format. (default: false) +- **--ecopcr**: Read data following the ecoPCR output format. (default: false) +- **--csv**: Read data following the CSV format. (default: false) + +## Input Header Options +- **--input-OBI-header**: FASTA/FASTQ title line annotations follow OBI format. (default: false) +- **--input-json-header**: FASTA/FASTQ title line annotations follow json format. 
(default: false) + +## Output Format Options +- **--fasta-output**: Write sequence in fasta format (default if no quality data available). (default: false) +- **--fastq-output**: Write sequence in fastq format (default if quality data available). (default: false) +- **--json-output**: Write sequence in json format. (default: false) + +## Output Header Options +- **--output-OBI-header|-O**: output FASTA/FASTQ title line annotations follow OBI format. (default: false) +- **--output-json-header**: output FASTA/FASTQ title line annotations follow json format. (default: false) + +## Processing Options +- **--skip-empty**: Sequences of length equal to zero are suppressed from the output (default: false) +- **--no-order**: When several input files are provided, indicates that there is no order among them. (default: false) +- **--u-to-t**: Convert Uracil to Thymine. (default: false) +- **--update-taxid**: Make obitools automatically updating the taxid that are declared merged to a newest one. (default: false) +- **--raw-taxid**: When set, taxids are printed in files with any supplementary information (taxon name and rank) (default: false) +- **--fail-on-taxonomy**: Make obitools failing on error if a used taxid is not a currently valid one (default: false) +- **--with-leaves**: If taxonomy is extracted from a sequence file, sequences are added as leave of their taxid annotation (default: false) + +## File Options +- **--out|-o **: Filename used for saving the output (default: "-") +- **--paired-with **: Filename containing the paired reads (default: "") + +## Performance Options +- **--batch-mem **: Maximum memory per batch (e.g. 128K, 64M, 1G; default: 128M). Set to 0 to disable. 
(default: "", env: OBIBATCHMEM) +- **--batch-size **: Minimum number of sequences per batch (floor, default 1) (default: 1, env: OBIBATCHSIZE) +- **--batch-size-max **: Maximum number of sequences per batch (ceiling, default 2000) (default: 2000, env: OBIBATCHSIZEMAX) +- **--max-cpu **: Number of parallel threads computing the result (default: 16, env: OBIMAXCPU) +- **--compress|-Z**: Compress all the results using gzip (default: false) + +## Debug Options +- **--debug**: Enable debug mode, by setting log level to debug. (default: false, env: OBIDEBUG) +- **--silent-warning**: Stop printing of the warning message (default: false, env: OBIWARNING) +- **--no-progressbar**: Disable the progress bar printing (default: false) + +## Profiling Options +- **--pprof**: Enable pprof server. Look at the log for details. (default: false) +- **--pprof-goroutine **: Enable profiling of goroutine blocking profile. (default: 6060, env: OBIPPROFGOROUTINE) +- **--pprof-mutex **: Enable profiling of mutex lock. (default: 10, env: OBIPPROFMUTEX) + +## Utility Options +- **--taxonomy|-t **: Path to the taxonomy database. (default: "") +- **--solexa**: Decodes quality string according to the Solexa specification. (default: false, env: OBISOLEXA) +- **--help|-h|-?**: Show help message (default: false) +- **--version**: Prints the version and exits. (default: false) + +--- + +# EXAMPLES + +## Convert FASTQ to FASTA +```bash +# Convert quality-score data from FASTQ to readable FASTA format +obiconvert --fastq --fasta-output input.fastq -o output.fasta +``` + +**Expected output:** 4 sequences written to `output.fasta`. + +## Convert FASTA to JSON +```bash +# Convert sequences to structured JSON format preserving all annotations +obiconvert --fasta --json-output input.fasta -o output.json +``` + +**Expected output:** 3 sequences written to `output.json`. 
+ +## Process paired-end sequencing data +```bash +# Convert paired FASTQ files preserving read pairing +obiconvert --fastq --fasta-output forward.fastq --paired-with reverse.fastq -o merged_sequences.fasta +``` + +**Expected output:** 4 sequences written to `merged_sequences_R1.fasta` and `merged_sequences_R2.fasta`. + +--- + +# SEE ALSO + +- obiannotate: Add taxonomic and functional annotations to sequences +- obicount: Count sequences in files +- obigrep: Filter sequences based on attributes or patterns +- obisummary: Generate statistics from sequence files +- obiuniq: Remove duplicate sequences + +--- + +# NOTES + +obiconvert automatically selects the optimal output format based on input data presence, preferring FASTQ when quality scores are available and FASTA otherwise. To force a specific output format, use `--fasta-output`, `--fastq-output`, or `--json-output` explicitly. + +Memory usage is controlled through batch processing, with configurable memory limits per batch to handle large datasets efficiently. Progress reporting can be disabled for scripting purposes using `--no-progressbar`. + +When working with taxonomic data, ensure the taxonomy database is accessible and properly formatted to avoid failures during sequence annotation processing. diff --git a/autodoc/cmd/obicount.md b/autodoc/cmd/obicount.md new file mode 100644 index 0000000..e5202b8 --- /dev/null +++ b/autodoc/cmd/obicount.md @@ -0,0 +1,190 @@ +# NAME + +obicount — counts the sequences present in a file of sequences + +--- + +# SYNOPSIS + +``` +obicount [--batch-mem ] [--batch-size ] [--batch-size-max ] + [--csv] [--debug] [--ecopcr] [--embl] [--fasta] [--fastq] + [--genbank] [--help|-h|-?] 
[--input-OBI-header] + [--input-json-header] [--max-cpu ] [--no-order] [--pprof] + [--pprof-goroutine ] [--pprof-mutex ] [--reads|-r] + [--silent-warning] [--solexa] [--symbols|-s] [--u-to-t] + [--variants|-v] [--version] [] +``` + +--- + +# DESCRIPTION + +obicount is a command-line tool designed to count biological sequences from various input formats. It helps biologists quickly obtain quantitative metrics about sequence collections, which is essential for quality control, data assessment, and pipeline monitoring. The tool can count reads (total sequences), variants (unique sequence strings), or symbols (sum of character lengths), providing flexibility to focus on specific aspects of sequence data depending on the analysis needs. + +--- + +# INPUT + +obicount accepts input from files or stdin, supporting multiple biological sequence formats: +- FASTA (.fasta[.gz]) +- FASTQ (.fastq[.fq][.gz]) +- GenBank/EMBL (.gb|.gbff|.dat[.gz]) +- ecoPCR format (.ecopcr[.gz]) +- CSV format (--csv flag) + +Input can be provided as multiple filenames or read from stdin. The tool automatically detects file formats and parses sequences accordingly. + +--- + +# OUTPUT + +obicount outputs one or more of the following metrics, depending on the flags used: + +- **Read counts**: Total number of sequences in the input +- **Variant counts**: Number of unique sequence strings (distinct sequences) +- **Symbol counts**: Sum of all character lengths across all sequences + +When no specific counting flags are provided (-r, -v, -s), all three metrics are reported by default. Output is printed to stdout in CSV format with headers: `entities,n` for the type of entity counted, followed by the count value. + +--- + +# OPTIONS + +## General Options +- --help|-h|-? + Show help message and exit. + +- --max-cpu + Number of parallel threads computing the result (default: 16, env: OBIMAXCPU). + +- --debug + Enable debug mode, by setting log level to debug. 
(default: false, env: OBIDEBUG) + +- --silent-warning + Stop printing of the warning message (default: false, env: OBIWARNING) + +## Input Format Options +- --fasta + Read data following the fasta format. (default: false) + +- --fastq + Read data following the fastq format. (default: false) + +- --genbank + Read data following the Genbank flatfile format. (default: false) + +- --embl + Read data following the EMBL flatfile format. (default: false) + +- --ecopcr + Read data following the ecoPCR output format. (default: false) + +- --csv + Read data following the CSV format. (default: false) + +## Input Header Options +- --input-OBI-header + FASTA/FASTQ title line annotations follow OBI format. (default: false) + +- --input-json-header + FASTA/FASTQ title line annotations follow json format. (default: false) + +## Counting Mode Options +- --reads|-r + Prints read counts. (default: false) + +- --variants|-v + Prints variant counts. (default: false) + +- --symbols|-s + Prints symbol counts. (default: false) + +## Processing Options +- --u-to-t + Convert Uracil to Thymine. (default: false) + +- --solexa + Decodes quality string according to the Solexa specification. (default: false, env: OBISOLEXA) + +- --no-order + When several input files are provided, indicates that there is no order among them. (default: false) + +## Performance Options +- --batch-mem + Maximum memory per batch (e.g. 128K, 64M, 1G; default: 128M). Set to 0 to disable. (default: "", env: OBIBATCHMEM) + +- --batch-size + Minimum number of sequences per batch (floor, default 1) (default: 1, env: OBIBATCHSIZE) + +- --batch-size-max + Maximum number of sequences per batch (ceiling, default 2000) (default: 2000, env: OBIBATCHSIZEMAX) + +- --max-cpu + Number of parallel threads computing the result (default: 16, env: OBIMAXCPU) + +## Profiling Options +- --pprof + Enable pprof server. Look at the log for details. 
(default: false) + +- --pprof-goroutine + Enable profiling of goroutine blocking profile. (default: 6060, env: OBIPPROFGOROUTINE) + +- --pprof-mutex + Enable profiling of mutex lock. (default: 10, env: OBIPPROFMUTEX) + +- --version + Prints the version and exits. (default: false) + +--- + +# EXAMPLES + +# Count total number of sequences in a FASTA file +# Useful for quick assessment of dataset size +obicount input.fasta +**Expected output:** 4 sequences, out_default.txt + +# Count only the number of unique sequence variants +# Helpful for identifying genetic diversity in population data +obicount --variants input.fasta +**Expected output:** 4 sequences, out_variants.txt + +# Count sum of all sequence symbol lengths (nucleotides/amino acids) +# Useful for estimating total data volume or computing average read length +obicount --symbols input.fasta +**Expected output:** 4 sequences, out_symbols.txt + +# Count reads from FASTQ format with quality scores +# Essential for assessing read throughput in sequencing data +obicount --fastq --reads input.fastq +**Expected output:** 4 sequences, out_fastq_reads.txt + +--- + +# OUTPUT + +## Observed output example + +``` +time="2026-04-02T19:33:11+02:00" level=info msg="Number of workers set 16" +time="2026-04-02T19:33:11+02:00" level=info msg="Found 1 files to process" +time="2026-04-02T19:33:11+02:00" level=info msg="input.fasta mime type: text/fasta" +entities,n +variants,5 +reads,5 +symbols,435 +``` + +--- + +# SEE ALSO + +- obiconvert - Convert between biological sequence file formats +- obiuniq - Remove duplicate sequences from files + +--- + +# NOTES + +_(not available)_ \ No newline at end of file diff --git a/autodoc/cmd/obicsv.md b/autodoc/cmd/obicsv.md new file mode 100644 index 0000000..a2546c0 --- /dev/null +++ b/autodoc/cmd/obicsv.md @@ -0,0 +1,315 @@ +# NAME + +obicsv — converts sequence files to CSV format + +--- + +# SYNOPSIS + +``` +obicsv [--auto] [--batch-mem ] [--batch-size ] + [--batch-size-max ] 
[--compress|-Z] [--count] [--csv] [--debug] + [--definition|-d] [--ecopcr] [--embl] [--fail-on-taxonomy] [--fasta] + [--fastq] [--genbank] [--help|-h|-?] [--ids|-i] [--input-OBI-header] + [--input-json-header] [--keep|-k ]... [--max-cpu ] + [--na-value ] [--no-order] [--no-progressbar] [--obipairing] + [--out|-o ] [--pprof] [--pprof-goroutine ] + [--pprof-mutex ] [--quality|-q] [--raw-taxid] [--sequence|-s] + [--silent-warning] [--solexa] [--taxon] [--taxonomy|-t ] + [--u-to-t] [--update-taxid] [--version] [--with-leaves] [] +``` + +--- + +# DESCRIPTION + +obicsv converts biological sequence data into CSV format for easy inspection, spreadsheet analysis, or integration with other tools. A biologist might use it to export sequences from OBITools for quality control, taxonomic inspection, or downstream analysis in R or Python. + +Columns must be explicitly selected: use `--ids` for the identifier, `--sequence` for the nucleotide sequence, `--quality` for quality scores, `--taxon` for taxonomic information, `--auto` to auto-detect annotation attributes, or `--keep` for specific named attributes. Multiple flags can be combined freely. + +The command uses parallel workers to process large datasets efficiently and can write output to stdout or directly to a file. + +--- + +# INPUT + +obicsv accepts input from files or stdin. The input format is automatically detected based on the file extension, but can be explicitly specified using format flags. 
+ +Supported input formats: +- FASTA (`--fasta`) +- FASTQ (`--fastq`) +- GenBank (`--genbank`) +- EMBL (`--embl`) +- ecoPCR output (`--ecopcr`) +- CSV (`--csv`) + +Input sources: +- Local files (specified as arguments) +- stdin (when no input file is provided) +- Remote URLs (`http://`, `https://`, `ftp://`) +- Directories (automatically scanned for valid files) + +Header formats: +- OBI format (`--input-OBI-header`) +- JSON format (`--input-json-header`) +- Auto-detection (default) + +Taxonomy database can be provided with `--taxonomy|-t`. + +--- + +# OUTPUT + +The output is a CSV file with one row per sequence. The columns included depend on the flags used: + +| Column | Flag | Description | +|--------|------|-------------| +| id | `--ids\|-i` | Sequence identifier | +| sequence | `--sequence\|-s` | DNA/RNA sequence | +| qualities | `--quality\|-q` | Quality scores (ASCII-encoded) | +| definition | `--definition\|-d` | Sequence description/annotation | +| count | `--count` | Number of reads represented by this sequence | +| taxid | `--taxon` | NCBI taxonomy identifier | +| scientific_name | `--taxon` | Taxonomic scientific name | +| custom attributes | `--keep\|-k` | Any attribute stored in sequence annotations | + +If `--auto` is used, columns are automatically determined based on the attributes present in the first batch of sequences. + +Missing values are written as the NA value (default: "NA"). + +## Observed output example + +```csv +id,sequence +seq001,atgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgc +seq002,ggggaaaattttccccggggaaaattttccccggggaaaattttccccggggaaaatttt +seq003,cccccccccccccccccccccccccccccccccccccccccccccccccccccccccc +``` + +--- + +# OPTIONS + +## Output Columns + +These flags control which columns appear in the CSV output. + +- **`--ids|-i`** + - Default: `false` + - Meaning: Include the sequence identifier column. Useful for tracking or linking sequences. 
+ +- **`--sequence|-s`** + - Default: `false` + - Meaning: Include the nucleotide or amino acid sequence. This is the main biological data. + +- **`--quality|-q`** + - Default: `false` + - Meaning: Include quality scores for each position. Essential for quality control and filtering. + +- **`--definition|-d`** + - Default: `false` + - Meaning: Include the sequence description or definition from the source file. + +- **`--count`** + - Default: `false` + - Meaning: Include the count attribute, representing how many original reads were collapsed into this sequence (e.g., from clustering or demultiplexing). + +- **`--taxon`** + - Default: `false` + - Meaning: Include taxonomic information. Outputs both the NCBI taxid and the scientific name. Requires a taxonomy database (see `--taxonomy`). + +- **`--obipairing`** + - Default: `false` + - Meaning: Include attributes that were added by the `obipairing` command (pairing scores, mismatches, etc.). + +- **`--auto`** + - Default: `false` + - Meaning: Automatically detect which columns to output by examining the first batch of sequences. Outputs all annotation attributes found in the headers. Can be combined with `--ids`, `--sequence`, etc. to add those columns on top of the auto-detected ones. + +- **`--keep|-k `** + - Default: `none` + - Meaning: Keep only the specified attribute(s). Can be used multiple times to keep several columns. Useful for extracting specific annotations. + +- **`--na-value `** + - Default: `"NA"` + - Meaning: String to use for missing or unavailable values in the CSV. Customize for compatibility with other tools (e.g., empty string, "NA", "null"). + +## Input/Output Files + +- **`--out|-o `** + - Default: `"-"` (stdout) + - Meaning: Write output to the specified file instead of stdout. + +- **`--compress|-Z`** + - Default: `false` + - Meaning: Compress the output using gzip. 
+ +## Input Format + +- **`--fasta`**, **`--fastq`**, **`--genbank`**, **`--embl`**, **`--ecopcr`**, **`--csv`** + - Default: auto-detection + - Meaning: Explicitly specify the input format. + +- **`--input-OBI-header`**, **`--input-json-header`** + - Default: auto-detection + - Meaning: Specify the header format in FASTA/FASTQ files (OBI or JSON annotations). + +- **`--u-to-t`** + - Default: `false` + - Meaning: Convert Uracil to Thymine. Useful for RNA sequences. + +- **`--solexa`** + - Default: `false` + - Meaning: Decode quality strings according to the Solexa specification instead of Phred. + +## Taxonomy + +- **`--taxonomy|-t `** + - Default: `""` + - Meaning: Path to the taxonomy database directory. Required for `--taxon` output. + +- **`--fail-on-taxonomy`** + - Default: `false` + - Meaning: Make OBITools fail if a used taxid is not currently valid. + +- **`--update-taxid`** + - Default: `false` + - Meaning: Automatically update taxids that have been merged to their newest valid taxid. + +- **`--raw-taxid`** + - Default: `false` + - Meaning: Print only taxids without supplementary information (name and rank). + +- **`--with-leaves`** + - Default: `false` + - Meaning: Add sequences as leaves of their taxid annotation when taxonomy is extracted from a sequence file. + +## Performance + +- **`--max-cpu `** + - Default: `16` + - Meaning: Number of parallel threads for processing. + +- **`--batch-size `** + - Default: `1` + - Meaning: Minimum number of sequences per batch. + +- **`--batch-size-max `** + - Default: `2000` + - Meaning: Maximum number of sequences per batch. + +- **`--batch-mem `** + - Default: `"128M"` + - Meaning: Maximum memory per batch (e.g., 128K, 64M, 1G). + +- **`--no-order`** + - Default: `false` + - Meaning: When multiple input files are provided, indicates there is no order among them. + +- **`--no-progressbar`** + - Default: `false` + - Meaning: Disable the progress bar. 
+ +## Other Options + +- **`--debug`** + - Default: `false` + - Meaning: Enable debug mode by setting log level to debug. + +- **`--pprof`** + - Default: `false` + - Meaning: Enable pprof server. + +- **`--pprof-goroutine `** + - Default: `6060` + - Meaning: Enable profiling of goroutine blocking. + +- **`--pprof-mutex `** + - Default: `10` + - Meaning: Enable profiling of mutex lock. + +- **`--silent-warning`** + - Default: `false` + - Meaning: Suppress warning messages. + +- **`--version`** + - Default: `false` + - Meaning: Print version information and exit. + +- **`--help|-h|-?`** + - Default: `false` + - Meaning: Print help information. + +--- + +# EXAMPLES + +**Export sequences with identifiers to CSV** + +Extracts sequence IDs and sequences from a FASTQ file. +```bash +obicsv --ids --sequence sequences.fastq -o output1.csv +``` + +**Expected output:** 3 sequences written to `output1.csv`. + +**Export sequences with quality scores** + +Useful for quality control and filtering in downstream tools. +```bash +obicsv --ids --sequence --quality sequences.fastq -o output2.csv +``` + +**Expected output:** 3 sequences written to `output2.csv`. + +**Export with taxonomic information** + +Includes taxid and scientific name for taxonomic analysis. +```bash +obicsv --ids --sequence --taxon --taxonomy /path/to/taxonomy sequences.fasta -o output.csv +``` + +**Auto-detect annotation columns from sequence headers** + +Automatically discovers all annotation attributes present in the sequence headers and outputs them as CSV columns. Combined with `--ids` to also include the sequence identifier. +```bash +obicsv --auto --ids sequences.fasta -o output4.csv +``` + +**Expected output:** 3 rows in `output4.csv` with columns `id`, `sample`, `taxid` (attributes found in sequence headers). + +**Extract specific attributes** + +Keeps only the specified attributes as columns. Attributes not present in a sequence are written as the NA value. 
+```bash +obicsv --keep sample --keep taxid sequences.fasta -o output5.csv +``` + +**Expected output:** 3 rows in `output5.csv` with columns `taxid`, `sample`. + +**Export with compression** + +Writes gzip-compressed CSV output for large datasets. +```bash +obicsv --ids --sequence -Z sequences.fasta -o output6.csv.gz +``` + +**Expected output:** 3 sequences written to `output6.csv.gz`. + +--- + +# SEE ALSO + +- `obiconvert` — input/output handling framework +- `obipairing` — pairing information (used with `--obipairing`) +- Other export commands: `obifasta`, `obifastq`, `obijson` + +--- + +# NOTES + +- Without any column selection flag (`--ids`, `--sequence`, `--quality`, `--taxon`, `--auto`, `--keep`), the output contains no columns and no data. +- The `--taxon` option requires a valid taxonomy database specified with `--taxonomy`. +- Output is written to stdout by default; use `--out` to write to a file. +- Missing attributes are written as the NA value (customizable with `--na-value`). +- Input sequences are processed using streaming iterators to minimize memory footprint, even for large files. \ No newline at end of file diff --git a/autodoc/cmd/obidemerge.md b/autodoc/cmd/obidemerge.md new file mode 100644 index 0000000..3259380 --- /dev/null +++ b/autodoc/cmd/obidemerge.md @@ -0,0 +1,321 @@ +# obidemerge + +## NAME + +`obidemerge` — split merged sequence records back into individual, sample-annotated copies + +## SYNOPSIS + +``` +obidemerge [options] [input_files...] +``` + +## DESCRIPTION + +In a typical metabarcoding workflow, `obiuniq` or similar tools collapse identical sequences +from multiple samples into a single representative record. That record carries a statistics +attribute (for example `merged_sample`) that stores, for every original sample, how many +times the sequence was observed. This compact representation is convenient for clustering +and denoising, but some downstream analyses need the original, per-sample view. 
+ +`obidemerge` reverses that merging step. For each input sequence, it reads the statistics +stored under a chosen attribute (by default `sample`) and produces one output sequence per +entry in that statistics map. Each output sequence is a copy of the original, but: + +- its `sample` attribute (or whichever slot you chose) is set to the name of the individual + sample, +- its read count is set to the abundance recorded for that sample. + +The original statistics attribute is removed from all output sequences. + +Sequences that carry no statistics for the chosen slot are passed through unchanged. + +The command reads sequences from one or more files, or from standard input when no file is +given, and writes the results to standard output or to the file specified with `--out`. + +## INPUT FORMATS + +`obidemerge` accepts all sequence formats supported by OBITools4: + +| Format | Description | +|--------|-------------| +| FASTA | Plain nucleotide sequences with annotation in the title line | +| FASTQ | Sequences with per-base quality scores | +| EMBL | European Nucleotide Archive flat-file format | +| GenBank | NCBI GenBank flat-file format | +| ecoPCR | Output produced by the ecoPCR tool | +| CSV | Comma-separated values with sequence and metadata columns | + +The format is detected automatically from the file extension or content. You can override +detection with the format flags listed under **Input format options** below. + +Annotations embedded in FASTA/FASTQ title lines can follow the OBI key=value style +(`--input-OBI-header`) or JSON style (`--input-json-header`). + +## OUTPUT FORMATS + +By default, the output format mirrors the input: + +- If the input contains quality scores, output is FASTQ. +- Otherwise, output is FASTA with OBI-style annotations. + +You can force a specific format with `--fasta-output`, `--fastq-output`, or `--json-output`. 
+ +## OPTIONS + +### Demerge option + +`--demerge `, `-d ` +: Name of the sequence attribute that holds the per-sample statistics to expand. + Each key in that statistics map becomes a separate output sequence. + **Default:** `sample` + +### Output options + +`--out `, `-o ` +: Write output to this file instead of standard output. Use `-` for standard output. + **Default:** `-` (standard output) + +`--fasta-output` +: Write output in FASTA format, even when quality scores are available. + **Default:** false + +`--fastq-output` +: Write output in FASTQ format (requires quality scores in the input). + **Default:** false + +`--json-output` +: Write output in JSON format, one record per line. + **Default:** false + +`--output-OBI-header`, `-O` +: Write FASTA/FASTQ title lines in OBI key=value annotation style. + **Default:** false (JSON-style headers) + +`--output-json-header` +: Write FASTA/FASTQ title lines in JSON annotation style. + **Default:** false + +`--compress`, `-Z` +: Compress the output with gzip. + **Default:** false + +`--skip-empty` +: Discard sequences of length zero from the output. + **Default:** false + +### Input format options + +`--fasta` +: Force reading in FASTA format. + +`--fastq` +: Force reading in FASTQ format. + +`--embl` +: Force reading in EMBL flat-file format. + +`--genbank` +: Force reading in GenBank flat-file format. + +`--ecopcr` +: Force reading in ecoPCR output format. + +`--csv` +: Force reading in CSV format. + +`--input-OBI-header` +: Parse FASTA/FASTQ title lines as OBI-style key=value annotations. + +`--input-json-header` +: Parse FASTA/FASTQ title lines as JSON annotations. + +`--solexa` +: Decode quality scores using the Solexa/Illumina 1.0 convention instead of the standard + Phred scale. Use this only for very old sequencing data. + **Default:** false + +`--u-to-t` +: Convert uracil (U) to thymine (T) in all sequences. Useful when working with RNA-derived + data that should be treated as DNA. 
+ **Default:** false + +`--no-order` +: When reading from several input files, do not attempt to preserve the order of records + across files. May improve speed when order does not matter. + **Default:** false + +### Taxonomy options + +`--taxonomy `, `-t ` +: Path to the OBITools4 taxonomy database. Required only if taxonomic identifiers need to + be resolved or validated during output. + **Default:** none + +`--fail-on-taxonomy` +: Stop with an error if a taxonomic identifier in the data is not found in the loaded + taxonomy database. + **Default:** false + +`--raw-taxid` +: Print taxonomic identifiers as plain numbers, without appending the taxon name and rank. + **Default:** false + +`--update-taxid` +: Automatically replace deprecated taxonomic identifiers with their current equivalents, + as declared in the taxonomy database. + **Default:** false + +`--with-leaves` +: When a taxonomy is extracted from the sequence file itself, treat each sequence as a + leaf node under its annotated taxonomic identifier. + **Default:** false + +### Performance options + +`--max-cpu ` +: Maximum number of parallel processing threads. Increase for faster processing on + multi-core machines. + **Default:** 16 (or the value of the `OBIMAXCPU` environment variable) + +`--batch-size ` +: Minimum number of sequences processed together as a group. + **Default:** 1 + +`--batch-size-max ` +: Maximum number of sequences processed together as a group. + **Default:** 2000 + +`--batch-mem ` +: Maximum memory used per processing group (e.g. `64M`, `1G`). Set to `0` to disable the + memory limit and rely on `--batch-size-max` alone. + **Default:** `128M` + +### Display options + +`--no-progressbar` +: Hide the progress bar. + **Default:** false + +`--silent-warning` +: Suppress warning messages. + **Default:** false + +`--debug` +: Enable verbose debug logging. + **Default:** false + +`--version` +: Print the OBITools4 version and exit. 
+ +`--help`, `-h`, `-?` +: Print this help message and exit. + +## EXAMPLES + +### Example 1 — basic demerge using the default slot + +After running `obiuniq`, the file `unique.fasta` contains merged sequences whose +`merged_sample` attribute records abundance per sample. Demerge back to one +sequence per sample: + + +```bash +obidemerge -d sample unique.fasta > per_sample_merged.fasta +``` + +**Expected output:** 7 sequences written to `per_sample_merged.fasta`. + +### Example 2 — demerge with the default `sample` slot + +If the statistics are already stored under the attribute named `sample` (the default), +no `-d` flag is needed: + +```bash +obidemerge unique.fasta > per_sample_default.fasta +``` + +**Expected output:** 7 sequences written to `per_sample_default.fasta`. + +### Example 3 — write compressed output to a file + +```bash +obidemerge -d sample -o per_sample.fasta.gz --compress unique.fasta +``` + +**Expected output:** 7 sequences written (compressed) to `per_sample.fasta.gz`. + +### Example 4 — pipeline use: cluster, then demerge + +Obtain unique sequences, cluster them, then expand the clusters back to individual +sample records for ecological analysis: + +```bash +obiuniq -m sample reads.fastq \ + | obiclean ... \ + | obidemerge -d sample -o demerged.fasta +``` + +### Example 5 — process multiple input files + +```bash +obidemerge -d sample run1_unique.fasta run2_unique.fasta > combined_demerged.fasta +``` + +**Expected output:** 6 sequences written to `combined_demerged.fasta`. 
+ +## SEE ALSO + +`obiuniq(1)` — collapses identical sequences and records per-sample counts (the inverse operation) +`obiclean(1)` — removes PCR/sequencing artefacts from a set of unique sequences +`obiannotate(1)` — adds or modifies sequence attributes +`obigrep(1)` — filters sequences by attributes or sequence content +`obicount(1)` — counts sequences and total reads in a file + +## NOTES + +**Relationship to `obiuniq`.** +`obiuniq --merge sample` stores per-sample counts under an attribute named `merged_sample`. +When you later call `obidemerge`, you must therefore pass `-d sample` to match that +attribute name. The `-d` option takes the **logical** slot name (here `sample`), not the +internal storage name (`merged_sample`). + + +**Read counts after demerging.** +Each output sequence has its read count set to the value recorded in the statistics map for +that sample. If you sum the counts of all output sequences that share the same identifier, +you recover the total count of the original merged record. + +**Order of output sequences.** +The order in which the per-sample copies of a single merged sequence appear in the output +is not guaranteed. If a stable order is required, pipe the output through `obisort`. + +## OUTPUT + +`obidemerge` writes one sequence record per sample entry found in the statistics attribute. +Each output record is a copy of the input sequence, with: + +- the statistics attribute (`merged_`) removed, +- the `` attribute set to the sample name, +- the `count` attribute set to the abundance for that sample. + +Sequences with no statistics for the chosen slot are passed through unchanged. 
 + +## Observed output example + +``` +>seq001 {"count":5,"sample":"sampleA"} +acgtacgtacgtacgtacgtacgtacgtacgtacgtacgt +>seq001 {"count":3,"sample":"sampleB"} +acgtacgtacgtacgtacgtacgtacgtacgtacgtacgt +>seq001 {"count":1,"sample":"sampleC"} +acgtacgtacgtacgtacgtacgtacgtacgtacgtacgt +>seq002 {"count":2,"sample":"sampleA"} +ttggccaattggccaattggccaattggccaattggccaa +>seq002 {"count":7,"sample":"sampleD"} +ttggccaattggccaattggccaattggccaattggccaa +>seq003 {"count":4,"sample":"sampleB"} +gctagctagctagctagctagctagctagctagctagcta +>seq004 {"count":6} +aaaaccccggggttttaaaaccccggggttttaaaacccc +``` diff --git a/autodoc/cmd/obidistribute.md b/autodoc/cmd/obidistribute.md new file mode 100644 index 0000000..7e85a83 --- /dev/null +++ b/autodoc/cmd/obidistribute.md @@ -0,0 +1,296 @@ +# NAME + +obidistribute — divides an input set of sequences into subsets + +--- + +# SYNOPSIS + +``` +obidistribute --pattern|-p [--append|-A] [--batch-mem ] + [--batch-size ] [--batch-size-max ] + [--batches|-n ] [--classifier|-c ] [--compress|-Z] + [--csv] [--debug] [--directory|-d ] [--ecopcr] [--embl] + [--fasta] [--fasta-output] [--fastq] [--fastq-output] + [--genbank] [--hash|-H ] [--help|-h|-?] + [--input-OBI-header] [--input-json-header] [--json-output] + [--max-cpu ] [--na-value ] [--no-order] + [--no-progressbar] [--out|-o ] + [--output-OBI-header|-O] [--output-json-header] [--pprof] + [--pprof-goroutine ] [--pprof-mutex ] + [--silent-warning] [--skip-empty] [--solexa] [--u-to-t] + [--version] [] +``` + +--- + +# DESCRIPTION + +`obidistribute` splits a set of biological sequences into multiple output files according to one of three distribution strategies: annotation-based classification, round-robin batch assignment, or hash-based sharding. + +The most common use case in metabarcoding is demultiplexing: sequences carry a tag annotation (e.g., `sample_id`) and `obidistribute` writes each sample's sequences into its own file. 
The output filename for each group is built from a user-supplied pattern containing `%s`, which is replaced by the classifier value or batch index. + +When no classifier is specified, sequences can be split into a fixed number of batches (`--batches`) for parallel downstream processing, or sharded deterministically by hash (`--hash`) to ensure reproducible partitioning regardless of input order. + +Output files can be organised into subdirectories (one per classifier value) using `--directory`, and existing files can be extended rather than overwritten with `--append`. Sequences lacking the classifier annotation are assigned to a file whose name uses the NA value (default: `"NA"`). + +--- + +# INPUT + +`obidistribute` reads biological sequences from one or more files supplied as positional arguments, or from standard input when no files are given. All major NGS and flat-file formats are supported and auto-detected: + +- FASTA / FASTQ (plain or gzip-compressed) +- GenBank and EMBL flat files +- ecoPCR output +- CSV + +Format can be forced with `--fasta`, `--fastq`, `--embl`, `--genbank`, `--ecopcr`, or `--csv`. Header annotation style can be specified with `--input-OBI-header` or `--input-json-header`. + +--- + +# OUTPUT + +Each distribution group produces a separate output file named according to the `--pattern` template. The `%s` placeholder in the pattern is replaced by the classifier value, batch index, or hash shard index, depending on the chosen distribution mode. + +Output format follows the same rules as other OBITools commands: FASTQ is used when quality scores are present, FASTA otherwise. The format can be forced with `--fasta-output`, `--fastq-output`, or `--json-output`. All annotations present in the input sequences are preserved in the output files. + +When `--directory` is used together with `--classifier`, output files are placed in subdirectories named after the classifier values, allowing hierarchical organisation of results. 
+ +## Observed output example + +``` +@seq001 {"sample_id":"sampleA"} +atcgatcgatcgatcgatcg ++ +IIIIIIIIIIIIIIIIIIII +@seq002 {"sample_id":"sampleA"} +gctagctagctagctagcta ++ +IIIIIIIIIIIIIIIIIIII +@seq003 {"sample_id":"sampleA"} +ttagctaatcggtaatcggt ++ +IIIIIIIIIIIIIIIIIIII +@seq009 {"sample_id":"sampleA"} +atgatgatgatgatgatgat ++ +IIIIIIIIIIIIIIIIIIII +``` + +--- + +# OPTIONS + +## Distribution mode + +- **`--pattern|-p `** — _(required)_ + Default: none. + The template used to build output filenames. The variable part is represented by `%s`. Example: `toto_%s.fastq`. + +- **`--classifier|-c `** + Default: `""`. + The name of an annotation tag on the sequences. Sequences are dispatched into separate files based on the value of this tag. The tag value must be a string, integer, or boolean. + +- **`--batches|-n `** + Default: `0`. + Splits the input into exactly *N* batches by round-robin assignment, regardless of sequence metadata. + +- **`--hash|-H `** + Default: `0`. + Splits the input into at most *N* batches using a hash of the sequence. Produces deterministic, reproducible sharding. + +- **`--directory|-d `** + Default: `""`. + Used together with `--classifier`: organises output files into subdirectories named after classifier values. + +## Output file handling + +- **`--append|-A`** + Default: `false`. + Appends sequences to output files if they already exist, instead of overwriting them. + +- **`--na-value `** + Default: `"NA"`. + Value used as the filename component when a sequence does not have the classifier tag defined. + +- **`--compress|-Z`** + Default: `false`. + Compresses all output files using gzip. + +## Input format + +- **`--fasta`** + Default: `false`. + Read data following the FASTA format. + +- **`--fastq`** + Default: `false`. + Read data following the FASTQ format. + +- **`--embl`** + Default: `false`. + Read data following the EMBL flatfile format. + +- **`--genbank`** + Default: `false`. 
+ Read data following the GenBank flatfile format. + +- **`--ecopcr`** + Default: `false`. + Read data following the ecoPCR output format. + +- **`--csv`** + Default: `false`. + Read data following the CSV format. + +- **`--input-OBI-header`** + Default: `false`. + FASTA/FASTQ title line annotations follow OBI format. + +- **`--input-json-header`** + Default: `false`. + FASTA/FASTQ title line annotations follow JSON format. + +- **`--solexa`** + Default: `false`. + Decodes quality string according to the Solexa specification. + +- **`--u-to-t`** + Default: `false`. + Convert Uracil to Thymine. + +- **`--skip-empty`** + Default: `false`. + Sequences of length equal to zero are suppressed from the output. + +- **`--no-order`** + Default: `false`. + When several input files are provided, indicates that there is no order among them. + +## Output format + +- **`--fasta-output`** + Default: `false`. + Write sequences in FASTA format (default if no quality data available). + +- **`--fastq-output`** + Default: `false`. + Write sequences in FASTQ format (default if quality data available). + +- **`--json-output`** + Default: `false`. + Write sequences in JSON format. + +- **`--output-OBI-header|-O`** + Default: `false`. + Output FASTA/FASTQ title line annotations follow OBI format. + +- **`--output-json-header`** + Default: `false`. + Output FASTA/FASTQ title line annotations follow JSON format. + +- **`--out|-o `** + Default: `"-"`. + Filename used for saving the output. + +## Performance + +- **`--max-cpu `** + Default: `16`. + Number of parallel threads computing the result. + +- **`--batch-size `** + Default: `1`. + Minimum number of sequences per batch. + +- **`--batch-size-max `** + Default: `2000`. + Maximum number of sequences per batch. + +- **`--batch-mem `** + Default: `""` (128M). + Maximum memory per batch (e.g. `128K`, `64M`, `1G`). Set to `0` to disable. + +## Diagnostic & debug + +- **`--debug`** + Default: `false`. 
+ Enable debug mode, by setting log level to debug. + +- **`--no-progressbar`** + Default: `false`. + Disable the progress bar printing. + +- **`--silent-warning`** + Default: `false`. + Stop printing of warning messages. + +- **`--pprof`** + Default: `false`. + Enable pprof server. Look at the log for details. + +- **`--pprof-goroutine `** + Default: `6060`. + Enable profiling of goroutine blocking profile. + +- **`--pprof-mutex `** + Default: `10`. + Enable profiling of mutex lock. + +--- + +# EXAMPLES + +```bash +# Demultiplex sequences by sample_id annotation into per-sample FASTQ files +obidistribute --classifier sample_id --pattern out_ex1_%s.fastq --no-progressbar --input-json-header reads.fastq +``` + +**Expected output:** 10 sequences written to 4 files: `out_ex1_sampleA.fastq` (4 sequences), `out_ex1_sampleB.fastq` (3 sequences), `out_ex1_sampleC.fastq` (2 sequences), `out_ex1_NA.fastq` (1 sequence). + +```bash +# Demultiplex into subdirectories, one directory per sample +obidistribute --classifier sample_id --directory --pattern %s/reads.fastq reads.fastq +``` + +```bash +# Split a large dataset into 3 equal batches for parallel processing +obidistribute --batches 3 --pattern chunk_%s.fasta --fasta-output --no-progressbar sequences.fasta +``` + +**Expected output:** 10 sequences written to 3 files: `chunk_1.fasta` (4 sequences), `chunk_2.fasta` (3 sequences), `chunk_3.fasta` (3 sequences). Batch indices are 1-based. + +```bash +# Hash-based sharding into 4 reproducible shards +obidistribute --hash 4 --pattern shard_%s.fastq --no-progressbar reads.fastq +``` + +**Expected output:** 10 sequences written to 4 files: `shard_0.fastq` through `shard_3.fastq`. Shard indices are 0-based. 
+ +```bash +# Append new sequences to existing per-sample files (incremental demultiplexing) +obidistribute --classifier sample_id --pattern samples_%s.fastq --append new_reads.fastq +``` + +```bash +# Demultiplex sequences, replacing the NA label for unclassified sequences +obidistribute --classifier sample_id --na-value unclassified --pattern out_ex6_%s.fastq --no-progressbar --input-json-header reads.fastq +``` + +**Expected output:** 10 sequences written to 4 files including `out_ex6_unclassified.fastq` (1 sequence without `sample_id` annotation). + +--- + +# SEE ALSO + +`obiconvert`, `obisplit`, `obigrep` + +--- + +# NOTES + +- Sequences that lack the annotation specified by `--classifier` are written to the file whose name is built using the `--na-value` (default: `"NA"`). +- The three distribution modes (`--classifier`, `--batches`, `--hash`) are mutually exclusive. +- When using `--directory` together with `--classifier`, subdirectories are created automatically if they do not exist. +- Batch indices produced by `--batches` are 1-based; hash shard indices produced by `--hash` are 0-based. diff --git a/autodoc/cmd/obigrep.md b/autodoc/cmd/obigrep.md new file mode 100644 index 0000000..94bdfb3 --- /dev/null +++ b/autodoc/cmd/obigrep.md @@ -0,0 +1,326 @@ +# obigrep(1) — OBITools4 Manual + +## NAME + +`obigrep` — select a subset of sequence records on various criteria + +## SYNOPSIS + +``` +obigrep [OPTIONS] [FILE...] +``` + +## DESCRIPTION + +`obigrep` filters a set of biological sequence records (in FASTA or FASTQ format) and writes only those matching all specified criteria to the output. Its name is modelled on the Unix `grep` command, but instead of filtering lines in a text file, it filters sequence records. + +Filtering criteria can be combined freely: only sequence records satisfying **all** specified conditions are retained. The selection can be inverted with `--inverse-match` to keep the records that would otherwise be discarded. 
+ +Sequences are read from one or more files, or from standard input if no file is given. Results are written to standard output or to a file specified with `--out`. Records that do not pass the filters can optionally be saved to a separate file with `--save-discarded`. + +## INPUT FORMATS + +`obigrep` recognises the following input formats automatically. A specific format can be forced with the corresponding flag: + +| Flag | Format | +|------|--------| +| `--fasta` | FASTA | +| `--fastq` | FASTQ | +| `--embl` | EMBL flat file | +| `--genbank` | GenBank flat file | +| `--ecopcr` | ecoPCR output | +| `--csv` | CSV tabular format | + +Header annotation styles can be selected with `--input-OBI-header` (OBITools format) or `--input-json-header` (JSON format). + +## OUTPUT FORMATS + +By default, the output format matches the input format (FASTQ when quality scores are present, FASTA otherwise). The format can be forced: + +- `--fasta-output` — write FASTA +- `--fastq-output` — write FASTQ +- `--json-output` — write JSON +- `--output-OBI-header` / `-O` — annotate FASTA/FASTQ title lines in OBITools format +- `--output-json-header` — annotate FASTA/FASTQ title lines in JSON format +- `--compress` / `-Z` — compress output with gzip + +Use `--out FILE` / `-o FILE` to write results to a file instead of standard output. + +## FILTERING OPTIONS + +### By sequence length + +- `--min-length LENGTH` / `-l LENGTH` + Keep only sequences at least *LENGTH* bases long. + +- `--max-length LENGTH` / `-L LENGTH` + Keep only sequences at most *LENGTH* bases long. + +### By read abundance + +Sequence records can carry a `count` attribute recording how many times the sequence was observed. The following options filter on that count: + +- `--min-count COUNT` / `-c COUNT` + Keep only sequences observed at least *COUNT* times (default: 1). + +- `--max-count COUNT` / `-C COUNT` + Keep only sequences observed at most *COUNT* times. 
+ +### By sequence pattern + +- `--sequence PATTERN` / `-s PATTERN` + Keep records whose nucleotide sequence matches the regular expression *PATTERN* (case-insensitive). This option can be repeated; all patterns must match. + +- `--approx-pattern PATTERN` + Keep records whose sequence contains an approximate match to *PATTERN*. The number of allowed differences is controlled by `--pattern-error`. This option can be repeated. + +- `--pattern-error N` + Maximum number of mismatches (or indels, if `--allows-indels` is set) tolerated when using `--approx-pattern` (default: 0, i.e. exact match). + +- `--allows-indels` + Allow insertions and deletions (in addition to substitutions) when performing approximate pattern matching. + +- `--only-forward` + Search patterns on the forward strand only. By default both strands are searched. + +### By identifier or definition + +- `--identifier PATTERN` / `-I PATTERN` + Keep records whose identifier matches the regular expression *PATTERN* (case-insensitive). Can be repeated. + +- `--id-list FILENAME` + Keep only records whose identifier appears in *FILENAME*, a plain-text file with one identifier per line. + +- `--definition PATTERN` / `-D PATTERN` + Keep records whose definition line matches the regular expression *PATTERN* (case-insensitive). Can be repeated. + +### By attribute (metadata) + +Sequence records can carry arbitrary key/value annotations: + +- `--has-attribute KEY` / `-A KEY` + Keep records that possess an attribute named *KEY*, regardless of its value. Can be repeated. + +- `--attribute KEY=PATTERN` / `-a KEY=PATTERN` + Keep records for which the value of attribute *KEY* matches the regular expression *PATTERN* (case-sensitive). Can be repeated; all constraints must be satisfied. + +### By custom boolean expression + +- `--predicate EXPRESSION` / `-p EXPRESSION` + Keep records for which the boolean expression *EXPRESSION* evaluates to true. Attributes are accessed via the `annotations` map (e.g. 
`annotations["count"]`). The special variable `sequence` refers to the sequence object; its length can be obtained with `len(sequence)`. Can be repeated; all expressions must be true. + + Example: `-p 'annotations["count"] >= 10 && len(sequence) < 200'` + +### By taxonomy + +Taxonomy-based filtering requires a taxonomy database to be provided with `--taxonomy`. + +- `--taxonomy PATH` / `-t PATH` + Path to the taxonomy database. + +- `--restrict-to-taxon TAXID` / `-r TAXID` + Keep only records whose taxon belongs to the lineage of *TAXID* (i.e. is *TAXID* itself or a descendant). Can be repeated; sequences must satisfy at least one of the provided taxids. + +- `--ignore-taxon TAXID` / `-i TAXID` + Discard records whose taxon belongs to the lineage of *TAXID*. Can be repeated. + +- `--valid-taxid` + Keep only records that carry a valid, recognised taxonomic identifier. + +- `--require-rank RANK_NAME` + Keep only records whose taxon has a defined ancestor at the given rank (e.g. *species*, *genus*, *family*). Can be repeated. + +- `--update-taxid` + Automatically update merged taxids to their current valid equivalent. + +- `--fail-on-taxonomy` + Exit with an error if a taxid referenced in the data is not valid. + +- `--with-leaves` + When the taxonomy is extracted from a sequence file, attach each sequence as a leaf node under its annotated taxid. + +- `--raw-taxid` + Print taxids in output files without supplementary information (taxon name and rank). + +### Inversion + +- `--inverse-match` / `-v` + Invert the selection: output the records that would otherwise be discarded. + +## PAIRED-END OPTIONS + +When paired-end sequencing data are provided (forward and reverse reads stored in two files), `obigrep` can apply filters taking both reads into account. + +- `--paired-with FILENAME` + File containing the reverse (paired) reads. + +- `--paired-mode MODE` + How to combine the filter result from the forward and reverse reads. 
*MODE* is one of: + + | Mode | Meaning | + |------|---------| + | `forward` | Keep the pair if the **forward** read passes (default) | + | `reverse` | Keep the pair if the **reverse** read passes | + | `and` | Keep the pair if **both** reads pass | + | `or` | Keep the pair if **at least one** read passes | + | `andnot` | Keep the pair if the **forward** passes and the **reverse** does not | + | `xor` | Keep the pair if **exactly one** read passes | + +## OUTPUT CONTROL + +- `--save-discarded FILENAME` + Write sequence records that do **not** pass the filters to *FILENAME*. + +- `--out FILENAME` / `-o FILENAME` + Write the selected records to *FILENAME* (default: standard output). + +- `--skip-empty` + Suppress sequences of length zero from the output. + +## PERFORMANCE OPTIONS + +- `--max-cpu N` + Number of parallel processing threads (default: number of available CPUs). + +- `--batch-size N` + Minimum number of sequences per processing batch (default: 1). + +- `--batch-size-max N` + Maximum number of sequences per processing batch (default: 2000). + +- `--batch-mem SIZE` + Maximum memory per batch (e.g. `128M`, `1G`). Overrides `--batch-size-max` when memory is the limiting factor. Can also be set via the environment variable `OBIBATCHMEM`. + +- `--no-order` + When multiple input files are provided, indicates that no ordering is assumed between them, which can improve throughput. + +- `--no-progressbar` + Disable the progress bar. + +## MISCELLANEOUS OPTIONS + +- `--u-to-t` + Convert uracil (U) to thymine (T) in all sequences (useful for RNA data). + +- `--solexa` + Decode quality scores according to the legacy Solexa specification instead of the standard Phred encoding. + +- `--silent-warning` + Suppress warning messages. + +- `--debug` + Enable verbose debug logging. + +- `--version` + Print version information and exit. + +- `--help` / `-h` / `-?` + Display the help message and exit. 
+ +## EXAMPLES + +Keep all sequences at least 100 bases long: + +```bash +obigrep --min-length 100 input.fasta > out_min_length.fasta +``` + +**Expected output:** 6 sequences written to `out_min_length.fasta`. + +Select sequences observed at least 10 times: + +```bash +obigrep --min-count 10 input.fasta > out_min_count.fasta +``` + +**Expected output:** 4 sequences written to `out_min_count.fasta`. + +Keep sequences whose identifier starts with `BOLD`: + +```bash +obigrep --identifier '^BOLD' input.fasta > out_bold.fasta +``` + +**Expected output:** 2 sequences written to `out_bold.fasta`. + +Select only sequences carrying the IUPAC primer motif `GGGCWATGTTTCATAAYGGG` with up to 2 mismatches: + +```bash +obigrep --approx-pattern GGGCWATGTTTCATAAYGGG --pattern-error 2 input.fasta > out_primer.fasta +``` + +**Expected output:** 2 sequences written to `out_primer.fasta`. + +Retain sequences belonging to the genus *Homo* (taxid 9605) in an NCBI taxonomy: + +```bash +obigrep --taxonomy /data/ncbi_tax --restrict-to-taxon 9605 input.fasta +``` + +Keep sequences that have a `sample` attribute equal to `lake1` and save the rest to a separate file: + +```bash +obigrep --attribute sample='^lake1$' --save-discarded discarded.fasta \ + input.fasta > lake1.fasta +``` + +**Expected output:** 5 sequences written to `lake1.fasta`, 5 sequences written to `discarded.fasta`. + +Invert a length filter (keep only the sequences shorter than 50 bases, i.e. those the filter would normally discard): + +```bash +obigrep --min-length 50 --inverse-match input.fasta > out_short.fasta +``` + +**Expected output:** 1 sequence written to `out_short.fasta`. + +Apply a custom predicate (sequences with count ≥ 5): + +```bash +obigrep -p 'annotations["count"] >= 5' input.fasta > out_predicate.fasta +``` + +**Expected output:** 6 sequences written to `out_predicate.fasta`. + +## OUTPUT + +### Attribute table + +Attributes present on sequence records are preserved unchanged in the output. 
No new attributes are added by `obigrep` itself — only filtering occurs. + +| Attribute | Type | Description | +|-----------|------|-------------| +| `count` | integer | Number of times the sequence was observed (read from input) | +| `sample` | string | Sample identifier (read from input) | + +Any other annotations present in the input are carried through to the output unmodified. + +### Observed output example + +``` +>seq001 {"count":15,"sample":"lake1"} +acgtacgtacgtacgtacgtgggcaatgtttcataatgggacgtacgtacgtacgtacgt +acgtacgtacgtacgtacgtacgtacgtacgtacgtacgtacgtacgtacgtacgtacgt +acgtacgtacgtacgtacgtacgtacgtacgt +>seq002 {"count":3,"sample":"lake1"} +tgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgca +tgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgcatgca +>seq004 {"count":2,"sample":"lake1"} +aaacccgggtttagctagctagctagctagctagctagctagctagctagctagctagct +agctagctagctagctagctagctagctagctagctagctagctagctagctagctagct +atacgtatcgatcg +>BOLD_005 {"count":8,"sample":"pond1"} +cgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgat +cgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcg +>seq008 {"count":7,"sample":"river2"} +ttacgatcgatcgatcgatcgggcaatgtttcataaggggacgatcgatcgatcgatcga +tcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgat +``` + +## SEE ALSO + +`obiannotate`(1), `obiuniq`(1), `obiconvert`(1), `obitag`(1), `obisplit`(1) + +## OBITools4 + +`obigrep` is part of the **OBITools4** suite for analysing DNA metabarcoding and environmental DNA data. diff --git a/autodoc/cmd/obijoin.md b/autodoc/cmd/obijoin.md new file mode 100644 index 0000000..2d50020 --- /dev/null +++ b/autodoc/cmd/obijoin.md @@ -0,0 +1,257 @@ +# NAME + +obijoin — merge annotations contained in a file to another file + +--- + +# SYNOPSIS + +``` +obijoin --join-with|-j [--batch-mem ] [--batch-size ] + [--batch-size-max ] [--by|-b ]... 
[--compress|-Z] + [--csv] [--debug] [--ecopcr] [--embl] [--fail-on-taxonomy] [--fasta] + [--fasta-output] [--fastq] [--fastq-output] [--genbank] + [--help|-h|-?] [--input-OBI-header] [--input-json-header] + [--json-output] [--max-cpu ] [--no-order] [--no-progressbar] + [--out|-o ] [--output-OBI-header|-O] [--output-json-header] + [--pprof] [--pprof-goroutine ] [--pprof-mutex ] + [--raw-taxid] [--silent-warning] [--skip-empty] [--solexa] + [--taxonomy|-t ] [--u-to-t] [--update-id|-i] + [--update-quality|-q] [--update-sequence|-s] [--update-taxid] + [--version] [--with-leaves] [] +``` + +--- + +# DESCRIPTION + +`obijoin` merges annotations from a secondary file into a primary sequence dataset. For each sequence in the primary input, it looks up matching records in the secondary file based on one or more shared attribute keys, then copies all annotations from matched partner records onto the primary sequence. + +The join is a **left outer join**: every sequence in the primary dataset is preserved in the output, whether or not a match is found in the secondary file. Unmatched sequences simply receive no additional annotations. Key matching is exact string equality. + +A common use case is enriching amplicon or read sequences with external sample metadata. The secondary file (the *annotation source*) can be a FASTA/FASTQ sequence file, a CSV table, an EMBL or GenBank flat file, or any other format accepted by OBITools4. This makes it straightforward to prepare a simple spreadsheet with sample identifiers and metadata columns, save it as CSV, and merge it directly into a sequence dataset — the CSV format is auto-detected, no format conversion or extra flag is needed. + +In addition to transferring annotations, `obijoin` can optionally replace the sequence identifier, nucleotide sequence, or quality scores of each primary sequence with values from its matched partner, controlled by the `--update-id`, `--update-sequence`, and `--update-quality` flags. 
+ +--- + +# INPUT + +`obijoin` accepts a primary sequence dataset on standard input or as one or more file arguments. The supported formats are automatically detected and include FASTA, FASTQ, EMBL, GenBank, ecoPCR output, CSV, and JSON. Format-specific flags (`--fasta`, `--fastq`, `--embl`, `--genbank`, `--ecopcr`, `--csv`) can force a specific parser when auto-detection is ambiguous. + +The secondary file, supplied via `--join-with`, is loaded entirely into memory before processing begins, and supports the same set of formats including CSV — the format is auto-detected automatically. + +When multiple primary input files are provided and their ordering across files is irrelevant, `--no-order` allows the reader to return batches in whichever order they complete, improving throughput. + +--- + +# OUTPUT + +The output is a sequence file in FASTA or FASTQ format (determined automatically by the presence of quality data), written to standard output or to the file specified by `--out`. Alternative output formats can be requested with `--fasta-output`, `--fastq-output`, or `--json-output`. The output can be gzip-compressed with `--compress`. + +Each output sequence carries all annotations from the primary dataset, enriched with every annotation attribute copied from the matched partner record. If a field name exists in both, the partner value overwrites the primary value. When `--update-id`, `--update-sequence`, or `--update-quality` are set, the corresponding sequence-level fields are also replaced with the partner's values. 
+ +## Observed output example + +``` +>seq001 {"barcode":"ATGC","experiment":"amplicon_run1","location":"Paris","sample":"S1"} +atgcatgcatgcatgcatgc +>seq002 {"barcode":"GCTA","experiment":"amplicon_run2","location":"Lyon","sample":"S2"} +gctagctagctagctagcta +>seq003 {"barcode":"TTTT","sample":"S3"} +tttttttttttttttttttt +>seq004 {"barcode":"ATGC","experiment":"amplicon_run1","location":"Paris","sample":"S1"} +aaaaatttttcccccggggg +>seq005 {"barcode":"GCTA","experiment":"amplicon_run2","location":"Lyon","sample":"S2"} +gggggaaaaatttttccccc +>seq006 {"barcode":"AAAA","sample":"S4"} +ccccccgggggtttttaaaaa +``` + +--- + +# OPTIONS + +## Required + +`--join-with|-j ` +: Path to the secondary file whose records are joined onto the primary sequences. This parameter is mandatory. The file can be in any format accepted by OBITools4 (FASTA, FASTQ, CSV, EMBL, GenBank, ecoPCR); the format is auto-detected. Default: none. + +## Join control + +`--by|-b ` +: Declares a join key as an attribute name or a `primary_attr=secondary_attr` mapping. Repeat the flag to join on multiple keys simultaneously; all keys must match for a record pair to be considered a hit (intersection semantics). When omitted, the join defaults to matching by sequence identifier (`id`). Default: `[]`. + +`--update-id|-i` +: Replace the identifier of each primary sequence with the identifier from its matched partner record. Default: `false`. + +`--update-sequence|-s` +: Replace the nucleotide or amino acid sequence of each primary sequence with the sequence from its matched partner. Default: `false`. + +`--update-quality|-q` +: Replace the per-base quality scores of each primary sequence with the quality scores from its matched partner. Relevant only when both datasets carry quality information (FASTQ). Default: `false`. + +## Input format + +`--csv` +: Read the primary input data in OBITools CSV format (e.g., sequences exported by `obicsv`). 
This flag applies to the primary input only; secondary files supplied via `--join-with` are always auto-detected. Default: `false`. + +`--ecopcr` +: Read data following the ecoPCR output format. Default: `false`. + +`--embl` +: Read data following the EMBL flatfile format. Default: `false`. + +`--fasta` +: Read data following the FASTA format. Default: `false`. + +`--fastq` +: Read data following the FASTQ format. Default: `false`. + +`--genbank` +: Read data following the GenBank flatfile format. Default: `false`. + +`--input-OBI-header` +: Treat FASTA/FASTQ title line annotations as OBI format. Default: `false`. + +`--input-json-header` +: Treat FASTA/FASTQ title line annotations as JSON format. Default: `false`. + +`--solexa` +: Decode the quality string according to the Solexa specification. Default: `false`. + +`--u-to-t` +: Convert uracil (U) to thymine (T) in input sequences. Default: `false`. + +`--skip-empty` +: Suppress sequences of length zero from the output. Default: `false`. + +`--no-order` +: When several input files are provided, indicates that there is no order among them. Default: `false`. + +## Output format + +`--out|-o ` +: Filename used for saving the output. Default: `-` (standard output). + +`--fasta-output` +: Write sequences in FASTA format (default when no quality data are available). Default: `false`. + +`--fastq-output` +: Write sequences in FASTQ format (default when quality data are available). Default: `false`. + +`--json-output` +: Write sequences in JSON format. Default: `false`. + +`--output-OBI-header|-O` +: Output FASTA/FASTQ title line annotations in OBI format. Default: `false`. + +`--output-json-header` +: Output FASTA/FASTQ title line annotations in JSON format. Default: `false`. + +`--compress|-Z` +: Compress the output using gzip. Default: `false`. + +## Taxonomy + +`--taxonomy|-t ` +: Path to the taxonomy database. Default: `""`. 
+ +`--fail-on-taxonomy` +: Cause `obijoin` to fail with an error if a taxid encountered is not currently valid. Default: `false`. + +`--raw-taxid` +: Print taxids in files without supplementary information (taxon name and rank). Default: `false`. + +`--update-taxid` +: Automatically update taxids that are declared as merged to a newer one. Default: `false`. + +`--with-leaves` +: When taxonomy is extracted from a sequence file, add sequences as leaves of their taxid annotation. Default: `false`. + +## Performance + +`--max-cpu ` +: Number of parallel threads used to compute the result. Default: `16`. + +`--batch-size ` +: Minimum number of sequences per processing batch. Default: `1`. + +`--batch-size-max ` +: Maximum number of sequences per processing batch. Default: `2000`. + +`--batch-mem ` +: Maximum memory per batch (e.g. `128K`, `64M`, `1G`). Set to `0` to disable. Default: `128M`. + +## Diagnostics + +`--no-progressbar` +: Disable the progress bar. Default: `false`. + +`--silent-warning` +: Stop printing warning messages. Default: `false`. + +`--debug` +: Enable debug mode by setting the log level to debug. Default: `false`. + +--- + +# EXAMPLES + +```bash +# Annotate amplicon sequences with sample metadata from a CSV table, +# matching on the sample attribute. CSV format is auto-detected. +obijoin --join-with metadata.csv --by sample input.fasta > out_basic.fasta +``` + +**Expected output:** 6 sequences written to `out_basic.fasta`. + +```bash +# Join using a cross-attribute key: primary sequences have a 'sample' attribute, +# while the annotation CSV uses 'well' for the same identifier. +obijoin --join-with well_metadata.csv --by sample=well input.fasta > out_crosskey.fasta +``` + +**Expected output:** 6 sequences written to `out_crosskey.fasta`. + +```bash +# Join on two keys simultaneously: match only when both sample and barcode agree, +# then update sequence identifiers with those from the reference file. 
+obijoin --join-with references.fasta \ + --by sample --by barcode \ + --update-id \ + input.fasta > out_multikey.fasta +``` + +**Expected output:** 6 sequences written to `out_multikey.fasta`. + +```bash +# Replace sequences and quality scores of reads with values from a corrected FASTQ file, +# joining by sequence ID (default when no --by is specified). +obijoin --join-with corrected.fastq \ + --update-sequence --update-quality \ + input.fastq > out_updated.fastq +``` + +**Expected output:** 3 sequences written to `out_updated.fastq`. + +```bash +# Use an OBITools CSV file as primary input (--csv flag), join with a metadata CSV, +# then write compressed FASTA output without showing the progress bar. +obijoin --join-with metadata.csv --by sample \ + --csv --fasta-output --compress \ + --no-progressbar \ + --out out_compressed.fasta.gz \ + primary.csv +``` + +**Expected output:** 3 sequences written to `out_compressed.fasta.gz`. + +--- + +# NOTES + +- The secondary file supplied via `--join-with` is loaded entirely into memory before the join begins. For very large secondary files this may require significant RAM. +- Key matching is based on exact string equality; no regular expression or fuzzy matching is applied. +- The join is a left outer join: primary sequences without a matching partner in the secondary file are still emitted, unchanged, in the output. +- When the annotation source is a plain CSV spreadsheet (columns = attributes, rows = records), the format is auto-detected — no `--csv` flag is needed. The `--csv` flag applies exclusively to the primary input and is intended for sequences stored in OBITools CSV format. diff --git a/autodoc/cmd/obimicrosat.md b/autodoc/cmd/obimicrosat.md new file mode 100644 index 0000000..ddc9ae5 --- /dev/null +++ b/autodoc/cmd/obimicrosat.md @@ -0,0 +1,205 @@ +# NAME + +obimicrosat — looks for microsatellite sequences in a sequence file + +--- + +# SYNOPSIS + +``` +obimicrosat [options] [...] 
+``` + +--- + +# DESCRIPTION + +`obimicrosat` scans DNA sequences for simple sequence repeats (SSRs), also called +microsatellites — tandem repetitions of a short motif (1–6 bp by default). For each +sequence containing a qualifying repeat, the command annotates it with the location, +unit sequence, repeat count, and flanking regions, then writes it to output. Sequences +with no detected microsatellite are silently discarded. + +The detection works in two passes. A first regular expression finds any tandem repeat +satisfying the unit-length and repeat-count constraints. The true minimal repeat unit +is then determined, and a second scan refines the exact boundaries. The repeat unit is +normalized to its lexicographically smallest rotation across all rotations and its +reverse complement, which allows equivalent loci to be grouped consistently across +samples. + +By default, when the canonical form of a unit requires the reverse complement, the +whole sequence is reoriented so that the microsatellite is always reported on the +direct strand of the normalized unit. This behaviour can be disabled with +`--not-reoriented`. + +A common use case is identifying polymorphic SSR markers for population genetics, or +flagging repeat-rich regions before designing PCR primers. + +--- + +# INPUT + +Accepts one or more sequence files on the command line. If no file is given, sequences +are read from standard input. Supported formats include FASTA, FASTQ, JSON/OBI, GenBank, +EMBL, ecoPCR output, and CSV. Compressed files (gzip) are handled transparently. +Format is detected automatically unless overridden by input flags. + +--- + +# OUTPUT + +Outputs only the sequences in which a microsatellite was found. 
Each retained sequence +carries the following additional attributes: + +| Attribute | Content | +|---|---| +| `microsat` | Full repeat region as a string | +| `microsat_from` | 1-based start position of the repeat | +| `microsat_to` | End position of the repeat (inclusive) | +| `microsat_unit` | Repeat unit as observed in the sequence | +| `microsat_unit_normalized` | Lexicographically smallest canonical form | +| `microsat_unit_orientation` | `direct` or `reverse` | +| `microsat_unit_length` | Length of the repeat unit (bp) | +| `microsat_unit_count` | Number of complete unit repetitions | +| `seq_length` | Total length of the (possibly reoriented) sequence | +| `microsat_left` | Flanking sequence to the left of the repeat | +| `microsat_right` | Flanking sequence to the right of the repeat | + +When a sequence is reoriented (reverse-complemented), `_cmp` is appended to its +identifier. + +The output format follows the same rules as the rest of OBITools4: FASTQ when quality +scores are present, FASTA or JSON/OBI otherwise, configurable via output flags. 
+ +## Observed output example + +``` +>seq001 {"definition":"dinucleotide AC repeat 16x with 40bp non-repetitive flanks","microsat":"acacacacacacacacacacacacacacacac","microsat_from":40,"microsat_left":"agtcgaacttgcatgccttcagggcaagtctagcttacg","microsat_right":"cgatagtcatgcaagtcttgcggcatagatcgttacca","microsat_to":71,"microsat_unit":"ac","microsat_unit_count":16,"microsat_unit_length":2,"microsat_unit_normalized":"ac","microsat_unit_orientation":"direct","seq_length":109} +agtcgaacttgcatgccttcagggcaagtctagcttacgacacacacacacacacacaca +cacacacacaccgatagtcatgcaagtcttgcggcatagatcgttacca +>seq006_cmp {"definition":"GT repeat 16x with 40bp non-repetitive flanks canonical form is AC","microsat":"acacacacacacacacacacacacacacacac","microsat_from":39,"microsat_left":"tggtaacgatctatgccgcaagacttgcatgactatcg","microsat_right":"cgtaagctagacttgccctgaaggcatgcaagttcgact","microsat_to":70,"microsat_unit":"ac","microsat_unit_count":16,"microsat_unit_length":2,"microsat_unit_normalized":"ac","microsat_unit_orientation":"reverse","seq_length":109} +tggtaacgatctatgccgcaagacttgcatgactatcgacacacacacacacacacacac +acacacacaccgtaagctagacttgccctgaaggcatgcaagttcgact +``` + +--- + +# OPTIONS + +## Microsatellite detection + +**`--min-unit-length` / `-m`** +- Default: `1` +- Minimum length in base pairs of the repeated motif. Set to `2` to exclude + mononucleotide repeats, `3` for di- and mononucleotide-free searches, etc. + +**`--max-unit-length` / `-M`** +- Default: `6` +- Maximum length in base pairs of the repeated motif. Increasing this value detects + longer repeat units (minisatellites) at the cost of more complex patterns. + +**`--min-unit-count`** +- Default: `5` +- Minimum number of times the motif must be repeated. A value of `5` with a 2 bp unit + requires at least 10 bp of pure repeat. + +**`--min-length` / `-l`** +- Default: `20` +- Minimum total length (in bp) of the repeat region. 
This filter applies after the + unit-count filter and is useful to exclude very short but technically qualifying + repeats. + +**`--min-flank-length` / `-f`** +- Default: `0` +- Minimum length of the flanking sequence on each side of the repeat. Sequences with + flanks shorter than this threshold are discarded, which is useful when the output + will feed a primer-design step. + +**`--not-reoriented` / `-n`** +- Default: `false` (reorientation is active by default) +- When set, sequences are never reverse-complemented to match the canonical orientation + of the repeat unit. The microsatellite is reported as found, in its original + orientation. + +## Input / output + +Inherited from the standard OBITools4 conversion layer. Common flags include: + +**`--input-OBI-header`** — parse OBI-style FASTA/FASTQ headers. +**`--input-json-header`** — parse JSON-encoded headers. +**`--skip-empty`** — skip sequences with no nucleotides. +**`--u-to-t`** — convert U to T (RNA → DNA). +**`--output-json-header`** — write JSON-encoded headers. +**`--output-obi-header`** — write OBI-style headers. +**`--gzip`** — compress output with gzip. +**`--workers` / `-p`** — number of parallel processing workers. + +--- + +# EXAMPLES + +```bash +# Detect default microsatellites (unit 1–6 bp, ≥5 repeats, ≥20 bp total) +obimicrosat sequences.fasta > out_default.fasta +``` + +**Expected output:** 6 sequences written to `out_default.fasta`. + +```bash +# Restrict to di- and trinucleotide repeats only +obimicrosat -m 2 -M 3 sequences.fasta > out_dinucleotide.fasta +``` + +**Expected output:** 4 sequences written to `out_dinucleotide.fasta` +(mononucleotide and tetranucleotide repeats excluded). + +```bash +# Require at least 30 bp flanking sequence on each side (for primer design) +obimicrosat -f 30 sequences.fasta > out_primer_ready.fasta +``` + +**Expected output:** 3 sequences written to `out_primer_ready.fasta` +(sequences with flanks shorter than 30 bp are discarded). 
+ +```bash +# Keep sequences in their original orientation (no reverse-complement) +obimicrosat --not-reoriented sequences.fasta > out_no_reorient.fasta +``` + +**Expected output:** 6 sequences written to `out_no_reorient.fasta` +(GT-repeat sequence kept as-is without `_cmp` suffix; `microsat_unit_orientation` is `reverse`). + +```bash +# Require at least 8 repeat units and a minimum repeat length of 30 bp +obimicrosat --min-unit-count 8 -l 30 sequences.fasta > out_strict.fasta +``` + +**Expected output:** 4 sequences written to `out_strict.fasta` +(short or low-count repeats excluded). + +--- + +# SEE ALSO + +`obigrep` — filter sequences by annotation after microsatellite detection. +`obiannotate` — add or modify sequence annotations. +`obiconvert` — format conversion for sequence files. + +--- + +# NOTES + +- Only sequences with at least one qualifying microsatellite are written to output; + all others are silently filtered out. +- The normalization algorithm considers all rotations of the unit and their reverse + complements, selecting the lexicographically smallest string. This ensures consistent + grouping of loci regardless of which strand was sequenced. +- When reorientation is active (the default), sequences whose canonical unit falls on + the reverse strand are reverse-complemented and their ID receives the suffix `_cmp`. + Coordinate attributes (`microsat_from`, `microsat_to`) always refer to the + (possibly reoriented) output sequence. +- Repetitive low-complexity sequences may match multiple overlapping patterns; only the + first match is reported per sequence. +- Flanking sequences must be **non-repetitive** to avoid the tool detecting a tandem + repeat within the flank instead of the intended SSR. When designing synthetic test + data, ensure flanking regions do not contain tandem repeat motifs of their own. 
diff --git a/autodoc/cmd/obiscript.md b/autodoc/cmd/obiscript.md new file mode 100644 index 0000000..1b13440 --- /dev/null +++ b/autodoc/cmd/obiscript.md @@ -0,0 +1,384 @@ +# NAME + +obiscript — executes a lua script on the input sequences + +--- + +# SYNOPSIS + +``` +obiscript [--allows-indels] [--approx-pattern ]... + [--attribute|-a ]... [--batch-mem ] + [--batch-size ] [--batch-size-max ] [--compress|-Z] + [--csv] [--debug] [--definition|-D ]... [--ecopcr] + [--embl] [--fail-on-taxonomy] [--fasta] [--fasta-output] [--fastq] + [--fastq-output] [--genbank] [--has-attribute|-A ]... + [--help|-h|-?] [--id-list ] + [--identifier|-I ]... [--ignore-taxon|-i ]... + [--input-OBI-header] [--input-json-header] [--inverse-match|-v] + [--json-output] [--max-count|-C ] [--max-cpu ] + [--max-length|-L ] [--min-count|-c ] + [--min-length|-l ] [--no-order] [--no-progressbar] + [--only-forward] [--out|-o ] [--output-OBI-header|-O] + [--output-json-header] + [--paired-mode ] + [--pattern-error ] [--pprof] [--pprof-goroutine ] + [--pprof-mutex ] [--predicate|-p ]... + [--raw-taxid] [--require-rank ]... + [--restrict-to-taxon|-r ]... [--script|-S ] + [--sequence|-s ]... [--silent-warning] [--skip-empty] + [--solexa] [--taxonomy|-t ] [--template] [--u-to-t] + [--update-taxid] [--valid-taxid] [--version] [--with-leaves] + [] +``` + +--- + +# DESCRIPTION + +`obiscript` applies a user-provided Lua script to a stream of biological sequences. For each input sequence record, the script's `worker(sequence)` function is called, giving the user full programmatic access to the sequence's identifier, data, quality scores, and metadata attributes. This makes it possible to implement custom annotation logic, computed filters, or record transformations that go beyond what fixed-function OBITools commands offer. 
+ +The Lua script may also define two optional lifecycle hooks: `begin()`, called once before any sequence is processed (useful for initialising counters or opening files), and `finish()`, called after the last sequence (useful for printing summary statistics or flushing output). A thread-safe shared table `obicontext` is available across all workers, allowing aggregation across parallel executions. + +Sequences are read from files or standard input in any format supported by OBITools4 (FASTA, FASTQ, EMBL, GenBank, ecoPCR, CSV). The sequence filtering flags (such as `--min-length`, `--predicate`, etc.) select which sequences the Lua script is applied to; sequences that do not match the filter pass through to the output unchanged without the script being executed on them. All sequences — scripted or not — are written to the output. + +To get started, use `--template` to print a minimal Lua script skeleton with stubs for all three hooks and inline documentation. + +--- + +# INPUT + +`obiscript` reads biological sequences from one or more files supplied as positional arguments, or from standard input if no files are given. All formats supported by OBITools4 are accepted: FASTA, FASTQ, EMBL flatfile, GenBank flatfile, ecoPCR output, and CSV. Format auto-detection is used by default; explicit format flags (`--fasta`, `--fastq`, `--embl`, `--genbank`, `--ecopcr`, `--csv`) override it. Header annotation style can be forced with `--input-OBI-header` or `--input-json-header`. + +--- + +# OUTPUT + +Sequences processed by the Lua script are written to standard output, or to the file given by `--out`. Any modifications made to sequence records inside `worker()` (identifier, sequence, attributes) are reflected in the output. The output format defaults to FASTA when no quality data are present and to FASTQ otherwise; use `--fasta-output`, `--fastq-output`, or `--json-output` to override. 
Header annotation style in FASTA/FASTQ output can be set with `--output-OBI-header` or `--output-json-header`. Output can be gzip-compressed with `--compress`. + +## Observed output example + +``` +>sample1_seq001 {"definition":"control sequence for annotation test","sample":"sample1"} +atcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcg +>sample1_seq002 {"definition":"another control sequence from sample1","sample":"sample1"} +gctagctagctagctagctagctagctagctagctagctagctagcta +>sample2_seq003 {"definition":"second sample sequence","sample":"sample2"} +ttaattaattaattaattaattaattaattaattaattaattaattaa +>sample2_seq004 {"definition":"second sample another sequence","sample":"sample2"} +ccggccggccggccggccggccggccggccggccggccggccggccgg +>sample3_seq005 {"definition":"third sample first sequence","sample":"sample3"} +aaaattttccccggggaaaattttccccggggaaaattttccccgggg +>sample3_seq006 {"definition":"third sample second sequence","sample":"sample3"} +ttttaaaaccccggggttttaaaaccccggggttttaaaaccccgggg +``` + +--- + +# OPTIONS + +## Script + +### `--script|-S ` +- Default: `""` +- Path to the Lua script file to execute. The file must exist and be syntactically valid Lua. The script should define a `worker(sequence)` function, and optionally `begin()` and `finish()`. + +### `--template` +- Default: `false` +- Print a minimal Lua script template to standard output, with stubs for `begin()`, `worker()`, and `finish()` and inline documentation, then exit. Use this to bootstrap a new script. + +## Sequence filtering (selects sequences on which the script is applied; non-matching sequences pass through unchanged) + +### `--predicate|-p ` +- Default: `[]` +- Boolean expression evaluated for each sequence record. Attribute keys are accessible as variable names; `sequence` refers to the record itself. Multiple `-p` options are combined with AND. + +### `--sequence|-s ` +- Default: `[]` +- Regular expression matched against the nucleotide sequence. Case-insensitive. 
Multiple patterns are combined with AND. + +### `--identifier|-I ` +- Default: `[]` +- Regular expression matched against the sequence identifier. Case-insensitive. + +### `--definition|-D ` +- Default: `[]` +- Regular expression matched against the sequence definition line. Case-insensitive. + +### `--approx-pattern ` +- Default: `[]` +- Pattern matched approximately against the sequence. Use `--pattern-error` to set the maximum number of errors allowed. + +### `--pattern-error ` +- Default: `0` +- Maximum number of errors (mismatches) allowed during approximate pattern matching. + +### `--allows-indels` +- Default: `false` +- Allow insertions and deletions (in addition to mismatches) during approximate pattern matching. + +### `--only-forward` +- Default: `false` +- Restrict pattern matching to the forward strand only. + +### `--has-attribute|-A ` +- Default: `[]` +- Apply the script only to records that have an attribute with key ``; others pass through. + +### `--attribute|-a ` +- Default: `{}` +- Apply the script only to records where the attribute `KEY` matches the regular expression `VALUE`. Case-sensitive. Multiple `-a` options are combined with AND. + +### `--id-list ` +- Default: `""` +- Path to a text file containing one sequence identifier per line. The script is applied only to records whose identifier appears in the file; others pass through. + +### `--min-length|-l ` +- Default: `1` +- Apply the script only to sequences whose length is at least `LENGTH`; shorter sequences pass through unchanged. + +### `--max-length|-L ` +- Default: `2000000000` +- Apply the script only to sequences whose length is at most `LENGTH`; longer sequences pass through unchanged. + +### `--min-count|-c ` +- Default: `1` +- Apply the script only to sequences with a count (abundance) of at least `COUNT`; others pass through unchanged. 
+ +### `--max-count|-C ` +- Default: `2000000000` +- Apply the script only to sequences with a count (abundance) of at most `COUNT`; others pass through unchanged. + +### `--inverse-match|-v` +- Default: `false` +- Invert the selection: apply the script to records that do NOT match the filter criteria; matching records pass through unchanged. + +## Taxonomic filtering + +### `--taxonomy|-t ` +- Default: `""` +- Path to the taxonomy database. Required for taxonomy-based options. + +### `--restrict-to-taxon|-r ` +- Default: `[]` +- Retain only sequences whose taxid belongs to the specified taxon. + +### `--ignore-taxon|-i ` +- Default: `[]` +- Exclude sequences whose taxid belongs to the specified taxon. + +### `--require-rank ` +- Default: `[]` +- Retain only sequences whose taxon has the specified rank (e.g., `species`, `genus`). + +### `--valid-taxid` +- Default: `false` +- Retain only sequences that carry a currently valid NCBI taxid. + +### `--fail-on-taxonomy` +- Default: `false` +- Abort with an error if a taxid used during filtering is not currently valid. + +### `--update-taxid` +- Default: `false` +- Automatically replace taxids declared as merged with their current equivalent. + +### `--raw-taxid` +- Default: `false` +- Print taxids in output without supplementary information (taxon name and rank). + +### `--with-leaves` +- Default: `false` +- When extracting taxonomy from a sequence file, attach sequences as leaves of their taxid annotation. + +## Paired-end mode + +### `--paired-mode ` +- Default: `"forward"` +- When paired reads are provided, determines how filter conditions are applied to both reads of a pair. + +## Input format + +### `--fasta` +- Default: `false` +- Force FASTA format parsing. + +### `--fastq` +- Default: `false` +- Force FASTQ format parsing. + +### `--embl` +- Default: `false` +- Force EMBL flatfile format parsing. + +### `--genbank` +- Default: `false` +- Force GenBank flatfile format parsing. 
+ +### `--ecopcr` +- Default: `false` +- Force ecoPCR output format parsing. + +### `--csv` +- Default: `false` +- Force CSV format parsing. + +### `--input-OBI-header` +- Default: `false` +- Parse FASTA/FASTQ title line annotations as OBI format. + +### `--input-json-header` +- Default: `false` +- Parse FASTA/FASTQ title line annotations as JSON format. + +### `--solexa` +- Default: `false` +- Decode quality strings according to the Solexa specification. + +### `--u-to-t` +- Default: `false` +- Convert uracil (U) to thymine (T) in sequences. + +### `--skip-empty` +- Default: `false` +- Suppress sequences of length zero from the output. + +### `--no-order` +- Default: `false` +- When multiple input files are provided, indicates that no ordering is assumed among them. + +## Output format + +### `--out|-o ` +- Default: `"-"` (standard output) +- File path for saving the output. + +### `--fasta-output` +- Default: `false` +- Write output in FASTA format. + +### `--fastq-output` +- Default: `false` +- Write output in FASTQ format. + +### `--json-output` +- Default: `false` +- Write output in JSON format. + +### `--output-OBI-header|-O` +- Default: `false` +- Write FASTA/FASTQ title line annotations in OBI format. + +### `--output-json-header` +- Default: `false` +- Write FASTA/FASTQ title line annotations in JSON format. + +### `--compress|-Z` +- Default: `false` +- Compress output using gzip. + +## Performance + +### `--max-cpu ` +- Default: `16` (env: `OBIMAXCPU`) +- Number of parallel threads used for processing. + +### `--batch-size ` +- Default: `1` (env: `OBIBATCHSIZE`) +- Minimum number of sequences per processing batch. + +### `--batch-size-max ` +- Default: `2000` (env: `OBIBATCHSIZEMAX`) +- Maximum number of sequences per processing batch. + +### `--batch-mem ` +- Default: `""` → `128M` (env: `OBIBATCHMEM`) +- Maximum memory per batch (e.g. `128K`, `64M`, `1G`). Set to `0` to disable. 
+ +## Diagnostics + +### `--debug` +- Default: `false` (env: `OBIDEBUG`) +- Enable debug logging. + +### `--no-progressbar` +- Default: `false` +- Disable the progress bar. + +### `--silent-warning` +- Default: `false` (env: `OBIWARNING`) +- Suppress warning messages. + +### `--pprof` +- Default: `false` +- Enable the pprof profiling HTTP server (see log for address). + +### `--pprof-goroutine ` +- Default: `6060` (env: `OBIPPROFGOROUTINE`) +- Port for goroutine blocking profile. + +### `--pprof-mutex ` +- Default: `10` (env: `OBIPPROFMUTEX`) +- Rate for mutex lock profiling. + +--- + +# EXAMPLES + +```bash +# Print a starter script template and save it to my_script.lua +obiscript --template > my_script.lua +``` + +**Expected output:** Lua template with `begin()`, `worker()`, and `finish()` stubs written to `my_script.lua`. + +```bash +# Add a custom annotation to every sequence record +# (the script sets a new attribute 'sample' from the identifier prefix) +obiscript --script annotate.lua --fasta-output sequences.fasta > annotated.fasta +``` + +**Expected output:** 6 sequences written to `annotated.fasta`. + +```bash +# Count reads per taxon using the finish() hook, filtering to a specific taxon +obiscript --script count_taxa.lua \ + --restrict-to-taxon 6231 \ + --taxonomy /data/ncbi_tax \ + sequences.fasta > filtered_annotated.fasta +``` + +```bash +# Apply a script to FASTQ sequences with a length filter +obiscript --script process_pairs.lua \ + --min-length 100 \ + --out result.fastq \ + reads.fastq +``` + +**Expected output:** 4 sequences written to `result.fastq`. + +```bash +# Run on FASTQ input, output JSON, using 4 CPU threads +obiscript --script enrich.lua \ + --json-output \ + --max-cpu 4 \ + sequences.fastq > enriched.json +``` + +**Expected output:** 4 sequences written to `enriched.json`. + +--- + +# SEE ALSO + +`obigrep` — filter sequences using the same selection criteria without scripting. 
+`obiannotate` — add or modify sequence attributes without scripting.
+
+---
+
+# NOTES
+
+- The Lua `worker(sequence)` function is called in parallel across multiple goroutines. Use the thread-safe `obicontext` table (with `obicontext:lock()` / `obicontext:unlock()`) for any shared mutable state accessed across workers.
+- The `begin()` and `finish()` hooks each run in a single goroutine and do not need locking for their own internal state.
+- Sequence records modified inside `worker()` must be returned (or the original returned unmodified) for the record to appear in the output. Returning `nil` drops the sequence.
diff --git a/autodoc/cmd/obisummary.md b/autodoc/cmd/obisummary.md
new file mode 100644
index 0000000..cda5c11
--- /dev/null
+++ b/autodoc/cmd/obisummary.md
@@ -0,0 +1,271 @@
+# NAME
+
+obisummary — summarizes the main information from a sequence file
+
+---
+
+# SYNOPSIS
+
+```
+obisummary [--batch-mem ] [--batch-size ]
+           [--batch-size-max ] [--csv] [--debug] [--ecopcr] [--embl]
+           [--fasta] [--fastq] [--genbank] [--help|-h|-?]
+           [--input-OBI-header] [--input-json-header] [--json-output]
+           [--map ]... [--max-cpu ] [--no-order] [--pprof]
+           [--pprof-goroutine ] [--pprof-mutex ] [--silent-warning]
+           [--solexa] [--u-to-t] [--version] [--yaml-output] []
+```
+
+---
+
+# DESCRIPTION
+
+`obisummary` reads a set of biological sequences and computes a statistical
+summary of their content and annotations. Rather than producing a new sequence
+file, it outputs a single structured record describing the dataset as a whole.
+
+The summary covers three main areas. First, global counts: the total number of
+reads (sequences weighted by their `count` attribute), the number of distinct
+sequence variants, and the total sequence length across all records.
Second, +annotation profiling: `obisummary` inspects every annotation key present in +the dataset and classifies it as a scalar attribute (single value per +sequence), a map attribute (key-to-count mapping), or a vector attribute +(multi-value per sequence). Third, per-sample statistics: when sequences carry +sample information (via `merged_sample` or equivalent per-sample annotations), +`obisummary` reports for each sample the number of reads, the number of +variants, and the number of singletons. If `obiclean` has been run previously, +the summary also captures `obiclean_status` and related quality flags per +sample. + +The output is a single JSON record by default, or YAML when `--yaml-output` is +requested. +`obisummary` is typically used after processing steps such as +`obiclean` or `obiuniq` to quickly validate the state of a dataset before +downstream analysis. + +--- + +# INPUT + +`obisummary` accepts biological sequence data from one or more files supplied +as positional arguments, or from standard input when no files are given. +Supported formats include FASTA, FASTQ, GenBank flatfile, EMBL flatfile, +ecoPCR output, and CSV. By default the format is detected automatically; use +the format flags (`--fasta`, `--fastq`, `--genbank`, `--embl`, `--ecopcr`, +`--csv`) to force a specific parser. + +FASTA/FASTQ annotation headers may follow the OBI format (`--input-OBI-header`) +or JSON format (`--input-json-header`). RNA sequences can be read as DNA by +converting uracil to thymine with `--u-to-t`. Quality strings encoded according +to the Solexa specification are handled with `--solexa`. + +When multiple input files are provided, `obisummary` assumes they are ordered; +use `--no-order` to indicate that no ordering exists among them. + +--- + +# OUTPUT + +`obisummary` writes a single structured record to standard output. The default +format is JSON; use `--yaml-output` to obtain YAML instead. 
+ + +The record contains three top-level sections: + +- **`count`**: global metrics including `variants` (distinct sequences), + `reads` (total weighted count), and `total_length` (sum of all sequence + lengths). + +- **`annotations`**: a breakdown of all annotation keys found in the dataset, + classified as `scalar_attributes`, `map_attributes`, or `vector_attributes`, + together with the observed keys and their occurrence counts within each + category. + +- **`samples`**: when sample information is present, `sample_count` and a + per-sample `sample_stats` table with `reads`, `variants`, and `singletons` + fields. If `obiclean` data is present, an `obiclean_bad` field is also + reported per sample. + +When `--map` is used, the named map attribute is included in the annotation +detail for that attribute. + +## Observed output example + +``` +{ + "annotations": { + "keys": { + "scalar": { + "count": 5 + } + }, + "map_attributes": 0, + "scalar_attributes": 1, + "vector_attributes": 0 + }, + "count": { + "reads": 21, + "total_length": 100, + "variants": 5 + } +} +``` + +--- + +# OPTIONS + +## Summary output + +**`--json-output`** +- Default: `false` +- Print the result as a JSON record (this is the default behaviour; this flag + makes the choice explicit). + + +**`--yaml-output`** +- Default: `false` +- Print the result as a YAML record instead of the default JSON format. + + +**`--map `** +- Default: `[]` (none) +- Name of a map attribute to include in the summary. This option may be + repeated to request multiple map attributes. Each named attribute will be + detailed in the `map_attributes` section of the output. + +## Input format + +**`--fasta`** +- Default: `false` +- Read data following the FASTA format. + +**`--fastq`** +- Default: `false` +- Read data following the FASTQ format. + +**`--genbank`** +- Default: `false` +- Read data following the GenBank flatfile format. + +**`--embl`** +- Default: `false` +- Read data following the EMBL flatfile format. 
+
+**`--ecopcr`**
+- Default: `false`
+- Read data following the ecoPCR output format.
+
+**`--csv`**
+- Default: `false`
+- Read data following the CSV format.
+
+**`--input-OBI-header`**
+- Default: `false`
+- FASTA/FASTQ title line annotations follow OBI format.
+
+**`--input-json-header`**
+- Default: `false`
+- FASTA/FASTQ title line annotations follow JSON format.
+
+**`--solexa`**
+- Default: `false`
+- Decode quality strings according to the Solexa specification.
+
+**`--u-to-t`**
+- Default: `false`
+- Convert uracil (U) to thymine (T) when reading RNA sequences.
+
+## Batch control
+
+**`--batch-size `**
+- Default: `1`
+- Minimum number of sequences per processing batch.
+
+**`--batch-size-max `**
+- Default: `2000`
+- Maximum number of sequences per processing batch.
+
+**`--batch-mem `**
+- Default: `""` (128M effective)
+- Maximum memory per batch (e.g. `128K`, `64M`, `1G`). Set to `0` to disable
+  the memory limit.
+
+## Processing
+
+**`--max-cpu `**
+- Default: `16`
+- Number of parallel threads used to compute the result.
+
+**`--no-order`**
+- Default: `false`
+- When several input files are provided, indicates that there is no order
+  among them.
+
+## General
+
+**`--debug`**
+- Default: `false`
+- Enable debug mode by setting the log level to debug.
+
+**`--silent-warning`**
+- Default: `false`
+- Stop printing warning messages.
+
+**`--version`**
+- Default: `false`
+- Print the version and exit.
+
+**`--help` / `-h` / `-?`**
+- Default: `false`
+- Display help and exit.
+
+**`--pprof`**
+- Default: `false`
+- Enable the pprof profiling server. Consult the log for the server address.
+
+**`--pprof-goroutine `**
+- Default: `6060`
+- Port for goroutine blocking profile.
+
+**`--pprof-mutex `**
+- Default: `10`
+- Rate for mutex lock profiling.
+ +--- + +# EXAMPLES + +```bash +# Get a JSON summary of a FASTA file produced by obiclean +obisummary cleaned.fasta > out_default.yaml +``` + +**Expected output:** a JSON summary record in `out_default.yaml`. + +```bash +# Get the summary as an explicit JSON record for programmatic processing +obisummary --json-output cleaned.fasta > out_json.json +``` + +**Expected output:** a JSON summary record in `out_json.json`. + +```bash +# Get a YAML record from a FASTQ file +obisummary --yaml-output --fastq reads.fastq > out_yaml.yaml +``` + +**Expected output:** a YAML summary record in `out_yaml.yaml`. + +```bash +# Summarise data read from standard input, forcing FASTA format +obigrep -p 'annotations.count > 1' sequences.fasta | obisummary --fasta > out_pipeline.yaml +``` + +**Expected output:** a JSON summary record in `out_pipeline.yaml` (3 variants, 10 reads). + +--- + +# SEE ALSO + +`obiclean`, `obiuniq`, `obicount` diff --git a/autodoc/cmd/obiuniq.md b/autodoc/cmd/obiuniq.md new file mode 100644 index 0000000..d1ccb26 --- /dev/null +++ b/autodoc/cmd/obiuniq.md @@ -0,0 +1,347 @@ +# NAME + +obiuniq — dereplicate sequence data sets + +--- + +# SYNOPSIS + +``` +obiuniq [--batch-mem ] [--batch-size ] [--batch-size-max ] + [--category-attribute|-c ]... [--chunk-count ] + [--compress|-Z] [--csv] [--debug] [--ecopcr] [--embl] + [--fail-on-taxonomy] [--fasta] [--fasta-output] [--fastq] + [--fastq-output] [--genbank] [--help|-h|-?] [--in-memory] + [--input-OBI-header] [--input-json-header] [--json-output] + [--max-cpu ] [--merge|-m ]... 
[--na-value ] + [--no-order] [--no-progressbar] [--no-singleton] + [--out|-o ] [--output-OBI-header|-O] [--output-json-header] + [--pprof] [--pprof-goroutine ] [--pprof-mutex ] + [--raw-taxid] [--silent-warning] [--skip-empty] [--solexa] + [--taxonomy|-t ] [--u-to-t] [--update-taxid] [--version] + [--with-leaves] [] +``` + +--- + +# DESCRIPTION + +`obiuniq` groups identical sequences together and replaces them with a single +representative, recording the total number of original occurrences as an +abundance count. This process — called dereplication — is a standard step in +amplicon sequencing workflows: it dramatically reduces the number of sequence +records to process, while preserving exact counts needed for downstream +statistical analyses. + +By default, two sequences are considered identical if and only if their +nucleotide strings are the same. Using `--category-attribute` (repeatable), +additional metadata fields can be included in the identity criterion. For +example, grouping by sample name keeps the same sequence as separate records +when it occurs in different samples, enabling per-sample abundance tracking. + +For each group of identical sequences, `obiuniq` emits one output record +carrying the merged metadata of all members. The `--merge` option (repeatable) +instructs the command to also record, in an attribute named `merged_`, the +distribution of `KEY` attribute values across the sequences collapsed into each +group — useful for provenance tracking and quality control. + +Sequences that appear only once in the entire dataset (singletons) can be +removed with `--no-singleton`. Singletons often represent sequencing errors +rather than genuine biological variants, so their removal is a common +noise-reduction step. + +--- + +# INPUT + +`obiuniq` accepts biological sequence data in FASTA, FASTQ, EMBL, GenBank, +ecoPCR, or CSV format (auto-detected by default, or forced with format flags +such as `--fasta`, `--fastq`, `--embl`, etc.). 
Input is read from one or more +files given as positional arguments, or from standard input when no files are +provided. + +When multiple input files are provided, `obiuniq` assumes they are ordered +(e.g., paired-end reads in the same read order). If no such ordering exists, +use `--no-order` to signal that files can be consumed independently. + +FASTA/FASTQ header annotations are parsed heuristically by default. Use +`--input-OBI-header` for OBI-formatted headers or `--input-json-header` for +JSON-formatted headers. RNA sequences can be normalised to DNA on the fly with +`--u-to-t`. + +--- + +# OUTPUT + +`obiuniq` writes dereplicated sequences to standard output or to the file +specified by `--out`. Each output record represents one group of identical +sequences (identical under the chosen grouping criterion). The output carries +the merged metadata from all input records in the group. + +The output format defaults to FASTA. Even when the input contains quality +scores (FASTQ), quality information is not preserved across merged sequences, +so the output is written in FASTA format unless `--fastq-output` is explicitly +requested. +Output annotations follow the OBI header format when `--output-OBI-header` is +set, or JSON when `--output-json-header` is set. The output can be +gzip-compressed with `--compress`. + +For each output record: +- The abundance count reflects how many input sequences were merged into the + group. +- Attributes created by `--merge KEY` are named `merged_KEY` and map each + observed value of the `KEY` attribute to the count of input sequences + carrying that value within the group. +- All other attributes are merged from the contributing records according to + the standard OBITools4 merging rules. 
+ +## Observed output example + +``` +>seq008 {"count":1,"primer":"p1"} +cccccccccccccccccccc +>seq001 {"count":4,"primer":"p1"} +atcgatcgatcgatcgatcg +>seq004 {"count":2,"primer":"p1","sample":"s1"} +gctagctagctagctagcta +>seq007 {"count":1,"primer":"p1","sample":"s2"} +tttttttttttttttttttt +``` + +--- + +# OPTIONS + +## Dereplication Options + +**`--category-attribute|-c `** (default: `[]`) +Adds one metadata attribute to the grouping criterion. Two sequences are +placed in the same group only when they are nucleotide-identical **and** share +the same value for every attribute listed with `-c`. This option can be +repeated to combine multiple attributes (e.g., `-c sample -c primer`). +Records that lack a listed attribute receive the value set by `--na-value`. + +**`--chunk-count `** (default: `100`) +Controls how many internal partitions the dataset is split into during +processing. A higher value reduces per-partition memory usage at the cost of +more temporary files; a lower value increases per-partition memory but reduces +I/O overhead. Tune this when processing very large or very small datasets. + +**`--in-memory`** (default: `false`) +Stores intermediate data chunks in RAM rather than in temporary disk files. +Speeds up processing on datasets that fit comfortably in available memory; +omit this flag (the default) for large datasets that exceed available RAM. + +**`--merge|-m `** (default: `[]`) +Creates an output attribute named `merged_KEY` that maps each observed value +of the `KEY` attribute to the count of input sequences carrying that value +within the group. Repeat to track multiple attributes. +Useful for tracking which sample or category contributions were collapsed into each group. + +**`--na-value `** (default: `"NA"`) +Value assigned to a category attribute when a sequence record does not carry +that attribute. All sequences lacking the attribute are grouped together under +this placeholder, rather than being treated as incomparable. 
+ +**`--no-singleton`** (default: `false`) +Discards all output records whose abundance count is exactly one — i.e., +sequences that occur only once across the entire input. Removing singletons +is a standard heuristic for excluding sequencing errors from further analysis. + +## Input Options + +**`--batch-mem `** (default: `""`, env: `OBIBATCHMEM`) +Maximum memory budget per processing batch (e.g. `128K`, `64M`, `1G`). Set +to `0` to disable the memory ceiling. Overrides `--batch-size-max` when +both are set. + +**`--batch-size `** (default: `10`, env: `OBIBATCHSIZE`) +Minimum number of sequences per batch (floor). + +**`--batch-size-max `** (default: `2000`, env: `OBIBATCHSIZEMAX`) +Maximum number of sequences per batch (ceiling). + +**`--csv`** (default: `false`) +Parse input as CSV format. + +**`--ecopcr`** (default: `false`) +Parse input as ecoPCR output format. + +**`--embl`** (default: `false`) +Parse input as EMBL flatfile format. + +**`--fasta`** (default: `false`) +Parse input as FASTA format. + +**`--fastq`** (default: `false`) +Parse input as FASTQ format. + +**`--genbank`** (default: `false`) +Parse input as GenBank flatfile format. + +**`--input-OBI-header`** (default: `false`) +Treat FASTA/FASTQ title line annotations as OBI-format key=value pairs. + +**`--input-json-header`** (default: `false`) +Treat FASTA/FASTQ title line annotations as JSON objects. + +**`--no-order`** (default: `false`) +When multiple input files are provided, indicates that there is no ordering +relationship among them. + +**`--skip-empty`** (default: `false`) +Suppress sequences of length zero from the output. + +**`--solexa`** (default: `false`, env: `OBISOLEXA`) +Decode quality strings according to the Solexa specification rather than the +standard Phred encoding. + +**`--u-to-t`** (default: `false`) +Convert uracil (U) to thymine (T) in all input sequences, normalising RNA to +DNA representation. 
+ +## Output Options + +**`--compress|-Z`** (default: `false`) +Compress output using gzip. + +**`--fasta-output`** (default: `false`) +Write output in FASTA format (default when no quality scores are available). + +**`--fastq-output`** (default: `false`) +Write output in FASTQ format (default when quality scores are present). + +**`--json-output`** (default: `false`) +Write output in JSON format. + +**`--out|-o `** (default: `"-"`) +Write output to the specified file instead of standard output. + +**`--output-OBI-header|-O`** (default: `false`) +Write FASTA/FASTQ title line annotations in OBI format. + +**`--output-json-header`** (default: `false`) +Write FASTA/FASTQ title line annotations in JSON format. + +## Taxonomy Options + +**`--fail-on-taxonomy`** (default: `false`) +Cause `obiuniq` to exit with an error if a taxid in the data is not a +currently valid taxon in the loaded taxonomy. + +**`--raw-taxid`** (default: `false`) +Print taxids in output without supplementary information (taxon name and rank). + +**`--taxonomy|-t `** (default: `""`) +Path to the taxonomy database used to validate or update taxids. + +**`--update-taxid`** (default: `false`) +Automatically replace merged taxids with the most recent valid taxid. + +**`--with-leaves`** (default: `false`) +When taxonomy is extracted from a sequence file, add sequences as leaves of +their taxid annotation. + +## Execution Options + +**`--max-cpu `** (default: `16`, env: `OBIMAXCPU`) +Number of parallel threads used to compute the result. + +**`--debug`** (default: `false`, env: `OBIDEBUG`) +Enable debug mode by setting the log level to debug. + +**`--no-progressbar`** (default: `false`) +Disable the progress bar. + +**`--silent-warning`** (default: `false`, env: `OBIWARNING`) +Suppress warning messages. + +**`--pprof`** (default: `false`) +Enable the pprof profiling server (address logged at startup). 
+ +**`--pprof-goroutine `** (default: `6060`, env: `OBIPPROFGOROUTINE`) +Port for the goroutine blocking profile endpoint. + +**`--pprof-mutex `** (default: `10`, env: `OBIPPROFMUTEX`) +Rate for the mutex contention profile. + +**`--version`** (default: `false`) +Print the version string and exit. + +**`--help|-h|-?`** (default: `false`) +Print usage information and exit. + +--- + +# EXAMPLES + +```bash +# Dereplicate a FASTQ file of amplicon reads; write unique sequences to a FASTA output file. +obiuniq reads.fastq -o out_basic.fastq +``` + +**Expected output:** 4 sequences written to `out_basic.fastq`. + +```bash +# Dereplicate keeping sequences separate per sample (category attribute), +# and discard singletons to remove likely sequencing errors. +obiuniq -c sample --no-singleton reads.fastq -o out_no_singleton.fastq +``` + +**Expected output:** 2 sequences written to `out_no_singleton.fastq`. + +```bash +# Dereplicate per sample, recording the sample distribution in 'merged_sample', +# and use 'UNKNOWN' for reads missing the sample attribute. +obiuniq -c sample --merge sample --na-value UNKNOWN reads.fastq -o out_merge.fastq +``` + +**Expected output:** 5 sequences written to `out_merge.fastq`. + +```bash +# Process a dataset entirely in memory using 200 internal partitions, +# writing gzip-compressed output. +obiuniq --in-memory --chunk-count 200 --compress -o out_inmemory.fastq.gz reads.fastq +``` + +**Expected output:** 4 sequences written to `out_inmemory.fastq.gz`. + +```bash +# Dereplicate reads from two sample files with no assumed ordering between them, +# grouping by both sample and primer attributes. +obiuniq --no-order -c sample -c primer sample1.fastq sample2.fastq -o out_multifile.fastq +``` + +**Expected output:** 4 sequences written to `out_multifile.fastq`. 
+ +--- + +# SEE ALSO + +- `obigrep` — filter dereplicated sequences by abundance, length, or annotation +- `obiannotate` — add or modify annotations on dereplicated records +- `obicount` — count sequences or groups in a dataset +- `obiclean` — remove sequencing artefacts from a dereplicated dataset +- `obisummary` — summarise annotation distributions across a sequence set + +--- + +# NOTES + +For datasets that do not fit in RAM, `obiuniq` uses temporary disk-backed +chunk files by default. The number of chunks is controlled by `--chunk-count` +(default 100). Increasing this value lowers per-chunk memory requirements; +decreasing it reduces I/O at the cost of higher peak memory. Use `--in-memory` +only when the full working set fits in available RAM, as exceeding memory will +degrade performance or cause out-of-memory failures. + +Singletons (sequences with abundance = 1) are a common source of noise in +amplicon sequencing, often arising from PCR or sequencing errors. The +`--no-singleton` flag is therefore recommended for most metabarcoding +workflows, unless the study design requires retaining all observed variants. + +When the `--category-attribute` option is used, records that lack the +specified attribute are grouped together under the `--na-value` placeholder +(default `"NA"`). This ensures that all records participate in dereplication +without being silently dropped, but users should be aware that heterogeneous +records with different missing attributes may be unintentionally merged. diff --git a/autodoc/docmd/pkg.md b/autodoc/docmd/pkg.md new file mode 100644 index 0000000..ea669fd --- /dev/null +++ b/autodoc/docmd/pkg.md @@ -0,0 +1,48 @@ +# `neural-ensemble` — A Lightweight Library for Modular Neural Ensemble Learning + +The `neural-ensemble` package provides tools to build, train, evaluate, and deploy ensembles of neural networks with minimal boilerplate. 
It emphasizes modularity, reproducibility, and scalability—supporting both homogeneous (e.g., multiple ResNets) and heterogeneous ensembles (mix of CNNs, Transformers, MLPs)—while offering unified interfaces for data handling, training orchestration, and uncertainty quantification. + +## Core Functionalities + +### 1. **Model Composition** +- `Ensemble`: A container class to manage multiple models (heterogeneous or homogeneous), supporting dynamic model registration, weighted averaging, voting, and stacking. +- `ModelConfig`: A dataclass to declaratively specify model architecture (e.g., backbone, input shape), training hyperparameters, and checkpoint paths. + +### 2. **Training & Orchestration** +- `EnsembleTrainer`: Handles distributed or sequential training of ensemble members, with support for early stopping, learning rate scheduling per member, and custom loss weighting. +- `TrainerCallback`: Abstract base for implementing logging, checkpointing, or metric tracking hooks. + +### 3. **Data Handling** +- `EnsembleDataset`: Wraps any PyTorch-compatible dataset and automatically replicates inputs across all ensemble members (with optional per-member augmentation). +- `EnsembleDataModule`: Lightning-compatible data module for seamless integration with PyTorch Lightning workflows. + +### 4. **Inference & Aggregation** +- `EnsemblePredictor`: Provides `.predict()` and `.forward_ensemble()`, supporting: + - *Hard/soft voting* (classification) + - *Mean/variance aggregation* (regression) + - *Monte Carlo dropout & deep ensembles* for uncertainty estimation +- `UncertaintyMetrics`: Computes ECE, NLL, Brier score, and predictive entropy. + +### 5. **Evaluation & Calibration** +- `EnsembleEvaluator`: Runs comprehensive evaluation across members and the ensemble, reporting per-member vs. aggregate metrics. +- `CalibrationWrapper`: Applies temperature scaling or isotonic regression to calibrate ensemble outputs. + +### 6. 
**Serialization & Deployment** +- `Ensemble.save()` / `.load()`: Persists full ensemble state (weights, configs) to disk. +- `Ensemble.to_torchscript()`: Exports the ensemble for production inference (e.g., via TorchServe or ONNX). + +## Key Design Principles +- **Minimal dependencies**: Built on top of PyTorch, with optional integrations (Lightning, HuggingFace). +- **No hidden state**: All ensemble behavior is controlled via explicit configuration. +- **Extensible hooks**: Custom aggregation rules, losses, or metrics can be injected via inheritance. + +## Example Workflow +```python +ensemble = Ensemble([ + ModelConfig(backbone="resnet18", input_shape=(3, 224, 224)), + ModelConfig(backbone="vit_b_16", input_shape=(3, 224, 224)), +]) +trainer = EnsembleTrainer(ensemble=ensemble) +trainer.fit(train_loader, val_loader) +preds, uncertainties = EnsemblePredictor(ensemble).predict(test_loader, return_uncertainty=True) +``` diff --git a/autodoc/docmd/pkg/obialign/alignment.md b/autodoc/docmd/pkg/obialign/alignment.md new file mode 100644 index 0000000..8d02e24 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/alignment.md @@ -0,0 +1,22 @@ +# `obialign` Package: Sequence Alignment Utilities + +The `obialign` package provides core functions for pairwise biological sequence alignment in Go, designed to work with `obiseq.BioSequence` objects. + +- **Core Alignment Construction**: `_BuildAlignment()` and `BuildAlignment()` reconstruct aligned sequences from a precomputed alignment path (e.g., output by dynamic programming). It supports gap characters and reuses buffers for efficiency. + +- **Quality-Aware Consensus Building**: `BuildQualityConsensus()` generates a consensus sequence from an alignment and per-base quality scores: + - At mismatches, it retains the higher-quality base. + - When qualities are equal and bases differ, an IUPAC ambiguity code is used (via `_FourBitsBaseCode`/`_Decode`). 
+ - Quality values are combined and adjusted for mismatches using a Phred-like error probability model. + - Optionally records mismatch statistics in sequence attributes. + +- **Performance & Memory Efficiency**: Uses preallocated buffers (via `PEAlignArena`) or fallback allocation, with slice recycling to minimize GC pressure. + +- **Metadata Handling**: Preserves sequence IDs and definitions in output; supports optional mismatch reporting for downstream analysis. + +- **Alignment Path Format**: The path is a sequence of signed integers encoding: + - Negative steps → deletions in seqB (insertion in A), + - Positive steps → insertions in B, + - Consecutive pairs encode match/mismatch runs. + +This package is part of the OBITools4 ecosystem, targeting high-throughput amplicon or metagenomic data processing. diff --git a/autodoc/docmd/pkg/obialign/backtracking.md b/autodoc/docmd/pkg/obialign/backtracking.md new file mode 100644 index 0000000..b817120 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/backtracking.md @@ -0,0 +1,30 @@ +# Semantic Description of `obialign` Backtracking Module + +The `_Backtracking` function implements a **traceback algorithm** for sequence alignment, reconstructing the optimal path through an alignment matrix. + +## Core Functionality + +- **Input**: + - `pathMatrix`: Encodes alignment decisions (match/mismatch/gap) as integers. + - `lseqA`, `lseqB`: Lengths of sequences A and B. + - `path`: Pre-allocated slice to store the traceback path. + +- **Output**: A compact representation of alignment steps, alternating between: + - Diagonal moves (`ldiag`): Matches/mismatches (one step in both sequences). + - Horizontal/vertical moves (`lleft` or `lup`): Gaps in sequence B (horizontal) or A (vertical). + +## Algorithm Highlights + +- **Reverse traversal** from `(lseqA−1, lseqB−1)` to origin. +- **Batching logic**: Consecutive gaps in same direction are aggregated (e.g., `lleft += step`) to compress run-length encoding. 
+- **Path reconstruction**: Steps are pushed *backwards* into the `path` slice using a moving pointer `p`. +- **Memory efficiency**: Uses `slices.Grow()` to preallocate space and logs resizing for debugging. + +## Encoded Path Semantics + +Each pair in the returned slice encodes: +- `[diag_count, move_type]`, where `move_type` is either a gap length (`lleft > 0`: horizontal, or `lup < 0`: vertical) or zero (end of diagonal run). + +## Use Case + +Enables efficient reconstruction and serialization of alignment paths—ideal for tools requiring low-level control over dynamic programming backtracking (e.g., pairwise aligners, edit-distance decompositions). diff --git a/autodoc/docmd/pkg/obialign/dnamatrix.md b/autodoc/docmd/pkg/obialign/dnamatrix.md new file mode 100644 index 0000000..d6b231f --- /dev/null +++ b/autodoc/docmd/pkg/obialign/dnamatrix.md @@ -0,0 +1,26 @@ +# Semantic Description of `obialign` Package + +This Go package provides core utilities for **DNA sequence alignment scoring**, leveraging probabilistic models and log-space computations to ensure numerical stability. + +## Key Functionalities + +- **Four-bit nucleotide encoding**: Uses `_FourBitsBaseCode` (implied but not shown) to encode DNA bases as 4-bit values, enabling bitwise operations for fast comparison. + +- **Bitwise match ratio (`_MatchRatio`)**: Computes a normalized overlap score between two encoded bases by counting shared bits, adjusting for presence/absence in each operand. + +- **Log-space arithmetic helpers**: + - `_Logaddexp`: Stable computation of `log(exp(a) + exp(b))`. + - `_Log1mexp`, `_Logdiffexp`: Accurate log-domain operations for `log(1 − exp(a))` and `log(exp(a) − exp(b))`, critical for probability transformations. + +- **Match/mismatch scoring (`_MatchScoreRatio`)**: + - Derives log-probability-based scores for observed matches/mismatches using Phred-quality inputs (`QF`, `QR`). + - Incorporates base composition priors (e.g., uniform 4-mer assumption via `log(3)`, `log(4)`). 
+ +- **Precomputed scoring matrices**: + - `_NucPartMatch`: Precomputes match ratios for all base-pair combinations. + - `_NucScorePartMatch{Match,Mismatch}`: Stores integer-scaled alignment scores (×10) for all Phred-quality pairs, enabling fast lookup during dynamic programming. + +- **Thread-safe initialization**: + - `_InitDNAScoreMatrix` ensures one-time setup of all matrices using a mutex guard, preventing race conditions. + +All computations are designed for high performance and numerical robustness in large-scale sequence alignment tasks. diff --git a/autodoc/docmd/pkg/obialign/fastlcs.md b/autodoc/docmd/pkg/obialign/fastlcs.md new file mode 100644 index 0000000..fd61f92 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/fastlcs.md @@ -0,0 +1,23 @@ +# Semantic Description of `obialign` Package + +The `obialign` package provides low-level utilities for efficiently encoding, decoding, and manipulating alignment-related metrics—specifically **score**, **path length**, and an **out-flag**—within compact 64-bit integers. This design supports high-performance operations in sequence alignment pipelines (e.g., OBITools4). + +- **Core Encoding Strategy**: + A `uint64` encodes three fields: a *score* (upper bits), an inverted path *length*, and a single-bit flag indicating whether the value represents an "out" (i.e., terminal/invalid) state. + +- **`encodeValues(score, length int, out bool)`**: + Packs `score`, `-length-1` (to preserve ordering via unsigned comparison), and the `out` flag into one integer. The most significant bit (bit 32) marks out-values. + +- **`decodeValues(value uint64)`**: + Reverses encoding: extracts score, reconstructs original length via `((value + 1) ^ mask)`, and checks the out-flag. + +- **Utility Bitwise Helpers**: + - `_incpath(value)`: decrements stored length (since it's negated, subtraction increases actual path). + - `_incscore(value)`: increments score by `1 << wsize`. 
+ - `_setout(value)`: clears the out-flag, marking value as *not* terminal. + +- **Predefined Constants**: + - `_empty`: neutral state (score=0, length=0). + - `_out`/`_notavail`: sentinel values for invalid or unavailable paths (high length, score=0). + +This compact representation enables fast comparisons and updates during dynamic programming or alignment graph traversal—critical for scalability in large-scale metabarcoding analyses. diff --git a/autodoc/docmd/pkg/obialign/fastlcsegf.md b/autodoc/docmd/pkg/obialign/fastlcsegf.md new file mode 100644 index 0000000..f85d22a --- /dev/null +++ b/autodoc/docmd/pkg/obialign/fastlcsegf.md @@ -0,0 +1,42 @@ +# Semantic Description of `obialign` Package + +The `obialign` package provides high-performance functions for computing the **Longest Common Subsequence (LCS)** between two biological sequences, with support for error tolerance and end-gap-free alignment. + +## Core Algorithm + +- Implements a **Needleman-Wunsch** dynamic programming algorithm optimized for speed and memory efficiency. +- Uses bit-packed encoding (`uint64`) to store score, path length, and gap status in a compact form. +- Leverages **diagonal banding** to restrict computation only within the allowed error margin, reducing time and space complexity. + +## Scoring Scheme + +- **Match**: +1 point +- **Mismatch or gap (indel)**: 0 points + +## Key Functions + +1. `FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int, int)` + - Computes LCS score and alignment length between raw byte sequences. + - If `endgapfree` is true, ignores leading/trailing gaps (useful for read alignment). + - Returns `(score, length, end_position)`; `end_position` marks where the LCS ends in sequence A. + - Returns `-1, -1, -1` if the actual error count exceeds `maxError`. + +2. `FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer ...)` + - Wrapper for `FastLCSEGFScoreByte` with end-gap-free mode enabled by default. 
+ - Designed for standard biosequence inputs. + +3. `FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer ...)` + - Computes standard LCS (including end gaps). Returns `(score, alignment_length)`. + +## Features + +- **Error-bounded**: Supports `maxError = -1` (unlimited) or a fixed max number of mismatches + gaps. +- **Memory-efficient**: Reuses user-provided or auto-created buffers to avoid allocations during repeated calls. +- **IUPAC-aware**: Uses `obiseq.SameIUPACNuc()` to handle ambiguous nucleotide codes (e.g., `R`, `Y`). +- **Optimized for short reads**: Particularly suited to high-throughput sequencing data alignment tasks (e.g., in OBITools4). + +## Use Cases + +- Molecular barcode/UMI clustering +- Read-to-reference alignment in amplicon sequencing +- Similarity filtering of biological sequences diff --git a/autodoc/docmd/pkg/obialign/fourbitsencode.md b/autodoc/docmd/pkg/obialign/fourbitsencode.md new file mode 100644 index 0000000..c0cb104 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/fourbitsencode.md @@ -0,0 +1,15 @@ +# Semantic Description of `obialign` Package + +The `obialign` package provides low-level utilities for efficient nucleotide sequence encoding and decoding, specifically designed for bioinformatics alignment tasks. + +- **Core functionality**: Encodes IUPAC nucleotide symbols (including ambiguous codes like `R`, `Y`, `N`) into compact 4-bit binary representations. +- **Binary encoding scheme**: Each bit in a byte corresponds to one canonical nucleotide: A (bit 0), C (bit 1), G (bit 2), T (bit 3). +- **Ambiguity support**: Codes like `R` (A/G) set both corresponding bits (`0b0101`). Fully ambiguous `N` sets all four bits (`0b1111`). +- **Gap/missing handling**: Symbols `.` and `-`, as well as non-nucleotide characters, map to `0b0000`. +- **Memory efficiency**: The encoding avoids allocations via optional buffer reuse. 
+- **Lookup tables**: + - `_FourBitsBaseCode`: Maps ASCII nucleotide characters (lowercased via `nuc & 31`) to their binary code. + - `_FourBitsBaseDecode`: Inverse mapping for human-readable output (not exported, used internally). +- **Integration**: Works with `obiseq.BioSequence`, a generic biological sequence container from the OBITools4 ecosystem. + +The `Encode4bits` function enables fast, space-efficient sequence processing—ideal for high-throughput sequencing data where alignment speed and memory usage are critical. diff --git a/autodoc/docmd/pkg/obialign/is_d0_or_d1.md b/autodoc/docmd/pkg/obialign/is_d0_or_d1.md new file mode 100644 index 0000000..47a0661 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/is_d0_or_d1.md @@ -0,0 +1,19 @@ +## `obialign` Package: Semantic Overview (≤50 lines) + +The `obialign` package provides a lightweight, high-performance utility for **detecting single-edit-distance relationships** between biological sequences (`obiseq.BioSequence`). Its core function, `D1Or0`, determines whether two sequences are either **identical** or differ by exactly **one substitution, insertion, or deletion (indel)**. + +- `abs[k]`: A generic helper computing absolute values for integers or floats (via Go generics). +- `D1Or0(...)`: Returns a 4-tuple: + - **`int` (first)**: `0` if identical, `1` if differing by one edit, `-1` otherwise. + - **`int` (second)**: Position of the differing site (`-1` if identical). + - **`byte`, `byte`**: Mismatched characters (or `'-'` for gaps indicating indels). + +**Algorithmic strategy:** +1. Early rejection if length difference exceeds 1. +2. Forward scan until first mismatch → identifies left boundary of divergence. +3. Backward scan from ends to find rightmost match boundary. +4. Validates whether the mismatch region allows exactly one edit: + - Single substitution: equal lengths, single divergent position. + - Insertion/deletion: length differs by 1 and only one non-overlapping character remains. 
+ +Designed for speed in **OTU/ASV dereplication or error correction** pipelines (e.g., metabarcoding), where rapid filtering of near-identical sequences is critical. Does *not* compute full alignments; optimized for binary decision-making under strict edit constraints. diff --git a/autodoc/docmd/pkg/obialign/locatepattern.md b/autodoc/docmd/pkg/obialign/locatepattern.md new file mode 100644 index 0000000..410766e --- /dev/null +++ b/autodoc/docmd/pkg/obialign/locatepattern.md @@ -0,0 +1,29 @@ +# `LocatePattern` Functionality Overview + +The `obialign.LocatePattern` function implements a **local alignment algorithm** to find the best approximate match of a short DNA pattern (e.g., primer) within a longer biological sequence, using **dynamic programming**. + +- **Input**: + - `id`: identifier for logging/error reporting. + - `pattern []byte`: the query sequence (e.g., primer). + - `sequence []byte`: the target read/contig. + +- **Constraints**: + - Pattern must be strictly shorter than the sequence (`len(pattern) < len(sequence)`). + +- **Scoring Scheme**: + - Match: `+0` (using IUPAC compatibility via `obiseq.SameIUPACNuc`). + - Mismatch/Gap: `-1`. + +- **Algorithm Features**: + - End-gap free alignment (no penalty for gaps at sequence ends), enabling flexible primer positioning. + - Uses a flattened buffer (`buffIndex`) for memory-efficient matrix storage (width × height). + - Tracks alignment path via `path` array: diagonal (`0`, match/mismatch), up (`+1`, deletion in pattern/left gap), left (`-1`, insertion/deletion). + - Backtracks from the bottom-right to find optimal local alignment start/end coordinates. + +- **Output**: + - `start`: starting index in `sequence`. + - `end+1`: ending index (exclusive) of best match. + - Error count: `-score`, i.e., number of mismatches/gaps in alignment. + +- **Use Case**: + Designed for high-throughput amplicon processing (e.g., primer trimming in metabarcoding pipelines like OBITools4). 
diff --git a/autodoc/docmd/pkg/obialign/pairedendalign.md b/autodoc/docmd/pkg/obialign/pairedendalign.md new file mode 100644 index 0000000..67699f3 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/pairedendalign.md @@ -0,0 +1,37 @@ +# Semantic Description of `obialign` Package + +The `obialign` package provides high-performance, memory-efficient tools for **pairwise alignment of paired-end biological sequences**, optimized specifically for Next-Generation Sequencing (NGS) data. + +## Core Functionalities + +### 1. **Memory Arena Management** +- `PEAlignArena` is a reusable memory buffer to avoid repeated allocations during multiple alignments. +- Preallocates matrices (`scoreMatrix`, `pathMatrix`), alignment buffers, and auxiliary structures based on expected max sequence lengths. + +### 2. **Dynamic Programming Alignment Functions** +Implements three specialized global alignment variants using Needleman–Wunsch with affine gap penalties (scaled per mismatch): + +- **`PELeftAlign`**: Free gaps at the *start* of `seqB` and end of `seqA`. Ideal for aligning overlapping reads where the first read starts before or within the second. +- **`PERightAlign`**: Free gaps at start of `seqA` and end of `seqB`. Suited when the second read extends beyond the first. +- **`PECenterAlign`**: Free gaps at both ends of *both* sequences; requires `seqA ≥ seqB`. Designed for full overlap scenarios (e.g., merging paired-end reads). + +All use column-major matrix storage and efficient index arithmetic via helper functions `_GetMatrix`, `_SetMatrices`, etc. + +### 3. **Scoring & Quality Integration** +- Pairwise base/quality scores computed by `_PairingScorePeAlign`, combining: + - Nucleotide compatibility (via precomputed `_NucPartMatch`) + - Phred quality scores (`_NucScorePartMatchMatch`, `_NucScorePartMatchMismatch`) + - A user-defined `scale` factor to modulate mismatch penalties. + +### 4. 
**Fast Heuristic Pre-Alignment** +The main `PEAlign` function integrates a kmer-based fast pre-screening: +- Uses 4-mer indexing (`obikmer.Index4mer`) and shift estimation via `FastShiftFourMer`. +- If overlap is significant (`fastCount + 3 < over`), performs localized DP only on the predicted overlapping region (using `PELeftAlign` or `PERightAlign`) to save time. +- Otherwise, computes full alignment over entire sequences (both left and right variants), selecting the best score. + +### 5. **Backtracking & Path Output** +- `_Backtracking` reconstructs the optimal alignment path from `pathMatrix`. +- Paths encoded as alternating `(offset, length)` pairs for aligned segments (diagonal = 0), with gaps encoded as `-1`/`+1`. + +### Use Case +Designed for **paired-end read merging**, overlap detection, and consensus building in metagenomic pipelines (e.g., OBITOOLS4 ecosystem). Efficient, scalable for large batch processing via arena reuse. diff --git a/autodoc/docmd/pkg/obialign/readalign.md b/autodoc/docmd/pkg/obialign/readalign.md new file mode 100644 index 0000000..d97bb40 --- /dev/null +++ b/autodoc/docmd/pkg/obialign/readalign.md @@ -0,0 +1,58 @@ +# Semantic Description of `obialign.ReadAlign` + +The `ReadAlign` function performs **paired-end read alignment** with quality-aware scoring, optimized for overlapping consensus construction in NGS data processing. + +## Core Functionality + +- **Input**: Two biological sequences (`seqA`, `seqB`) as `BioSequence` objects, plus alignment parameters: + - `gap`: gap penalty (linear) + - `scale`: scaling factor for quality scores + - `delta`: extension buffer around initial overlap estimate + - `fastScoreRel`: use relative vs absolute k-mer matching score + +## Algorithm Overview + +1. **Preprocessing & Initialization** + - Ensures DNA scoring matrix is initialized (`_InitDNAScoreMatrix`). + +2. **Fast Overlap Estimation via 4-mer Indexing** + - Builds a k-mer index of `seqA` using `obikmer.Index4mer`. 
- Computes optimal shift via `_FastShiftFourMer` in both forward and reverse-complement orientations. + - Selects orientation (direct or reversed) yielding highest k-mer match count (`fastCount`) and score (`fastScore`). + +3. **Overlap Computation** + - Determines overlap length `over` based on shift: + ```text + over = |seqA| - shift if shift > 0 + |seqB| + shift if shift < 0 + min(|seqA|,|seqB|) otherwise + ``` + +4. **Dynamic Programming Alignment** + - If overlap is *not* identical (`fastCount + 3 < over`): + - Extracts subregions with `delta`-buffered boundaries. + - Calls either `_FillMatrixPeLeftAlign` (left-aligned case) or `_FillMatrixPERightAlign`. + - Backtracks via `_Backtracking` to produce alignment path. + - Else (near-perfect overlap): + - Skips DP; computes score directly from quality scores using `_NucScorePartMatchMatch`. + - Returns trivial path `[extra5, partLen]`. + +## Output + +Returns: + +| Index | Type | Meaning | +|-------|----------|---------| +| 0️⃣ | `int` | Final alignment score (weighted by quality) | +| 1️⃣ | `[]int` | Alignment path (list of positions: `[startA, endA, startB, endB]` or similar) | +| 2️⃣ | `int` | K-mer match count (`fastCount`) | +| 3️⃣ | `int` | Overlap length (`over`) | +| 4️⃣ | `float64` | K-mer-based score (`fastScore`) | +| 5️⃣ | `bool` | Whether alignment was performed in direct orientation (`true`) or on reverse-complement of `seqB` | + +## Key Design Highlights + +- **Efficient pre-filtering** using 4-mers avoids full DP for nearly identical reads. +- **Quality-aware scoring**, leveraging Phred scores via `_NucScorePartMatchMatch`. +- Supports **asymmetric overlaps** (left/right alignment) with boundary padding (`delta`). +- Uses preallocated memory arenas to minimize GC pressure in high-throughput pipelines. 
diff --git a/autodoc/docmd/pkg/obiapat/pattern.md b/autodoc/docmd/pkg/obiapat/pattern.md new file mode 100644 index 0000000..9f1bc29 --- /dev/null +++ b/autodoc/docmd/pkg/obiapat/pattern.md @@ -0,0 +1,25 @@ +# Apat Package: Pattern Matching for Biological Sequences + +The `obiapat` Go package provides high-performance pattern matching over biological sequences using the **Apat algorithm**, a C-based implementation wrapped in Go. It supports fuzzy matching (with mismatches and indels), reverse-complement patterns, memory-safe resource management via finalizers, and efficient filtering of non-overlapping matches. + +## Core Types + +- `ApatPattern`: Represents a compiled pattern (up to 64 bp), supporting IUPAC ambiguity codes (`W`, `[AT]`), negated bases (`!A`), and fixed positions (`#`). +- `ApatSequence`: Wraps a biological sequence (from `obiseq.BioSequence`) for fast matching, with optional circular topology support and memory recycling. + +## Key Functions & Methods + +- `MakeApatPattern(pattern string, errormax int, allowsIndel bool)`: Compiles a pattern with max error tolerance and optional indels. +- `ReverseComplement()`: Returns the reverse-complemented pattern (useful for DNA strand symmetry). +- `FindAllIndex(...)`: Returns all matches as `[start, end, errors]`, supporting partial sequence searches. +- `IsMatching(...)`: Boolean check for presence of at least one match in a region. +- `BestMatch(...)`: Finds the *best* (lowest-error) match, with local realignment for indel-containing patterns. +- `FilterBestMatch(...)`: Returns *non-overlapping* matches, prioritizing lower-error occurrences. +- `AllMatches(...)`: Filters and refines all valid matches (including indel-aware alignment). +- `Free()`, `Len()`: Explicit memory cleanup and length queries. + +## Implementation Notes + +Internally, the package uses `cgo` to interface with C structures (`Pattern`, `Seq`) allocated via custom memory management. 
Finalizers ensure safe deallocation, while unsafe pointer arithmetic avoids data copying during search (e.g., `unsafe.SliceData`). Logging is integrated via Logrus. + +This package enables scalable, low-level pattern mining in NGS data preprocessing pipelines (e.g., primer detection, adapter trimming). diff --git a/autodoc/docmd/pkg/obiapat/pattern_test.md b/autodoc/docmd/pkg/obiapat/pattern_test.md new file mode 100644 index 0000000..ac5c6e6 --- /dev/null +++ b/autodoc/docmd/pkg/obiapat/pattern_test.md @@ -0,0 +1,32 @@ +# Semantic Description of `obiapat` Package Functionality + +The `obiapat` package provides utilities for constructing and representing **approximate sequence patterns**—flexible biological or symbolic string templates supporting mismatches, insertions, and deletions. + +## Core Functionality + +- **`MakeApatPattern(pattern string, errormax int, allowsIndel bool)`** + Parses a pattern specification (e.g., `"A[T]C!GT"`) and returns an internal representation (`*ApatPattern`) suitable for approximate matching. + + - `pattern`: A string where: + - Standard characters (e.g., `'A'`, `'C'`) denote exact matches. + - Brackets `[X]` indicate *optional* or *variable positions*, e.g., ambiguity (like IUPAC codes). + - Exclamation `!` marks positions where **errors** (substitutions) are permitted. + - `errormax`: Maximum number of allowed errors (mismatches or indels, depending on flags). + - `allowsIndel`: Boolean flag enabling/disabling insertion/deletion operations. + +## Behavior & Semantics + +- Returns a compiled pattern object (non-nil) on success; errors may arise from malformed input or invalid parameters. +- Supports three modes: + - **Exact matching** (`errormax = 0`, `allowsIndel = false`). + - **Substitution-only approximation** (`errormax > 0`, `allowsIndel = false`). + - **Full approximate matching with indels** (`errormax > 0`, `allowsIndel = true`). 
+ +## Testing Coverage + +The provided test suite validates: +- Valid pattern parsing across different configurations. +- Correct handling of `nil` vs. non-nil output pointers. +- Robustness against error conditions (e.g., invalid inputs would trigger expected errors). + +In summary, `obiapat` enables efficient definition and handling of *approximate regular expressions* tailored for sequence analysis in bioinformatics or pattern recognition contexts. diff --git a/autodoc/docmd/pkg/obiapat/pcr.md b/autodoc/docmd/pkg/obiapat/pcr.md new file mode 100644 index 0000000..63a1cfe --- /dev/null +++ b/autodoc/docmd/pkg/obiapat/pcr.md @@ -0,0 +1,27 @@ +# PCR Simulation Module (`obiapat`) + +This Go package implements a **PCR (Polymerase Chain Reaction) simulation algorithm** for biological sequence analysis. It supports flexible primer matching, amplicon extraction with optional flanking extensions, and handles both linear and circular DNA topologies. + +## Key Functionalities + +- **Primer Matching**: Accepts forward/reverse primers with configurable mismatch tolerance (`OptionForwardPrimer`, `OptionReversePrimer`). Internally builds pattern objects and their reverse complements. +- **Amplicon Extraction**: Identifies valid amplicons bounded by primer pairs, respecting user-defined length constraints (`OptionMinLength`, `OptionMaxLength`). +- **Extension Support**: Optionally adds fixed-length flanking regions (`OptionWithExtension`) — either strict full-extension only or partial trimming allowed. +- **Topology Handling**: Supports linear (`Circular: false`) and circular DNA sequences via `OptionCircular`. +- **Batch & Parallel Processing**: Configurable batch size (`OptionBatchSize`) and parallel workers count (`OptionParallelWorkers`), enabling efficient processing of large datasets. +- **Annotation-Rich Output**: Each amplicon includes detailed annotations (primer sequences, match positions, errors, direction), preserving original sequence metadata. 
+ +## Core API + +- `PCRSim(sequence, options...)`: Simulates PCR on a single sequence. +- `PCRSlice(sequencesSlice, options...)`: Applies simulation across multiple sequences in a slice. +- `PCRSliceWorker(options...)`: Returns a reusable worker function for parallel execution via `obiseq.MakeISliceWorker`. + +## Implementation Details + +- Uses pattern-matching (`ApatPattern`) with fuzzy search to locate primers. +- Handles circular topology by wrapping indices around sequence boundaries. +- Reuses internal memory via `MakeApatSequence`/`Free`, supporting efficient GC and large-scale processing. +- Logs critical errors with `logrus`; debug-level details for amplicon generation. + +Designed to integrate within the OBITools4 ecosystem, this module enables high-fidelity *in silico* PCR for metabarcoding and NGS data validation workflows. diff --git a/autodoc/docmd/pkg/obiapat/predicat.md b/autodoc/docmd/pkg/obiapat/predicat.md new file mode 100644 index 0000000..3790155 --- /dev/null +++ b/autodoc/docmd/pkg/obiapat/predicat.md @@ -0,0 +1,23 @@ +## Semantic Description of `IsPatternMatchSequence` + +The function `IsPatternMatchSequence` defines a **sequence predicate** for pattern-based matching in biological sequences (e.g., DNA/RNA), supporting fuzzy and strand-aware search. + +### Core Functionality: +- **Input Parameters** + - `pattern`: A regular expression-like string describing the target pattern. + - `errormax`: Maximum allowed mismatches (substitutions only by default). + - `bothStrand`: If true, also search on the reverse-complement strand. + - `allowIndels`: Enables insertion/deletion errors (beyond mismatches) when set to true. + +- **Internal Workflow** + - Parses the pattern into an automaton (`apat`) via `MakeApatPattern`. + - Computes its reverse complement for dual-strand matching. + - Returns a closure (`SequencePredicate`) that tests whether a given `BioSequence` matches the pattern (or its RC), within error tolerance. 
+ +- **Matching Logic** + - Converts input sequence to `apat` format. + - Checks match on forward strand first; if failed and `bothStrand=true`, tries reverse complement. + - Uses automaton-based matching (`IsMatching`) for efficient fuzzy search. + +### Semantic Use Case: +Enables flexible, error-tolerant detection of sequence motifs (e.g., primers, barcodes) in high-throughput sequencing data—supporting both *in silico* primer design validation and read filtering in metagenomic pipelines. diff --git a/autodoc/docmd/pkg/obichunk/chunk.md b/autodoc/docmd/pkg/obichunk/chunk.md new file mode 100644 index 0000000..f768cbf --- /dev/null +++ b/autodoc/docmd/pkg/obichunk/chunk.md @@ -0,0 +1,15 @@ +# `ISequenceChunk` Function — Semantic Description + +The `ISequenceChunk` function provides a unified interface for processing biological sequence data in chunks, supporting two execution modes: **in-memory** and **on-disk**, depending on resource constraints or performance needs. + +- It accepts an iterator over biological sequences (`obiiter.IBioSequence`) and a sequence classifier (`obiseq.BioSequenceClassifier`), used to annotate or categorize sequences. +- A boolean flag `onMemory` determines whether processing occurs in RAM (`ISequenceChunkOnMemory`) or on disk (`ISequenceChunkOnDisk`), enabling scalability for large datasets. +- Optional parameters allow fine-tuning: + - `dereplicate`: enables deduplication of identical sequences. + - `na`: specifies how missing or ambiguous values are handled (e.g., `"?"`, `"N"`, etc.). + - `statsOn`: configures what metadata (e.g., description fields) are tracked for statistics. + - `uniqueClassifier`: an optional secondary classifier used to assign unique identifiers or labels. + +The function abstracts the underlying implementation, ensuring consistent behavior regardless of storage strategy. 
It returns an iterator over processed sequences (`obiiter.IBioSequence`) or an error, supporting streaming workflows and compatibility with downstream pipeline stages. + +This design promotes flexibility, memory efficiency, and modularity in high-throughput sequence analysis pipelines (e.g., metabarcoding). diff --git a/autodoc/docmd/pkg/obichunk/chunk_on_disk.md b/autodoc/docmd/pkg/obichunk/chunk_on_disk.md new file mode 100644 index 0000000..938f243 --- /dev/null +++ b/autodoc/docmd/pkg/obichunk/chunk_on_disk.md @@ -0,0 +1,18 @@ +# `obichunk` Package: On-Disk Chunking and Dereplication of Biosequences + +The `obichunk` package provides functionality to efficiently process large sets of biological sequences by splitting them into manageable, disk-based chunks. Its core feature is the `ISequenceChunkOnDisk` function, which takes a sequence iterator and distributes sequences into temporary files using a classifier. Each file corresponds to one *batch* (e.g., `chunk_*.fastx`), enabling scalable, parallel-friendly workflows. + +Key capabilities include: + +- **Temporary Directory Management**: Automatically creates and cleans up a system temp directory (`obiseq_chunks_*`) for intermediate storage. +- **File Discovery**: Recursively finds all `.fastx` files generated during chunking via `find`. +- **Asynchronous Streaming**: Returns an iterator (`obiiter.IBioSequence`) that yields batches asynchronously, decoupling chunk creation from consumption. +- **Optional Dereplication**: When enabled (`dereplicate = true`), sequences are deduplicated *per batch* using a composite key (sequence + classification categories). Merged duplicates retain aggregated statistics. +- **Logging & Monitoring**: Logs total batch count and per-batch processing start events for transparency. 
+ +Internally, `ISequenceChunkOnDisk` uses: +- `obiiter.MakeIBioSequence()` to build the output iterator, +- `obiformats.WriterDispatcher` for parallel writing of distributed sequences into chunk files, +- and a second goroutine to read, optionally dereplicate (via `BioSequenceClassifier`), and push batches back into the output iterator. + +Designed for memory efficiency, it avoids loading all sequences in RAM by streaming and chunking on-disk—ideal for large-scale NGS data preprocessing. diff --git a/autodoc/docmd/pkg/obichunk/chunks_on_memory.md b/autodoc/docmd/pkg/obichunk/chunks_on_memory.md new file mode 100644 index 0000000..9ec3b87 --- /dev/null +++ b/autodoc/docmd/pkg/obichunk/chunks_on_memory.md @@ -0,0 +1,21 @@ +# `ISequenceChunkOnMemory` Function — Semantic Description + +The function `ISequenceChunkOnMemory`, from the Go package `obichunk`, implements **asynchronous in-memory chunking** of biological sequence data. + +It consumes an iterator over `BioSequence` objects and distributes them into **heterogeneous batches** using a provided classifier. The core purpose is to group sequences by classification (e.g., sample, taxon, or feature), store each group in memory as a slice (`BioSequenceSlice`), and emit them sequentially via an output iterator. + +Key features: +- **Parallel processing**: Each classification group (referred to as a *flux*) is processed in its own goroutine. +- **Thread-safe aggregation**: A mutex ensures safe concurrent updates to shared `chunks` and `sources` maps. +- **Lazy emission**: Batches are emitted only after all classification groups have been fully processed (`jobDone.Wait()`). +- **Ordered output**: Batches are emitted in increasing `order` index (0, 1, …), preserving determinism despite parallel internal processing. +- **Error handling**: Critical failures (e.g., channel retrieval errors) terminate the program with `log.Fatalf`. + +Input: +- An iterator (`obiiter.IBioSequence`) of raw sequences. 
+- A `*obiseq.BioSequenceClassifier`, used to route each sequence into a classification bucket. + +Output: +- A new iterator yielding `BioSequenceBatch` objects, each containing all sequences belonging to one classification group and its source identifier. + +Use case: Efficient parallel preprocessing of high-throughput sequencing data into sample- or taxon-specific batches for downstream analysis. diff --git a/autodoc/docmd/pkg/obichunk/options.md b/autodoc/docmd/pkg/obichunk/options.md new file mode 100644 index 0000000..28e263b --- /dev/null +++ b/autodoc/docmd/pkg/obichunk/options.md @@ -0,0 +1,26 @@ +# Semantic Description of `obichunk` Package + +The `obichunk` package provides a flexible and configurable options management system for data processing pipelines, particularly in the context of biological sequence analysis (e.g., metabarcoding). It defines a typed `Options` struct and associated builder-style configuration functions. + +## Core Concepts + +- **Immutable Configuration Builder**: Options are constructed via `MakeOptions([]WithOption)`, applying a list of functional setters (`WithOption`) to an internal `__options__` struct. +- **Encapsulation**: The concrete options are hidden behind a pointer (`pointer *__options__`) to ensure safe sharing and mutation control. + +## Supported Functionalities + +- **Categorization**: `OptionSubCategory(keys...)` appends category labels (e.g., sample or marker names) to an internal list; `PopCategories()` retrieves and removes the first category. +- **Missing Value Handling**: `OptionNAValue(na string)` customizes placeholder for missing data (default: `"NA"`). +- **Statistical Tracking**: `OptionStatOn(keys...)` registers statistical descriptions (via `obiseq.StatsOnDescription`) for per-field metrics collection. +- **Batch Processing Control**: + - `OptionBatchCount(number)` sets the number of batches. + - `OptionsBatchSize(size)` defines how many items per batch (default from `obidefault`). 
+- **Parallelization**: `OptionsParallelWorkers(nworkers)` configures concurrency level (default from environment). +- **Disk vs Memory Sorting**: `OptionSortOnDisk()` enables disk-backed sorting; `OptionSortOnMemory()` disables it (default). +- **Singleton Filtering**: `OptionsNoSingleton()` excludes singleton sequences; `OptionsWithSingleton()` allows them (default). + +## Design Highlights + +- Functional options pattern for extensibility and readability. +- Default values derived from `obidefault` where applicable (e.g., batch size, workers). +- Designed for integration with `obiseq` and `obidefault`, supporting scalable, reproducible NGS data workflows. diff --git a/autodoc/docmd/pkg/obichunk/subchunks.md b/autodoc/docmd/pkg/obichunk/subchunks.md new file mode 100644 index 0000000..8290bf2 --- /dev/null +++ b/autodoc/docmd/pkg/obichunk/subchunks.md @@ -0,0 +1,29 @@ +# Semantic Description of `obichunk.ISequenceSubChunk` + +The function `ISequenceSubChunk` in the `obichunk` package implements **parallel, class-based sorting and batching of biological sequences**, preserving input order within each batch while reordering across batches by classification code. + +## Core Functionality + +- **Input**: + - An iterator over `BioSequence` batches (`obiiter.IBioSequence`) + - A sequence classifier (`obiseq.BioSequenceClassifier`) assigning each sequence a numeric class code + - A number of worker goroutines (`nworkers`), defaulting to system-configured parallelism + +- **Processing**: + - Each worker consumes its own iterator split and classifier clone, enabling concurrent batch processing. + - For each incoming `BioSequenceBatch`: + - If the batch has >1 sequence: sequences are extracted, classified into `code`, and sorted *in-place* by class code. + - Consecutive sequences with the same `code` are grouped into new batches; a new batch is emitted upon code change. + - If the batch has ≤1 sequence, it’s passed through unchanged (but reordered with a new order ID). 
+ +- **Ordering Mechanism**: + - Uses `atomic.AddInt32` to assign strictly increasing order IDs (`nextOrder`) across workers, preserving deterministic inter-batch ordering. + - Sorting within batches is performed via a custom `sort.Interface` implementation using closures for flexible comparison logic (here, by ascending class code). + +- **Output**: + - Returns a new iterator (`obiiter.IBioSequence`) emitting batches grouped by classification code, with globally ordered batch IDs. + - Workers are coordinated via `newIter.Done()`/`Wait()/Close()`, ensuring clean termination. + +## Semantic Purpose + +Enables efficient, parallel **grouping of sequences by taxonomic or functional class** (e.g., OTU assignment), optimizing downstream processing that requires sorted/class-ordered input — e.g., consensus building, alignment, or read merging per group. diff --git a/autodoc/docmd/pkg/obichunk/unique.md b/autodoc/docmd/pkg/obichunk/unique.md new file mode 100644 index 0000000..1b6264a --- /dev/null +++ b/autodoc/docmd/pkg/obichunk/unique.md @@ -0,0 +1,45 @@ +# Semantic Description of `IUniqueSequence` Functionality + +The `IUniqueSequence` function performs **dereplication** of biological sequence data — i.e., grouping identical or near-identical sequences while preserving metadata and counts. It operates on an `obiiter.IBioSequenceBatch` iterator. + +## Core Workflow + +1. **Input Processing** + Accepts an input sequence iterator and optional configuration via `WithOption`. + +2. **Parallelization Strategy** + Supports configurable parallel workers (`nworkers`). When `SortOnDisk()` is enabled, it falls back to single-threaded processing for disk-based sorting. + +3. **Data Splitting Phase** + - Uses `HashClassifier` to partition input into buckets (controlled by `BatchCount`). + - Ensures deterministic chunking for reproducibility. + +4. **Storage Choice** + - *In-memory*: via `ISequenceChunkOnMemory`. 
+ - *Disk-based*: via `ISequenceSubChunk` + external sorting (requires single worker). + +5. **Uniqueness Classification** + - Builds a composite classifier combining: + - Sequence identity (`SequenceClassifier`) + - Optional annotation categories (e.g., sample, primer), with NA handling. + - If no annotations are specified, only raw sequence identity is used. + +6. **Singleton Filtering** + Optionally excludes singleton reads (count = 1) via `NoSingleton()` option. + +7. **Parallel Dereplication** + - Spawns worker goroutines to process chunks. + - Each worker applies `ISequenceSubChunk` + deduplication logic per classifier group. + +8. **Output Merging** + - Aggregates results using `IMergeSequenceBatch`, preserving: + - Sequence counts + - Statistics (if enabled) + - NA handling and ordering + +## Key Features + +- **Scalable**: Supports both memory-efficient (disk) and high-speed (RAM) modes. +- **Configurable**: Via functional options (`Options`). +- **Thread-safe**: Uses `sync.Mutex` for deterministic ordering. +- **Metadata-aware**: Incorporates annotation-based grouping (e.g., sample, primer). diff --git a/autodoc/docmd/pkg/obicorazick/worker.md b/autodoc/docmd/pkg/obicorazick/worker.md new file mode 100644 index 0000000..dc09de8 --- /dev/null +++ b/autodoc/docmd/pkg/obicorazick/worker.md @@ -0,0 +1,28 @@ +# Aho-Corasick-Based Sequence Analysis in `obicorazick` + +This Go package provides efficient pattern-matching utilities for biological sequence data, leveraging the Aho-Corasick algorithm. + +## Core Components + +- **`AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker`** + Builds *multiple* Aho-Corasick matchers in parallel (batched to manage memory), then returns a `SeqWorker` function. + - Scans each sequence *forward* and its reverse complement. + - Counts total matches (`slot`), forward-only (`_Fwd`) and reverse-complement-specific (`_Rev`) matches. + - Attaches match counts as sequence attributes. 
+ +- **`AhoCorazickPredicate(minMatches int, patterns []string) obiseq.SequencePredicate`** + Compiles a *single* matcher and returns a predicate function. + - Returns `true` if the number of matches ≥ `minMatches`. + - Useful for filtering sequences (e.g., taxonomic assignment or contamination detection). + +## Technical Highlights + +- **Batched compilation**: Large pattern sets are split into chunks (default `10⁷` patterns/batch) to avoid memory overload. +- **Parallelization**: Matcher construction uses goroutines, scaled by `obidefault.ParallelWorkers()`. +- **Progress tracking**: Optional CLI progress bar via `progressbar/v3`, enabled globally. +- **Logging & debugging**: Uses Logrus for info/debug messages; logs match counts per sequence. + +## Use Cases + +- Rapid screening of sequences against large reference databases (e.g., primers, barcodes, contaminants). +- Filtering or annotating sequences based on pattern presence/abundance. diff --git a/autodoc/docmd/pkg/obidefault/batch.md b/autodoc/docmd/pkg/obidefault/batch.md new file mode 100644 index 0000000..7802d78 --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/batch.md @@ -0,0 +1,34 @@ +# ObiDefault Package: Batch Configuration Module + +This Go module provides centralized configuration for sequence batching in Obitools, supporting both **count-based** and **memory-aware** batch processing. + +## Core Features + +- `_BatchSize` / `SetBatchSize()` + Defines and configures the *minimum* number of sequences per batch (default: `1`). + Used internally as `minSeqs` in `RebatchBySize`. + +- `_BatchSizeMax()` / `SetBatchSizeMax()` + Sets the *maximum* sequences per batch (default: `2000`). Batches are flushed upon reaching this limit, regardless of memory. + +- **CLI & Environment Integration** + Batch size is determined by `--batch-size` CLI flag and/or the `OBIBATCHSIZE` environment variable (via parsing logic not shown here but implied by comments). 
+ +- `_BatchMem()` / `SetBatchMem(n int)` + Configures the *maximum memory per batch* (default: `128 MB`). A value of `0` disables memory-based batching, falling back to pure count-based logic. + +- `_BatchMemStr()` + Stores the *raw CLI string* passed to `--batch-mem` (e.g., `"256M"`, `"1G"`), enabling human-readable input parsing elsewhere. + +## Utility Functions + +- `BatchSizePtr()`, `BatchMemPtr()` + Expose pointers to internal variables for direct modification or inter-process sharing. + +- `BatchSizeMaxPtr()`, `BatchMemStrPtr()` + Provide read/write access to max-size and raw memory string values. + +## Design Intent + +- Separates **configuration** (defaults, CLI/env parsing) from **processing logic**, enabling modular and testable batch handling. +- Supports both scalable, large-scale processing (via count limits) and memory-constrained environments (via soft RAM caps). diff --git a/autodoc/docmd/pkg/obidefault/compressed.md b/autodoc/docmd/pkg/obidefault/compressed.md new file mode 100644 index 0000000..24f21f9 --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/compressed.md @@ -0,0 +1,35 @@ +# Output Compression Control Module + +This Go package (`obidefault`) provides a simple, global configuration mechanism for toggling output compression behavior across an application. + +## Core Features + +- **Global Compression Flag**: A package-level boolean variable `__compress__` (default: `false`) controls whether output should be compressed. +- **Read Access**: + - `CompressOutput()` returns the current compression setting as a boolean. +- **Write Access**: + - `SetCompressOutput(b bool)` updates the compression flag to a new value. +- **Pointer Access**: + - `CompressOutputPtr()` returns a pointer to the internal flag, enabling indirect modification (e.g., for UI bindings or reflection-based updates). + +## Design Intent + +- Minimal, side-effect-free API. 
+- Thread-safety *not* guaranteed — intended for use in single-threaded initialization or controlled environments. +- Encapsulation via unexported variable `__compress__`, enforced through accessor functions. + +## Typical Usage + +```go +// Enable compression globally: +obidefault.SetCompressOutput(true) + +if obidefault.CompressOutput() { + // Apply compression logic (e.g., gzip, brotli) +} +``` + +## Notes + +- The double underscore prefix (`__compress__`) signals internal/private status (convention, not enforced). +- Designed for runtime configurability without recompilation. diff --git a/autodoc/docmd/pkg/obidefault/logger.md b/autodoc/docmd/pkg/obidefault/logger.md new file mode 100644 index 0000000..e7ad834 --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/logger.md @@ -0,0 +1,38 @@ +# `obidefault` Package — Semantic Overview + +This minimal Go package provides a centralized, mutable global flag for controlling warning verbosity across an application. + +## Core Functionality + +- **`__silent_warning__`**: + A package-level boolean variable (unexported) that determines whether warnings should be suppressed. + +- **`SilentWarning() bool`**: + A read-only accessor returning the current state of `__silent_warning__`. Enables safe, non-mutating checks elsewhere in the codebase. + +- **`SilentWarningPtr() *bool`**: + Returns a pointer to `__silent_warning__`, allowing external code (e.g., CLI parsers, config loaders) to directly mutate the flag — e.g., `*SilentWarningPtr() = true`. + +## Design Intent + +- **Simplicity & Centralization**: + Avoids scattering warning-control logic; provides a single source of truth. + +- **Flexibility**: + Supports both *read-only* inspection (via `SilentWarning()`) and *global mutation* (via pointer), useful for early initialization phases. + +- **Explicit Semantics**: + When `SilentWarning()` returns `true`, all warning-generating code *should* suppress output (implementation responsibility lies outside this package). 
+ +## Usage Example + +```go +// Suppress warnings globally: +*obidefault.SilentWarningPtr() = true + +if !obidefault.SilentWarning() { + log.Println("⚠️ Warning: something happened") +} +``` + +> **Note**: The double underscore prefix on `__silent_warning__` signals internal/private status, discouraging direct access. diff --git a/autodoc/docmd/pkg/obidefault/progressbar.md b/autodoc/docmd/pkg/obidefault/progressbar.md new file mode 100644 index 0000000..f0bfe93 --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/progressbar.md @@ -0,0 +1,33 @@ +# Progress Bar Control Module (`obidefault`) + +This Go package provides a simple, global mechanism to enable or disable progress bar display across an application. + +## Core Functionality + +- **`ProgressBar()`**: Returns `true` if progress bars are *enabled* (i.e., when `__no_progress_bar__` is `false`). +- **`NoProgressBar()`**: Returns the current state of `__no_progress_bar__`, i.e., whether progress bars are *disabled*. +- **`SetNoProgressBar(b bool)`**: Sets the global flag `__no_progress_bar__`. Passing `true` disables progress bars; passing `false` enables them. +- **`NoProgressBarPtr()`**: Returns a pointer to the internal `__no_progress_bar__` variable, allowing direct read/write access (e.g., for reflection or UI binding). + +## Design Intent + +- Centralizes progress bar visibility control in one place. +- Supports both boolean query/set and pointer-based manipulation for flexibility (e.g., CLI flags, config binding). +- Uses a *negative* flag name (`__no_progress_bar__`) internally to default progress bars **on** (i.e., `false` → enabled). + +## Usage Example + +```go +// Disable progress bars globally: +obidefault.SetNoProgressBar(true) + +// Check status: +if !obidefault.ProgressBar() { + log.Println("Progress bars are disabled.") +} +``` + +## Notes + +- Thread-safety is *not* guaranteed; concurrent access should be externally synchronized. 
+- The double underscore prefix (`__no_progress_bar__`) signals internal/private usage per Go convention (though not enforced). diff --git a/autodoc/docmd/pkg/obidefault/quality.md b/autodoc/docmd/pkg/obidefault/quality.md new file mode 100644 index 0000000..a01e928 --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/quality.md @@ -0,0 +1,26 @@ +# Quality Shift and Read/Write Control Module + +This Go package (`obidefault`) provides configurable controls over quality score handling in sequence data processing (e.g., FASTQ files). It defines three global variables and corresponding accessor/mutator functions: + +- `_Quality_Shift_Input`: Input quality score offset (default: `33`, i.e., Phred+33/Sanger format). +- `_Quality_Shift_Output`: Output quality score offset (default: `33`), allowing format conversion. +- `_Read_Qualities`: Boolean flag indicating whether quality scores should be parsed/processed (`true` by default). + +## Public API + +| Function | Purpose | +|---------|--------| +| `SetReadQualitiesShift(shift byte)` | Sets the quality score offset for *input* data (e.g., when reading FASTQ). | +| `ReadQualitiesShift() byte` | Returns the current input quality offset. | +| `SetWriteQualitiesShift(shift byte)` | Sets the quality score offset for *output* data (e.g., when writing FASTQ). | +| `WriteQualitiesShift() byte` | Returns the current output quality offset. | +| `SetReadQualities(read bool)` | Enables/disables reading/processing of quality scores. | +| `ReadQualities() bool` | Returns whether qualities are currently being read/used. | + +## Semantic Use Cases + +- **Format Interoperability**: Allows seamless conversion between Phred+33 (Sanger), Phred+64, or other quality encodings. +- **Performance Optimization**: Disabling `ReadQualities` skips parsing of quality strings, useful when only sequences are needed. +- **Centralized Configuration**: Global state enables consistent behavior across modules without passing parameters. 
+ +All functions are thread-unsafe by design—intended for initialization before concurrent processing begins. diff --git a/autodoc/docmd/pkg/obidefault/taxonomy.md b/autodoc/docmd/pkg/obidefault/taxonomy.md new file mode 100644 index 0000000..5f3e2ca --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/taxonomy.md @@ -0,0 +1,21 @@ +# `obidefault` Package: Configuration State Management + +This Go package provides a centralized, thread-safe(ish) configuration layer for taxonomy-related settings in the OBIDMS (Open Biological and Biomedical Data Management System) framework. It exposes simple getters, setters, and pointer accessors for four core boolean/string flags that control how taxonomic identifiers (taxids) are handled during data processing. + +## Core Configuration Flags + +- `__taxonomy__`: Stores the currently selected taxonomy (e.g., `"NCBI"`, `"UNIPROT"`). +- `__alternative_name__`: Enables/disables use of alternative taxonomic names (e.g., synonyms). +- `__fail_on_taxonomy__`: If true, processing halts on taxonomy mismatches/errors. +- `__update_taxid__`: If true, taxids are auto-updated to current NCBI/DB versions. +- `__raw_taxid__`: If true, raw (unprocessed) taxids are preserved instead of normalized. + +## Public API + +- **Getters**: `UseRawTaxids()`, `SelectedTaxonomy()`, `HasSelectedTaxonomy()`, etc., return current values. +- **Pointer Accessors**: e.g., `SelectedTaxonomyPtr()` returns a pointer for direct mutation (advanced use). +- **Setters**: `SetSelectedTaxonomy()`, `SetAlternativeNamesSelected()`, etc., update state. + +## Use Case + +Typically used at application startup to configure global behavior (e.g., `SetSelectedTaxonomy("NCBI")`, `SetUpdateTaxid(true)`), then referenced by downstream modules during data import, validation, or mapping. Minimalist and explicit—no external dependencies. 
diff --git a/autodoc/docmd/pkg/obidefault/workers.md b/autodoc/docmd/pkg/obidefault/workers.md new file mode 100644 index 0000000..204e38f --- /dev/null +++ b/autodoc/docmd/pkg/obidefault/workers.md @@ -0,0 +1,35 @@ +# Obidefault: Parallelism Configuration Module + +This Go package (`obidefault`) provides a centralized, configurable interface for managing parallel execution parameters—particularly useful in I/O- and CPU-bound workloads. + +## Core Concepts + +- **CPU-aware defaults**: Automatically detects available cores via `runtime.NumCPU()`. +- **Configurable workers per core**: + - General: `_WorkerPerCore` (default `1.0`) + - Read-specific: `_ReadWorkerPerCore` (`0.25`, i.e., ~1 reader per 4 cores) + - Write-specific: `_WriteWorkerPerCore` (`0.25`) +- **Strict overrides**: Allow hardcoding worker counts via `SetStrictReadWorker()`/`Write...`, bypassing per-core scaling. + +## Public API + +| Function | Purpose | +|---------|--------| +| `ParallelWorkers()` | Total workers = `MaxCPU() × WorkerPerCore` | +| `Read/WriteParallelWorkers()` | Resolves to strict count if set, else per-core calculation (min 1) | +| `ParallelFilesRead()` | Files read in parallel: defaults to `ReadParallelWorkers()`, overridable | +| Getters (`MaxCPU`, `WorkerPerCore`, etc.) | Expose current settings safely | +| Setters (`Set*`) | Dynamically adjust behavior at runtime | + +## Configuration Sources + +- **Command-line flags**: e.g., `--max-cpu` or `-m` +- **Environment variable**: `OBIMAXCPU` + +## Design Highlights + +✅ Decouples resource discovery from policy +✅ Supports both *proportional* (per-core) and *absolute* (strict) worker definitions +✅ Ensures non-zero defaults for critical paths (`ReadParallelWorkers` ≥ 1) + +⚠️ **Note**: `WriteParallelWorkers()` contains a likely bug—returns `_StrictReadWorker` in the else branch instead of `StrictWriteWorker`. 
diff --git a/autodoc/docmd/pkg/obidist/dist_matrix.md b/autodoc/docmd/pkg/obidist/dist_matrix.md new file mode 100644 index 0000000..7b59f93 --- /dev/null +++ b/autodoc/docmd/pkg/obidist/dist_matrix.md @@ -0,0 +1,28 @@ +# `obidist` Package: Efficient Symmetric Distance/Similarity Matrix Management + +The `*DistMatrix` type provides a memory-efficient, symmetric matrix implementation for distance or similarity data. + +- **Storage Strategy**: Only the upper triangle (i < j) is stored, reducing storage from *n²* entries to *n(n−1)/2*. +- **Diagonal Handling**: Diagonal entries are fixed (0.0 for distances, 1.0 for similarities); assignments to diagonal indices are silently ignored. +- **Symmetry Guarantee**: `Get(i, j)` and `Set(i, j, v)` automatically handle both (i,j) and (j,i), ensuring consistency. + +## Constructors + +| Function | Description | +|---------|-------------| +| `NewDistMatrix(n)` / `WithLabels(labels)` | Creates *n×n* distance matrix (diag = 0). | +| `NewSimilarityMatrix(n)` / `WithLabels(labels)` | Creates *n×n* similarity matrix (diag = 1). | + +## Core Operations + +- `Get(i, j)` / `Set(i, j, v)`: Access/update symmetric entries. +- `Size() int`, `GetLabel(i)` / `SetLabel(i, label)`: Query/mutate element labels. +- `Labels() []string`, `GetRow(i)` / `GetColumn(j)`: Retrieve full rows/columns (as copies). + +## Analysis Helpers + +- `MinDistance()`, `MaxDistance()` → `(value, i, j)` of the extremal off-diagonal entry. +- `Copy() *DistMatrix`: Deep copy for immutability-safe operations. +- `ToFullMatrix()` → `[][]float64`: Converts to dense representation (use sparingly). + +Designed for clustering, phylogenetics, or any domain requiring fast symmetric matrix access with minimal footprint. 
diff --git a/autodoc/docmd/pkg/obidist/dist_matrix_test.md b/autodoc/docmd/pkg/obidist/dist_matrix_test.md new file mode 100644 index 0000000..6d764b8 --- /dev/null +++ b/autodoc/docmd/pkg/obidist/dist_matrix_test.md @@ -0,0 +1,28 @@ +# `obidist` Package: Semantic Feature Overview + +The `obidist` Go package provides two core data structures for managing **distance** and **similarity matrices**, with built-in guarantees suitable for scientific computing (e.g., clustering, phylogenetics). Key features include: + +- **`DistMatrix`**: A symmetric `n×n` matrix representing pairwise distances, where: + - Diagonal entries are *always* `0.0` (self-distance). + - Off-diagonals obey symmetry: `dist(i, j) == dist(j, i)`. + - Automatic enforcement via dedicated `Set()`/`Get()` methods. + +- **`SimilarityMatrix`**: A symmetric matrix where: + - Diagonal entries are *always* `1.0`. + - Off-diagonals represent similarity scores (e.g., between `0` and `1`, though not enforced). + - Symmetry is similarly guaranteed. + +Both matrix types support: +- **Optional labels**: Associate human-readable identifiers (e.g., sample names) with rows/columns. +- **Safe bounds checking**: Panics on out-of-range access (tested via `defer/recover`). +- **Deep copy support**: Ensures isolation between original and copied instances. +- **Utility methods**: + - `MinDistance()` / `MaxDistance()`: Return extremal values and their indices. + - `GetRow(i)`: Retrieve a full row as a slice (symmetric copy). + - `ToFullMatrix()`: Export the matrix as an immutable 2D slice. + +Edge cases are rigorously handled: +- Empty (`n=0`) and singleton (`n=1`) matrices return `(0.0, -1, -1)` for min/max. +- Label mutations do not affect internal state via defensive copying. + +All behaviors are validated through comprehensive unit tests, emphasizing correctness and robustness. 
diff --git a/autodoc/docmd/pkg/obiformats/batch_of_files_reader.md b/autodoc/docmd/pkg/obiformats/batch_of_files_reader.md new file mode 100644 index 0000000..3bd46d9 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/batch_of_files_reader.md @@ -0,0 +1,43 @@ +# Semantic Description of `ReadSequencesBatchFromFiles` + +This function implements **concurrent, batched streaming** of biological sequences from multiple input files. + +## Core Functionality + +- **Input**: A slice of file paths (`[]string`), an optional batch reader interface, and a concurrency level. +- **Default behavior**: Uses `ReadSequencesFromFile` if no custom reader is provided. + +## Concurrency Model + +- Launches `concurrent_readers` goroutines to process files in parallel. +- Files are distributed via a shared channel (`filenameChan`) — ensuring fair load balancing. + +## Streaming Interface + +- Returns an `obiiter.IBioSequence`, a streaming iterator over batches of biological sequences. +- Internally uses an atomic counter (`nextCounter`) to assign unique, ordered IDs to sequence batches (via `Reorder`), preserving global order despite parallelism. + +## Error Handling & Logging + +- Panics on file-open failure (via `log.Panicf`). +- Logs start/end of reading per file using structured logging (`log.Printf`, `log.Println`). + +## Resource Management + +- Uses a barrier pattern: each reader goroutine calls `batchiter.Done()` upon completion. +- A finalizer goroutine waits for all readers (`WaitAndClose`) and logs termination. + +## Design Intent + +- Enables scalable, memory-efficient ingestion of large NGS datasets. +- Decouples *reading logic* (via `IBatchReader`) from orchestration — supporting pluggable formats. +- Prioritizes throughput and deterministic ordering over strict FIFO per-file semantics. + +## Key Abstractions + +| Type/Interface | Role | +|----------------|------| +| `IBatchReader` | Reader factory: `(filename, options...) 
→ SequenceIterator` | +| `obiiter.IBioSequence` | Thread-safe batch iterator (push model) | +| `AtomicCounter` | Ensures globally unique, sequential batch IDs across goroutines | + diff --git a/autodoc/docmd/pkg/obiformats/batch_reader_type.md b/autodoc/docmd/pkg/obiformats/batch_reader_type.md new file mode 100644 index 0000000..6a903cd --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/batch_reader_type.md @@ -0,0 +1,36 @@ +# `obiformats` Package — Semantic Overview + +The `obiformats` package provides a standardized interface for **format-agnostic batch reading of biological sequence data** within the OBITools4 ecosystem. + +## Core Abstraction + +- **`IBatchReader`** is a function type defining the contract for opening and iterating over sequence files: + ```go + func(string, ...WithOption) (obiiter.IBioSequence, error) + ``` +- It accepts: + - A file path (`string`) + - Optional configuration via variadic `WithOption` arguments (e.g., filtering, parsing rules) +- Returns: + - An iterator over biological sequences (`obiiter.IBioSequence`) + - Or an error if the file cannot be opened/parsed + +## Semantic Intent + +- **Decouples format handling from iteration logic**: Enables uniform consumption of FASTA, FASTQ, SAM/BAM, etc., via a single entry point. +- **Supports extensibility**: New format readers can be registered as `IBatchReader` implementations without altering client code. +- **Enables lazy, streaming access**: Sequences are yielded on-demand via the iterator—memory-efficient for large datasets. + +## Typical Usage Pattern + +1. Select or compose an `IBatchReader` implementation (e.g., for FASTQ). +2. Call it with a file path and optional options. +3. Iterate over the returned `IBioSequence` to process sequences one-by-one. + +## Design Principles + +- **Functional, minimal API**: Single responsibility—reading and iteration. +- **Option-based configurability**: Avoids combinatorial function overloading via `With...` patterns. 
+- **Integration-ready**: Built to work seamlessly with the broader OBITools4 iterator and sequence abstractions. + +> *Note: Actual format-specific readers (e.g., `NewFASTQBatchReader`) are expected to conform to this interface but reside outside the core type definition.* diff --git a/autodoc/docmd/pkg/obiformats/csv_read.md b/autodoc/docmd/pkg/obiformats/csv_read.md new file mode 100644 index 0000000..ea224e2 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/csv_read.md @@ -0,0 +1,30 @@ +# CSV Import Module for Biological Sequences (`obiformats`) + +This Go package provides functionality to parse biological sequence data from CSV files into structured objects compatible with the OBItools4 framework. + +## Core Features + +- **CSV Parsing**: Reads CSV data via `io.Reader`, supporting comments (`#`), flexible field counts, and leading-space trimming. +- **Sequence Extraction**: Identifies columns named `sequence`, `id`, or `qualities` by header and maps them to corresponding biological sequence fields. +- **Quality Score Adjustment**: Applies a configurable Phred score shift (default: `33`) to quality strings. +- **Metadata Handling**: + - Special handling for taxonomic IDs (`taxid`, `*_taxid`). + - Generic attributes parsed as JSON when possible; fallback to raw string otherwise. +- **Batched Output**: Streams sequences in configurable batches (`batchSize`) via an iterator interface (`obiiter.IBioSequence`). +- **Multiple Entry Points**: + - `ReadCSV`: From any `io.Reader`. + - `ReadCSVFromFile`: Loads from a file (with source naming derived from filename). + - `ReadCSVFromStdin`: Reads from standard input. +- **Error & Edge Handling**: + - Gracefully handles empty files/streams via `ReadEmptyFile`. + - Uses structured logging (Logrus) for fatal and informational messages. + +## Integration + +Designed to integrate with OBItools4’s core types: +- `obiseq.BioSequence`: Holds sequence, ID, qualities, taxid, and arbitrary attributes. 
+- `obiiter.IBioSequence`: Streaming interface for batched sequence iteration. + +## Use Case + +Efficient, flexible ingestion of tabular biological data (e.g., from alignment outputs or preprocessed FASTQ/FASTA conversions) into downstream analysis pipelines. diff --git a/autodoc/docmd/pkg/obiformats/csv_writer.md b/autodoc/docmd/pkg/obiformats/csv_writer.md new file mode 100644 index 0000000..ebdc212 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/csv_writer.md @@ -0,0 +1,22 @@ +# CSVSequenceRecord Function Description + +The `CSVSequenceRecord` function converts a biological sequence object (`*obiseq.BioSequence`) into a slice of strings suitable for CSV output. It dynamically constructs the record based on user-defined options (`opt Options`), enabling flexible column selection. + +## Core Features + +- **Sequence ID**: Includes the sequence identifier if `opt.CSVId()` is enabled. +- **Abundance Count**: Appends the sequence count (e.g., read depth) if `opt.CSVCount()` is true. +- **Taxonomic Information**: Adds both NCBI taxid and scientific name (retrieved from attributes or fallback via `opt.CSVNAValue()`). +- **Definition Line**: Includes the sequence definition/description if requested via `opt.CSVDefinition()`. +- **Custom Attributes**: Iterates over keys from `opt.CSVKeys()` and appends corresponding attribute values (or NA if missing). +- **Nucleotide Sequence**: Appends the raw sequence string when `opt.CSVSequence()` is enabled. +- **Quality Scores**: Converts Phred-quality scores to ASCII characters (using a configurable shift) if available; otherwise inserts NA. + +## Design Highlights + +- Uses `obiutils.InterfaceToString()` for safe type conversion of arbitrary attribute values. +- Handles missing data consistently via `opt.CSVNAValue()`. +- Supports both standard and user-defined metadata fields. +- Adapts quality encoding to common formats (e.g., Sanger/Illumina) via `obidefault.WriteQualitiesShift()`. 
+ +This function enables interoperable, configurable export of sequence data to tabular formats. diff --git a/autodoc/docmd/pkg/obiformats/csviterator.md b/autodoc/docmd/pkg/obiformats/csviterator.md new file mode 100644 index 0000000..6c7854b --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/csviterator.md @@ -0,0 +1,24 @@ +# `CSVTaxaIterator` Function — Semantic Description + +The function `CSVTaxaIterator`, part of the `obiformats` package, converts a taxonomic iterator (`*obitax.ITaxon`) into an **incremental CSV record generator** via `obiitercsv.ICSVRecord`. It enables streaming, batched export of taxonomic data to CSV format with configurable fields. + +### Core Functionality: +- **Input**: A pointer-based taxonomic iterator (`*obitax.ITaxon`) and optional configuration via `WithOption`. +- **Output**: An asynchronous CSV record iterator (`*obiitercsv.ICSVRecord`) that yields batches of records. + +### Configurable Output Fields (via options): +- `query`: Taxon-associated query identifier, if enabled (`WithPattern`). +- `taxid`: Either raw node ID (e.g., string pointer) or formatted taxon path (`WithRawTaxid` toggle). +- `parent`: Parent taxonomic ID or string representation, if enabled (`WithParent`). +- `taxonomic_rank`: Taxon rank (e.g., "species", "genus"). +- `scientific_name`: Full scientific name of the taxon. +- Custom metadata fields: Specified via `WithMetadata`, extracted from taxon metadata store. +- `path`: Full lineage path (e.g., "k__Bacteria; p__; c__..."), if enabled (`WithPath`). + +### Implementation Highlights: +- Uses **goroutines** for non-blocking push of batches and clean shutdown (`WaitAndClose`, `Done`). +- Supports **batching** (configurable via `BatchSize`) to optimize I/O. +- Dynamically builds CSV headers based on selected options before processing begins. + +### Use Case: +Efficient, memory-light conversion of large taxonomic datasets (e.g., from classification pipelines) into structured CSV for downstream analysis or reporting. 
diff --git a/autodoc/docmd/pkg/obiformats/csvtaxdump_read.md b/autodoc/docmd/pkg/obiformats/csvtaxdump_read.md new file mode 100644 index 0000000..380fe1f --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/csvtaxdump_read.md @@ -0,0 +1,27 @@ +## CSV Taxonomy Loader for OBITools4 + +This Go module provides a function `LoadCSVTaxonomy` to parse and load taxonomic data from CSV files into an internal taxonomy structure. + +### Key Features: +- **Robust CSV Parsing**: Uses Go’s `encoding/csv` with configurable options (comment lines, lazy quotes, whitespace trimming). +- **Column Mapping**: Dynamically identifies required columns: `taxid`, `parent`, `scientific_name`, and `taxonomic_rank`. +- **Error Handling**: Validates presence of all required columns; fails early with descriptive errors. +- **Taxonomy Construction**: + - Builds a hierarchical taxonomy using `obitax.Taxon` objects. + - Ensures existence of a root node; returns error otherwise. +- **Metadata Extraction**: + - Derives taxonomy name and short code (e.g., prefix before `:` in first taxid). + - Logs key metadata for traceability. +- **Scalable Design**: + - Processes records line-by-line (memory-efficient). + - Supports large datasets via streaming CSV reading. + +### Input Format: +CSV must contain exactly four columns (case-sensitive headers): +- `taxid`: Unique taxon identifier. +- `parent`: Parent taxonomic node ID (empty for root). +- `scientific_name`: Binomial or descriptive name. +- `taxonomic_rank`: e.g., *species*, *genus*. + +### Output: +Returns a fully populated `obitax.Taxonomy` object ready for downstream phylogenetic or sequence classification tasks. 
diff --git a/autodoc/docmd/pkg/obiformats/dispatcher.md b/autodoc/docmd/pkg/obiformats/dispatcher.md new file mode 100644 index 0000000..4416326 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/dispatcher.md @@ -0,0 +1,14 @@ +# Semantic Description of `obiformats.WriterDispatcher` + +The package `obiformats` provides utilities for writing biosequences (e.g., DNA/RNA/protein reads) to files in a structured, parallelized manner. Its core component is the `WriterDispatcher` function. + +- **Purpose**: Enables concurrent, classifier-guided writing of biosequence batches to multiple output files based on dynamic dispatching logic. +- **Input**: Takes a prototype filename template (`prototypename`), an `IDistribute` dispatcher (which partitions and routes sequences by classification keys), a formatting/writing function (`formater` of type `SequenceBatchWriterToFile`), and optional configuration. +- **Concurrency**: Launches one goroutine per classification category (via `dispatcher.News()`), ensuring scalable parallel writes. +- **Classification Handling**: Supports simple and composite keys (e.g., dual annotations like sample + region), parsing JSON-encoded classifier values when needed. +- **File Naming & Organization**: Substitutes keys into the prototype name, appends `.gz` if compression is enabled, and creates subdirectories (e.g., for sample groups) as required. +- **Error Handling**: Uses `log.Fatalf` to abort on unrecoverable errors (e.g., failed key parsing, directory creation issues). +- **Resource Management**: Ensures all goroutines complete before returning via `sync.WaitGroup`. +- **Extensibility**: The generic `SequenceBatchWriterToFile` type allows plugging in different output formats (e.g., FASTA, JSON) without modifying the dispatcher logic. + +In summary: `WriterDispatcher` is a high-level orchestrator for parallel, classifier-based batch writing of biological sequences to organized file outputs. 
diff --git a/autodoc/docmd/pkg/obiformats/ecopcr_read.md b/autodoc/docmd/pkg/obiformats/ecopcr_read.md new file mode 100644 index 0000000..004a6b5 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/ecopcr_read.md @@ -0,0 +1,29 @@ +# EcoPCR File Parser for Biological Sequences + +This Go package (`obiformats`) provides functionality to parse EcoPCR output files—tab-delimited CSV-like files containing amplified sequence data generated by the *EcoPCR* tool (used in metabarcoding pipelines). The parser supports two versions of the format (`v1` and `v2`) and extracts rich biological metadata alongside sequences. + +## Key Features + +- **Version Detection**: Automatically detects EcoPCR file version via the `#@ecopcr-v2` header. +- **Primer Extraction**: Reads forward and reverse primer sequences from comment lines in the file header. +- **Mode Inference**: Identifies amplification mode (e.g., `direct`, `inverted`) from header metadata. +- **Sequence Parsing**: Reads each record as a biological sequence (`obiseq.BioSequence`) with: + - Name (with deduplication support) + - Nucleotide/protein sequence + - Comment field +- **Structured Annotation**: Populates rich annotations including: + - Taxonomic hierarchy (taxid, rank, species/genus/family names) + - Primer matching info (`forward_match`, `reverse_mismatch`) + - Melting temperatures (if present in v2) + - Amplicon length and strand orientation +- **Streaming & Batching**: Returns an iterator (`obiiter.IBioSequence`) for memory-efficient, batched processing of large files. +- **File Handling**: Provides both `ReadEcoPCR` (from any `io.Reader`) and `ReadEcoPCRFromFile` convenience functions. + +## Implementation Highlights + +- Custom line reader (`__readline__`) for robust header parsing. +- CSV parser configured with `|` delimiter and comment support (`#`). +- Deduplication of sequence names using a running count suffix. +- Concurrent goroutine-based streaming to decouple I/O and processing. 
+ +This module integrates with the broader *OBItools4* ecosystem for high-throughput sequence analysis in environmental DNA studies. diff --git a/autodoc/docmd/pkg/obiformats/embl_read.md b/autodoc/docmd/pkg/obiformats/embl_read.md new file mode 100644 index 0000000..0ef2bd6 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/embl_read.md @@ -0,0 +1,17 @@ +# EMBL Format Parser for OBITools4 + +This Go package (`obiformats`) provides robust, streaming parsers for the **EMBL nucleotide sequence format**, supporting both standard and rope-based (memory-efficient) parsing. Key features: + +- **Entry Boundary Detection**: `EndOfLastFlatFileEntry()` identifies the end of EMBL entries using the signature terminator pattern `//` (with optional CR/LF), enabling chunked file processing. +- **Two Parsing Modes**: + - `EmblChunkParser()`: Line-scanning parser for buffered I/O (`io.Reader`). + - `EmblChunkParserRope()`: Direct rope-based parser for zero-copy processing of large files. +- **Configurable Options**: + - `withFeatureTable`: Includes EMBL feature table (`FH`/`FT`) lines. + - `UtoT`: Converts RNA uracil (`u/U`) to DNA thymine (`t/T`). +- **Metadata Extraction**: Captures `ID`, `OS` (scientific name), `DE` (description), and taxonomic ID (`/db_xref="taxon:..."`) into sequence annotations. +- **Sequence Handling**: Parses multi-line EMBL sequences (10-bases-per-group, with position numbers), skipping digits and whitespace. +- **Parallel Processing**: `ReadEMBL()`/`ReadEMBLFromFile()` support concurrent parsing via worker goroutines, streaming results as `BioSequenceBatch` objects. +- **Integration**: Outputs are compatible with OBITools4’s iterator framework (`obiiter.IBioSequence`) and sequence type `obiseq.BioSequence`. + +Designed for scalability, the module handles large EMBL files efficiently—ideal for metagenomic or biodiversity data pipelines. 
diff --git a/autodoc/docmd/pkg/obiformats/empty_file.md b/autodoc/docmd/pkg/obiformats/empty_file.md new file mode 100644 index 0000000..b1190f6 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/empty_file.md @@ -0,0 +1,22 @@ +## `ReadEmptyFile` Function — Semantic Description + +- **Package**: `obiformats`, part of the OBITools4 ecosystem for biological sequence handling. +- **Purpose**: Creates and returns an *empty*, closed iterator over biosequences (`IBioSequence`). +- **Signature**: + `func ReadEmptyFile(options ...WithOption) (obiiter.IBioSequence, error)` +- **Input**: Accepts variadic `WithOption` configuration functions (currently unused in this minimal implementation). +- **Behavior**: + - Instantiates a new `IBioSequence` iterator via `obiiter.MakeIBioSequence()`. + - Immediately closes the stream using `.Close()` — indicating no data will be yielded. +- **Output**: + - Returns a *terminal* iterator (no elements), suitable as a safe default or fallback. + - Error return is always `nil`, since no I/O occurs and the operation is deterministic. + +### Semantic Role & Use Cases +- **Default/Placeholder**: Useful in conditional logic where a valid (but empty) sequence iterator is required when no input file exists or parsing fails. +- **Consistency**: Ensures callers always receive a well-formed iterator, avoiding `nil` checks. +- **Resource Safety**: The closed state prevents accidental iteration or memory leaks. + +### Design Notes +- Reflects a *pure-functional* and *fail-safe* pattern: no side effects, deterministic behavior. +- Aligns with iterator-based I/O design principles in OBITools4 (lazy, composable streams). 
diff --git a/autodoc/docmd/pkg/obiformats/fastaseq_read.md b/autodoc/docmd/pkg/obiformats/fastaseq_read.md new file mode 100644 index 0000000..c4340f6 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastaseq_read.md @@ -0,0 +1,34 @@ +# FASTA Parser Module (`obiformats`) + +This Go package provides robust, streaming-capable parsing of FASTA-formatted nucleotide sequences. It supports both standard and rope-based (memory-efficient) input handling. + +## Core Functionalities + +- **`FastaChunkParser(UtoT bool)`** + Returns a parser function for in-memory byte streams. Converts `U→T` if enabled (for RNA/DNA normalization). Validates headers, identifiers, and sequences; rejects invalid characters or malformed entries. + +- **`FastaChunkParserRope(...)`** + Parses FASTA directly from a `PieceOfChunk` rope structure, avoiding full data materialization. Optimized for large files. + +- **`ReadFasta(reader io.Reader, ...)`** + High-level API to parse FASTA from any `io.Reader`. Uses chunked reading with parallel workers (configurable via options). Supports full-file batching and header annotation parsing. + +- **`ReadFastaFromFile(...)` / `ReadFastaFromStdin(...)`** + Convenience wrappers for file and stdin inputs, including source naming and empty-file handling. + +- **`EndOfLastFastaEntry(...)`** + Helper to locate the last complete FASTA entry in a buffer, enabling safe chunked streaming without splitting records. + +## Key Features + +- **Strict validation**: Ensures entries start with `>`, contain valid identifiers, and only use allowed sequence characters (`a-z`, `- . [ ]`). +- **Case normalization**: Converts uppercase to lowercase; optional `U→T` conversion. +- **Whitespace handling**: Ignores spaces/tabs in sequences, preserves line breaks only for parsing structure. +- **Parallel processing**: Configurable worker count via options; batches results by source and order for downstream sorting/aggregation. 
+- **Integration with `obiseq`/`obiiter`**: Yields typed sequence objects (`BioSequence`) and batched iterators compatible with OBITools4 pipelines. + +## Design Highlights + +- Minimal allocations via rope-based parsing (`extractFastaSeq`). +- Graceful error reporting with context (source, identifier, invalid char position). +- Extensible via `WithOption` pattern for header parsing and batching behavior. diff --git a/autodoc/docmd/pkg/obiformats/fastqseq_read.md b/autodoc/docmd/pkg/obiformats/fastqseq_read.md new file mode 100644 index 0000000..7482390 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastqseq_read.md @@ -0,0 +1,41 @@ +# FASTQ Parsing Module (`obiformats`) + +This Go package provides robust, streaming-capable parsing of FASTQ files — a standard format for storing nucleotide sequences along with quality scores. + +## Core Functionalities + +- **`EndOfLastFastqEntry(buffer []byte) int`** + Locates the start position (`@`) of the last complete FASTQ entry in a byte buffer using state-machine scanning from end to beginning. Returns `-1` if no valid entry found. + +- **`FastqChunkParser(...)`** + Returns a parser function for processing FASTQ data from an `io.Reader`. Handles: + - Header parsing (`@id [definition]`) + - Sequence normalization (uppercase → lowercase, `U→T` conversion if enabled) + - Quality score shifting (`quality_shift`) + - Strict validation (e.g., `+` line, matching sequence/length) + +- **`FastqChunkParserRope(...)`** + Optimized parser for rope-based input (`PieceOfChunk`), avoiding unnecessary memory copies. Uses direct line-by-line scanning. + +- **Batched File Parsing (`_ParseFastqFile`, `ReadFastq`, etc.)** + Enables concurrent, chunked parsing of large files: + - Splits input into chunks using `ReadFileChunk` + - Uses configurable parallel workers (`nworker`) + - Pushes parsed batches to an iterator interface + +- **Convenience I/O Wrappers** + - `ReadFastqFromFile(filename, ...)`: Parses a file by name. 
+ - `ReadFastqFromStdin(...)`: Reads FASTQ from standard input. + +## Key Options & Features + +- **Quality handling**: Optional quality extraction (`with_quality`), configurable offset (`quality_shift`) +- **Uracil-to-Thymine conversion**: `UtoT` flag for RNA→DNA normalization +- **Header annotation parsing**: Optional post-parsing header interpretation via `ParseFastSeqHeader` +- **Batch sorting & full-file mode**: Supports both streaming and complete-file aggregation + +## Design Highlights + +- **Memory-efficient chunking** with overlap-aware boundary detection (`EndOfLastFastqEntry`) +- **Strict error reporting**: Fails fast on malformed FASTQ (e.g., invalid chars, length mismatch) +- **Integration with `obiseq`, `obiiter`**: Returns typed biological sequence slices and iterator streams compatible with the broader OBITools4 ecosystem. diff --git a/autodoc/docmd/pkg/obiformats/fastqseq_write_generic.md b/autodoc/docmd/pkg/obiformats/fastqseq_write_generic.md new file mode 100644 index 0000000..37bccc8 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastqseq_write_generic.md @@ -0,0 +1,11 @@ +## Semantic Description of `obiformats` Package + +The `obiformats` package provides core formatting utilities for biological sequence data in standard FASTX formats (FASTA and FASTQ). It defines two functional types: +- `BioSequenceFormater`: Converts a single biological sequence (`*obiseq.BioSequence`) into its string representation. +- `BioSequenceBatchFormater`: Converts a batch of sequences (`obiiter.BioSequenceBatch`) into raw bytes, suitable for file or stream output. + +Two main constructor functions enable flexible formatting: +- `BuildFastxSeqFormater(format, header)` returns a sequence-level formatter based on the requested format (`"fasta"` or `"fastq"`), applying optional header metadata via `FormatHeader`. 
+- `BuildFastxFormater(format, header)` builds a batch formatter by composing the sequence-level function over all sequences in an iterator-driven batch, concatenating results with newline separators. + +The package supports extensibility and type safety through function composition while integrating logging (via `logrus`) for critical errors—e.g., unsupported formats trigger a fatal log. It abstracts away low-level I/O, focusing purely on *semantic formatting logic*, making it ideal for pipeline integration in NGS data processing tools. diff --git a/autodoc/docmd/pkg/obiformats/fastseq_header.md b/autodoc/docmd/pkg/obiformats/fastseq_header.md new file mode 100644 index 0000000..1bf84c0 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_header.md @@ -0,0 +1,27 @@ +# Semantic Description of `obiformats` Package + +The `obiformats` package provides utilities for parsing sequence headers in the OBItools4 framework, supporting two distinct formats: + +- **JSON-based format** (e.g., `{"id":"seq1", ...}`): Detected by a leading `{` character. +- **Legacy OBI format** (plain text, e.g., `>seq1 description`): Used when no JSON prefix is present. + +## Core Functions + +- **`ParseGuessedFastSeqHeader(sequence *obiseq.BioSequence)`** + Dynamically routes header parsing based on the first character of the sequence definition: + - Calls `ParseFastSeqJsonHeader` if JSON-prefixed. + - Otherwise invokes `ParseFastSeqOBIHeader`. + +- **`IParseFastSeqHeaderBatch(iterator, options...) obiiter.IBioSequence`** + Applies header parsing to a *batch* of sequences: + - Takes an iterator over `BioSequence`s. + - Uses optional configuration (e.g., parallelism, parsing behavior). + - Wraps the parser in a worker pipeline via `MakeIWorker`, preserving sequence flow. + +## Design Principles + +- **Format agnosticism**: Automatically detects header type. +- **Iterator-based streaming**: Enables memory-efficient batch processing of large datasets (e.g., FASTQ/FASTA). 
+- **Extensibility**: Options pattern (`WithOption`) supports runtime customization. + +This package serves as a header-decoding layer for downstream analysis in metagenomic or metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obiformats/fastseq_interface.md b/autodoc/docmd/pkg/obiformats/fastseq_interface.md new file mode 100644 index 0000000..e42f1d6 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_interface.md @@ -0,0 +1,28 @@ +# `FormatHeader` Function Type in `obiformats` + +The `obiformats` package defines a core functional interface for sequence formatting within the OBITools4 ecosystem. + +- **Package**: `obiformats` + Provides utilities for formatting biological sequences according to various output standards (e.g., FASTA, GenBank). + +- **Type Definition**: + ```go + type FormatHeader func(sequence *obiseq.BioSequence) string + ``` + - A `FormatHeader` is a *function type* that takes a pointer to an `obiseq.BioSequence` and returns its formatted header as a string. + +- **Semantic Role**: + Encapsulates the logic for generating *header lines* (e.g., `>id description`) in sequence file formats. + Decouples header formatting from core data structures (`BioSequence`), enabling modular and reusable format adapters. + +- **Usage Context**: + - Used by writers/formatters to produce standardized headers when exporting sequences. + - Allows custom header generation (e.g., for MIxS-compliant metadata, user-defined tags). + - Supports polymorphism: different `FormatHeader` implementations can be swapped per output format. + +- **Dependencies**: + - Relies on `obiseq.BioSequence`, the core sequence data model (ID, description, annotations, etc.). + +- **Design Intent**: + Promotes clean separation of concerns: data (sequence) ↔ formatting logic. + Facilitates extensibility for new output formats without modifying core types. 
diff --git a/autodoc/docmd/pkg/obiformats/fastseq_json_header.md b/autodoc/docmd/pkg/obiformats/fastseq_json_header.md new file mode 100644 index 0000000..f4c8932 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_json_header.md @@ -0,0 +1,21 @@ +This Go package `obiformats` provides semantic parsing and serialization utilities for FASTQ/FASTA sequence headers encoded in JSON format, primarily used within the OBITools4 framework. + +- **JSON Parsing Helpers**: + It defines internal functions (`_parse_json_map_*`, `_parse_json_array_*`) to convert JSON objects/arrays into typed Go maps and slices (`map[string]string`, `[]int`, etc.), using the high-performance [`jsonparser`](https://github.com/buger/jsonparser) library for streaming parsing. + +- **Header Interpretation**: + `_parse_json_header_` interprets a FASTQ/FASTA header string containing embedded JSON metadata. It extracts and assigns: + - Core fields (`id`, `definition`, `count`) + - Specialized OBITools annotations (e.g., `"obiclean_weight"`, `"taxid"` with optional taxonomic ranks) + - Generic annotations of any JSON type (string, number, bool, array, object), preserving numeric precision where possible. + +- **Sequence Annotation Enrichment**: + `ParseFastSeqJsonHeader` parses the header of a `BioSequence`, extracting JSON metadata into its annotations map and reconstructing non-JSON text as the new definition. + +- **Serialization Support**: + `WriteFastSeqJsonHeader` and `FormatFastSeqJsonHeader` serialize sequence annotations back into JSON format, appending them to a buffer or returning as string — enabling round-trip compatibility for annotated sequences. + +- **Error Handling**: + Uses `log.Fatalf` on parsing failures, ensuring malformed headers fail fast during processing. + +In summary: *structured JSON header ↔ BioSequence annotation mapping*, optimized for metabarcoding workflows. 
diff --git a/autodoc/docmd/pkg/obiformats/fastseq_obi_header.md b/autodoc/docmd/pkg/obiformats/fastseq_obi_header.md new file mode 100644 index 0000000..711d4cf --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_obi_header.md @@ -0,0 +1,31 @@ +# OBIFormats Package: Semantic Description + +The `obiformats` package provides parsing and formatting utilities for **OBI-compliant FASTA headers**, enabling structured annotation of biological sequences. + +- It supports parsing key-value annotations embedded in sequence definitions (e.g., `key=value;`), including nested dictionaries. +- Four core parsing functions detect value types: + - `__match__key__`: Identifies assignment patterns (`Key = ...`). + - `__obi_header_value_numeric_pattern__`: Matches floats/integers (e.g., `42.0;`). + - `__obi_header_value_string_pattern__`: Matches quoted strings (e.g., `'example';`). + - `__match__dict__`: Parses balanced `{...}` blocks, handling nested structures and string delimiters. + +- Boolean detection (`__is_true__/__is_false__`) handles multiple case variants (e.g., `true`, `True`, `TRUE`). + +- The main entry point, **`ParseOBIFeatures(text string, annotations obiseq.Annotation)`,** + iteratively extracts key-value pairs from a header string and populates an `Annotation` map. + - Numeric values are stored as integers if they have no fractional part. + - Dictionary-like strings (e.g., `{'a':1,'b':2}`) are JSON-unmarshalled into typed maps: + - `*_count` → `map[string]int`, + - `merged_*` → wrapped in a statistics object (`obiseq.StatsOnValues`). + - `*_status`/`*_mutation` → `map[string]string`. + +- **`ParseFastSeqOBIHeader(sequence *obiseq.BioSequence)`** applies parsing to a sequence’s definition line, moving annotations into its metadata map and preserving leftover text. + +- **`WriteFastSeqOBIHeade(buffer *bytes.Buffer, sequence)`** serializes annotations back into OBI header format: + - Strings and booleans use `key=value;`.
+ - Maps/dicts are JSON-encoded, then single-quoted for compatibility. + - Special handling ensures `obiseq.StatsOnValues` are safely marshalled. + +- **`FormatFastSeqOBIHeader(sequence)`** returns the formatted header as a string (zero-copy via `unsafe.String` for performance). + +- Designed to interoperate with the broader OBITools4 ecosystem (`obiseq`, `obiutils`), supporting both human-readable and machine-processable sequence metadata. diff --git a/autodoc/docmd/pkg/obiformats/fastseq_read.md b/autodoc/docmd/pkg/obiformats/fastseq_read.md new file mode 100644 index 0000000..8494ae1 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_read.md @@ -0,0 +1,26 @@ +# FastSeq Reader Module — Semantic Description + +This Go package (`obiformats`) provides high-performance parsing of FASTA/FASTQ files using a C-backed library (`fastseq_read.h`). It enables streaming, batched reading of biological sequences with optional quality scores. + +## Core Features + +- **C-based FASTX parsing**: Leverages `kseq.h` via Go's cgo for efficient, low-level file/stream parsing. +- **Batched iteration**: Sequences are grouped into configurable batches (`batch_size`) for memory-efficient processing. +- **Quality score handling**: Supports FASTQ; decodes Phred quality scores using a configurable shift offset (`obidefault.ReadQualitiesShift()`). +- **Source tracking**: Each sequence carries its origin (filename or `"stdin"`), aiding provenance. +- **Header parsing hook**: Optional custom header parser (`ParseFastSeqHeader`) allows metadata extraction or transformation. +- **Full-file batching mode**: When enabled, yields a single batch containing the entire file (useful for small files or global operations). +- **Stdin & File I/O**: Two entry points: + - `ReadFastSeqFromFile(filename, ...)` for regular files. + - `ReadFastSeqFromStdin(...)` to process piped input (e.g., from upstream tools). 
+- **Error resilience**: Gracefully handles missing files, with logging (via `logrus`) for debugging. +- **Async streaming**: Uses goroutines to decouple reading from consumption, enabling concurrent pipelines. + +## Integration + +Built on top of `obitools4`’s core abstractions: +- `obiiter.IBioSequence`: Iterator interface for biological sequences. +- `obiseq.BioSequence`: Data model holding name, sequence bytes, comment, and quality. +- `obiutils`, `obidefault`: Utilities for path handling and defaults. + +Designed for scalability in high-throughput metabarcoding pipelines. diff --git a/autodoc/docmd/pkg/obiformats/fastseq_write_fasta.md b/autodoc/docmd/pkg/obiformats/fastseq_write_fasta.md new file mode 100644 index 0000000..471ff0a --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_write_fasta.md @@ -0,0 +1,35 @@ +# `obiformats` Package Overview + +The `obiformats` package provides utilities for formatting and writing biological sequences (e.g., DNA, RNA) in standard formats—primarily **FASTA**. It is designed for high-performance batch processing and supports parallel I/O, compression-aware streaming, and flexible configuration. + +## Core Formatting Functions + +- **`FormatFasta(seq, formater)`** + Converts a single `BioSequence` into a FASTA string: header (`>id description`) followed by sequence lines of up to 60 characters. + +- **`FormatFastaBatch(batch, formater, skipEmpty)`** + Efficiently formats a batch of sequences into FASTA using pre-allocated buffers and direct byte writes—avoiding intermediate strings. Empty sequences are either skipped (with warning) or cause a fatal error. + +## File Writing Functions + +- **`WriteFasta(iterator, file, options...)`** + Writes a stream of sequences to any `io.WriteCloser`. Supports: + - Parallel workers (`ParallelWorkers`) + - Chunked writing via `WriteFileChunk` + - Optional compression (e.g., gzip) + Returns a new iterator mirroring the input for pipeline chaining. 
+ +- **`WriteFastaToStdout(iterator, options...)`** + Convenience wrapper to output FASTA directly to `stdout`, with file-closing behavior configurable. + +- **`WriteFastaToFile(iterator, filename, options...)`** + Writes to a named file with: + - Truncation or append mode (`AppendFile`) + - Automatic paired-end output if `HaveToSavePaired()` is enabled + (writes reverse reads to a secondary file specified via `PairedFileName`) + +## Key Design Highlights + +- **Memory-efficient**: Uses `bytes.Buffer.Grow()` and avoids unnecessary allocations. +- **Robust error handling**: Panics on nil sequences; logs warnings/errors via `logrus`. +- **Pipeline-friendly**: Integrates with the `obiiter` iterator abstraction for streaming workflows. diff --git a/autodoc/docmd/pkg/obiformats/fastseq_write_fastq.md b/autodoc/docmd/pkg/obiformats/fastseq_write_fastq.md new file mode 100644 index 0000000..2f21993 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_write_fastq.md @@ -0,0 +1,35 @@ +# FASTQ Output Module (`obiformats`) + +This Go package provides utilities for formatting and writing biological sequence data in **FASTQ format**. It supports single-end, paired-end, batch processing, and parallelized I/O. + +## Core Functionality + +- **`FormatFastq(seq, headerFormatter)`**: Formats a single `BioSequence` into FASTQ string. +- **`FormatFastqBatch(batch, headerFormatter, skipEmpty)`**: Formats a batch of sequences efficiently with dynamic buffer growth and optional skipping/termination on empty reads. + +## Header Customization + +- Accepts a `FormatHeader` function to inject custom metadata (e.g., read group, sample ID) after the sequence identifier. + +## Writing to Streams/Files + +- **`WriteFastq(iterator, fileWriter)`**: Writes sequences from an iterator to any `io.WriteCloser`, supporting compression and parallel workers via options. +- **`WriteFastqToStdout(...)`**: Convenience wrapper for stdout output (e.g., piping). 
+- **`WriteFastqToFile(...)`**: Writes to a file, with support for: + - Append/truncate modes + - Paired-end output (splits iterator and writes to two files) + - Automatic compression via `obiutils.CompressStream` + +## Parallelization & Robustness + +- Uses goroutines to parallelize formatting/writing across multiple workers. +- Handles empty sequences gracefully: logs warning or fatal error based on `skipEmpty` option. +- Ensures ordered output via batch tracking (`Order()`) and chunked writing. + +## Integration + +Designed to work seamlessly with the `obitools4` ecosystem: +- Uses `obiiter.BioSequenceBatch`, `obiseq.BioSequence`, and logging via Logrus. +- Extensible through functional options (`WithOption`) for configuration. + +> *Efficient, scalable FASTQ output with support for high-throughput NGS workflows.* diff --git a/autodoc/docmd/pkg/obiformats/fastseq_write_with_index.md b/autodoc/docmd/pkg/obiformats/fastseq_write_with_index.md new file mode 100644 index 0000000..8ee9354 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/fastseq_write_with_index.md @@ -0,0 +1,19 @@ +# `obiformats` Package Overview + +The `obiformats` package provides semantic support for handling and validating structured data formats, particularly focused on biodiversity observation records. It offers: + +- **Format Abstraction**: Defines common interfaces and base classes for standardized biodiversity data formats (e.g., Darwin Core, OBIS-ENV). + +- **Validation Rules**: Implements semantic validation logic to ensure data integrity and compliance with community standards (e.g., required fields, controlled vocabularies). + +- **Mapping Utilities**: Includes tools for transforming records between different biodiversity data schemas (e.g., from local formats to Darwin Core). + +- **Ontology Integration**: Leverages semantic web technologies (e.g., RDF, OWL) to support interoperability and reasoning over observation metadata. 
+ +- **Type Safety**: Uses strongly-typed data models (e.g., `Occurrence`, `Event`) to reduce runtime errors and improve code clarity. + +- **Extensibility**: Designed for easy extension—new formats or standards can be added by implementing core interfaces. + +- **Test Coverage**: Includes unit and integration tests to guarantee correctness across format transformations and validations. + +The package targets biodiversity data managers, informaticians building OBIS-compatible systems, and researchers working with ecological observation datasets. diff --git a/autodoc/docmd/pkg/obiformats/file_chunk_read.md b/autodoc/docmd/pkg/obiformats/file_chunk_read.md new file mode 100644 index 0000000..94851f8 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/file_chunk_read.md @@ -0,0 +1,25 @@ +# Semantic Description of `obiformats` Package Functionalities + +The `obiformats` package provides robust, streaming-aware chunking utilities for processing large biological sequence files (e.g., FASTA/FASTQ) in a memory-efficient and parallel-friendly manner. + +- **`PieceOfChunk`**: A rope-like linked buffer structure enabling efficient concatenation and partial reading of large data streams without full materialization. Supports dynamic chaining (`NewPieceOfChunk`, `Next()`) and final packing into a contiguous slice via `Pack()`. + +- **`FileChunk`**: Encapsulates one chunk of raw data (`*bytes.Buffer`) or its rope representation, tagged with source file name and positional order for ordered downstream processing. + +- **`ChannelFileChunk`**: A typed channel (`chan FileChunk`) enabling concurrent, pipeline-style data ingestion—ideal for parallel parsing or streaming workflows. + +- **`LastSeqRecord`**: A callback type (`func([]byte) int`) used to locate the end of a complete biological record (e.g., last newline after full FASTQ entry), ensuring chunks split only at valid boundaries. 
+ +- **`ReadFileChunk()`**: Core function that: + - Reads from an `io.Reader` in configurable chunks (`fileChunkSize`); + - Uses a probe string (e.g., `"@M0"` for FASTQ) to early-exit non-matching segments and avoid unnecessary parsing; + - Extends chunks incrementally (e.g., +1 MB) until a full record boundary is found via `splitter`; + - Returns data as an ordered stream of `FileChunk`s on a channel, closing it upon EOF; + - Optionally packs rope buffers to contiguous memory (`pack` flag), balancing speed vs. RAM usage. + +- **Key semantics**: + - *Chunking by record integrity*, not fixed byte size — prevents splitting biological entries. + - *Lazy evaluation*: only reads ahead when needed to find record boundaries. + - *Streaming-first design* — supports large files without full loading into memory. + +This package is foundational for scalable, robust parsing of high-throughput sequencing data in the OBITools4 ecosystem. diff --git a/autodoc/docmd/pkg/obiformats/file_chunk_write.md b/autodoc/docmd/pkg/obiformats/file_chunk_write.md new file mode 100644 index 0000000..21cb495 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/file_chunk_write.md @@ -0,0 +1,26 @@ +# `WriteFileChunk` Function — Semantic Description + +The `WriteFileChunk` function in the `obiformats` package implements a **thread-safe, ordered chunk writer** for streaming data to an `io.WriteCloser`. It accepts a destination writer and a flag indicating whether the writer should be closed upon completion. + +- **Input**: + - `writer`: An `io.WriteCloser` (e.g., file, buffer) to which data chunks are written. + - `toBeClosed`: Boolean flag specifying if the writer should be closed after all chunks are processed. + +- **Core Behavior**: + - Launches a goroutine that consumes `FileChunk` items from an unbuffered channel (`chunk_channel`). + - Ensures **strict sequential ordering** of chunks by their `Order` field (intended for reassembly after parallel or out-of-order processing). 
+ - If a chunk arrives in order (`chunk.Order == nextToPrint`), it is immediately written. + - Out-of-order chunks are buffered in a map (`toBePrinted`) until their predecessor arrives. + +- **Buffer Management**: + - After writing an in-order chunk, the function checks for newly consecutive buffered chunks and writes them greedily (e.g., if order 2 arrives, it triggers writing of buffered orders 3,4,... as available). + +- **Error Handling**: + - Logs fatal errors on write failures or writer closure issues using `log.Fatalf`. + +- **Cleanup & Lifecycle**: + - Closes the underlying writer if requested and unregisters a pipe registration (via `obiutils`) to signal end-of-stream. + - Returns the input channel, enabling external producers to stream `FileChunk` structs. + +- **Use Case**: + Designed for robust, ordered reconstruction of large binary/data streams (e.g., sequencing reads) in OBITools4 pipelines, especially where parallel chunking and reassembly occur. diff --git a/autodoc/docmd/pkg/obiformats/genbank_read.md b/autodoc/docmd/pkg/obiformats/genbank_read.md new file mode 100644 index 0000000..7cbbe04 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/genbank_read.md @@ -0,0 +1,34 @@ +# GenBank Parser Module (`obiformats`) + +This Go package provides high-performance parsing of **GenBank flat files**, optimized for large-scale genomic data processing. It supports both rope-based (memory-efficient) and buffered I/O parsing strategies. + +## Core Functionalities + +- **State-machine parser**: Processes GenBank records through well-defined states (`inHeader`, `inEntry`, `inFeature`, etc.), ensuring robust handling of structured sections (LOCUS, DEFINITION, SOURCE, FEATURES, ORIGIN/CONTIG). +- **Rope-aware parsing** (`GenbankChunkParserRope`): Directly parses from a `PieceOfChunk` rope structure, avoiding large contiguous memory allocations—critical for chromosomal-scale sequences. 
+- **Sequence extraction**: Efficient byte-by-byte scanning of the `ORIGIN` section, compacting bases and optionally converting uracil (`u`) to thymine (`t`). +- **Metadata extraction**: Captures sequence ID, declared length (from LOCUS), scientific name (`SOURCE`), and taxonomic ID (`/db_xref="taxon:..."`). +- **Optional feature table support**: When enabled, stores raw FEATURES section content for downstream annotation processing. +- **Parallel streaming I/O**: + - `ReadGenbank()` and `ReadGenbankFromFile()` return an iterator (`obiiter.IBioSequence`) over parsed sequences. + - Supports concurrent parsing via configurable worker count, with chunked file reading and batch output. + +## Key Design Decisions + +- **Zero-copy where possible**: Rope parser avoids `Pack()` to prevent expensive reallocation. +- **Strict state validation**: Logs fatal errors on unexpected line sequences (e.g., `DEFINITION` outside entry state). +- **Fallback parsing**: Falls back to buffered I/O (`GenbankChunkParser`) when rope data is unavailable. +- **U-to-T conversion**: Optional base modification for RNA→DNA normalization (e.g., in transcriptome data). +- **Error resilience**: Warns on empty IDs but continues processing; rejects overly long lines (>100 chars) in buffered mode. + +## Output + +Returns a batched iterator of `BioSequence` objects, each containing: +- Identifier (`id`) +- Compact nucleotide sequence +- Definition line (as description) +- Source file origin +- Optional feature table bytes +- Annotations: `scientific_name`, `taxid` + +Ideal for pipelines requiring scalable, low-memory GenBank ingestion (e.g., metagenomic databases). 
diff --git a/autodoc/docmd/pkg/obiformats/json_writer.md b/autodoc/docmd/pkg/obiformats/json_writer.md new file mode 100644 index 0000000..b06ec1d --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/json_writer.md @@ -0,0 +1,27 @@ +# JSON Output Module for Biological Sequences (`obiformats`) + +This Go package provides utilities to serialize biological sequence data (from `obiseq`) into structured JSON format, supporting batch processing and parallel I/O. + +- **`JSONRecord(sequence)`**: Converts a single `BioSequence` into an indented JSON object containing: + - `"id"`: Sequence identifier. + - `"sequence"` (optional): Nucleotide/protein sequence string if present. + - `"qualities"` (optional): Quality scores as a string if available. + - `"annotations"` (optional): Metadata annotations map. + +- **`FormatJSONBatch(batch)`**: Formats a batch of sequences as JSON array elements, returning a `*bytes.Buffer`. Handles comma separation and indentation. + +- **`WriteJSON(iterator, file)`**: Writes a stream of sequences to an `io.Writer`, supporting: + - Parallel workers (configurable via options). + - Automatic compression (`gzip`/`bgzip`) if enabled. + - Proper JSON array wrapping: `[`, chunked batches, and final `]`. + - Atomic ordering to preserve sequence integrity across parallel writes. + +- **`WriteJSONToStdout()` / `WriteJSONToFile()`**: Convenience wrappers: + - Outputs to stdout or a file (with append/truncate control). + - Supports paired-end data: writes both forward and reverse reads to separate files when configured. + +- **Internal helpers**: + - `_UnescapeUnicodeCharactersInJSON()`: Fixes double-escaped Unicode in JSON output (e.g., `\\u00E9` → `\u00E9`). + - Uses chunked concurrency with `FileChunk`, ordered by batch number to ensure valid JSON structure. + +Designed for high-throughput NGS data pipelines, it ensures correctness and performance while integrating with `obitools4`'s iterator-based processing model. 
diff --git a/autodoc/docmd/pkg/obiformats/ncbitaxdump_read.md b/autodoc/docmd/pkg/obiformats/ncbitaxdump_read.md new file mode 100644 index 0000000..7d8df9b --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/ncbitaxdump_read.md @@ -0,0 +1,17 @@ +# NCBI Taxonomy Loader Module (`obiformats`) + +This Go package provides functionality to parse and load NCBI taxonomy dump files into a structured `Taxonomy` object. It supports three core file types: + +- **nodes.dmp**: Defines the taxonomic hierarchy via `taxid|parent_taxid|rank` records. +- **names.dmp**: Maps taxonomic IDs to names and name classes (e.g., "scientific name", "common name"). +- **merged.dmp**: Tracks deprecated taxonomic IDs and their replacements. + +Key features: +- Custom CSV parsing with `|` delimiter, comment support (`#`), and whitespace trimming. +- Support for loading *only scientific names* via the `onlysn` flag in `LoadNCBITaxDump`. +- Efficient buffered reading (`bufio.Reader`) for large files. +- Automatic root taxon (taxid `"1"`, i.e., *root*) assignment after loading. +- Alias resolution: deprecated taxids are mapped to current ones via `AddAlias`. +- Robust error handling with fatal logging on critical failures (e.g., missing root taxon, invalid parent references). + +The main entry point is `LoadNCBITaxDump(directory string, onlysn bool)`, which constructs a fully initialized taxonomy from NCBI dump files. Designed for integration with `obitax` and `obiutils`, it enables downstream applications (e.g., metabarcoding pipelines) to perform taxonomic queries and filtering. diff --git a/autodoc/docmd/pkg/obiformats/ncbitaxdump_readtar.md b/autodoc/docmd/pkg/obiformats/ncbitaxdump_readtar.md new file mode 100644 index 0000000..a573c3b --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/ncbitaxdump_readtar.md @@ -0,0 +1,31 @@ +## NCBI Taxonomy Archive Support in `obiformats` + +This Go package provides utilities for handling **NCBI Taxonomy dumps archived as `.tar` files**. 
+ +### Core Functionalities + +1. **Archive Validation (`IsNCBITarTaxDump`)** + - Checks whether a given `.tar` file contains all required NCBI Taxonomy dump files: `citations.dmp`, `division.dmp`, `gencode.dmp`, `names.dmp`, `delnodes.dmp`, `gc.prt`, `merged.dmp`, and `nodes.dmp`. + - Returns a boolean indicating if the archive is a complete NCBI tax dump. + +2. **Taxonomy Loading (`LoadNCBITarTaxDump`)** + - Parses the `.tar` archive and extracts key files to build a `Taxonomy` object. + - Steps include: + - **Nodes**: Loads taxonomic hierarchy (`nodes.dmp`) via `loadNodeTable`. + - **Names**: Parses scientific and common names (`names.dmp`) via `loadNameTable`, with an option to load *only scientific names* (`onlysn`). + - **Merged Taxa**: Integrates taxonomic aliases from `merged.dmp`, using `loadMergedTable`. + - Sets the root taxon to NCBI’s default (`taxid = 1`, i.e., *root*). + +3. **Integration with Other Modules** + - Uses `obiutils.Ropen`, `TarFileReader` for robust file handling. + - Leverages `obitax.Taxonomy`, a structured representation of taxonomic data. + +### Key Parameters +- `onlysn`: If true, only scientific names are loaded (reduces memory usage). +- `seqAsTaxa`: Reserved for future use; currently unused. + +### Logging & Error Handling +- Uses `logrus` to log loading progress and counts. +- Returns descriptive errors if required files or the root taxon are missing. + +> **Note**: Designed for efficient, standards-compliant ingestion of NCBI Taxonomy data in bioinformatics pipelines. diff --git a/autodoc/docmd/pkg/obiformats/newick_write.md b/autodoc/docmd/pkg/obiformats/newick_write.md new file mode 100644 index 0000000..5e86484 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/newick_write.md @@ -0,0 +1,31 @@ +# Newick Format Export Functionality in `obiformats` + +This Go package provides utilities to export taxonomic data into the **Newick format**, a standard for representing phylogenetic trees. 
+ +## Core Components + +- `Tree`: A struct modeling a node in a Newick tree, containing: + - `Children`: list of child nodes (nested trees), + - `TaxNode`: reference to a taxonomic entry (`obitax.TaxNode`), + - `Length`: optional branch length (evolutionary distance). + +- **`Newick()` methods**: + - `Tree.Newick(...)`: Recursively generates a Newick string for the subtree. + Supports optional annotations: `scientific_name`, `taxid` (with `'@'` for rank), and branch lengths. + - Package-level `Newick(...)`: Converts a full taxon set into a Newick tree string using the root node from `taxa.Sort().Get(0)`. + +- **Writing Functions**: + - `WriteNewick(...)`: Asynchronously writes the Newick representation to any `io.WriteCloser`. + - Accepts an iterator over taxa (`*obitax.ITaxon`). + - Validates single-taxonomy input. + - Applies compression (via `obiutils.CompressStream`) if configured via options (`WithOption`). + - `WriteNewickToFile(...)`: Convenience wrapper to write directly to a file. + - `WriteNewickToStdout(...)`: Outputs Newick tree to standard output. + +## Configuration Options + +Options (e.g., `WithScientificName`, `WithTaxid`, `WithRank`) control annotation content and behavior (e.g., file closing, compression). + +## Semantic Summary + +The module enables **conversion of hierarchical taxonomic datasets into structured Newick trees**, supporting rich node labeling for downstream phylogenetic or bioinformatic tools. diff --git a/autodoc/docmd/pkg/obiformats/ngsfilter_read.md b/autodoc/docmd/pkg/obiformats/ngsfilter_read.md new file mode 100644 index 0000000..985b815 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/ngsfilter_read.md @@ -0,0 +1,47 @@ +# NGSFilter Configuration Parser — Semantic Overview + +This Go package (`obiformats`) provides robust parsing and validation of NGS (Next-Generation Sequencing) filter configurations used in the OBITools4 ecosystem. 
It supports two formats: a legacy line-based text format (`ReadOldNGSFilter`) and a modern CSV-based configuration format with parameter headers. + +## Core Functionality + +- **Format Detection**: + `OBIMimeNGSFilterTypeGuesser` detects MIME type using content sniffing (via [`mimetype`](https://github.com/gabriel-vasile/mimetype)), distinguishing between `text/csv`, custom `text/ngsfilter-csv`, and plain text. + A heuristic CSV detector (`NGSFilterCsvDetector`) validates structure (consistent column count, non-empty rows). + +- **Dual Input Parsing**: + - `ReadOldNGSFilter`: Parses line-based config files (e.g., lines like `"EXP1@SAMPLE1:TAGFWD-TAGREV primer_f primer_r"`), supporting: + - Primer pairs (`forward`, `reverse`) + - Tag pairs (with optional `-` for untagged direction) + - Experiment/sample metadata + - OBIFeatures annotations (via `ParseOBIFeatures`) + - `ReadCSVNGSFilter`: Parses structured CSV files with mandatory columns: + `"experiment"`, `"sample"`, `"sample_tag"`, `"forward_primer"`, `"reverse_primer"` + Additional columns are stored as annotations. + +- **Parameter Configuration**: + A rich set of `@param` lines (in CSV or legacy format) configures global/primers-specific settings: + - `spacer`, `forward_spacer`, `reverse_spacer`: Tag-primer spacing (bp) + - `tag_delimiter` / directional variants: Symbol separating tags in sequences + - `matching`: Tag matching algorithm (e.g., exact, fuzzy) + - Error tolerance: + `primer_mismatches`, `forward_mismatches`, `reverse_mismatches` (max mismatches) + `tag_indels`, `forward_tag_indels`, etc. (allow indel errors) + - Indel handling: + `indels` / directional variants (`true/false`) to enable/disable indels in primer matching + +- **Validation & Integrity Checks**: + - `CheckPrimerUnicity`: Ensures each primer pair is defined only once. + - Duplicate tag-pair detection per marker (error on reuse). + - Strict column/field validation with informative error messages.
+ +- **Logging & Observability**: + Uses `logrus` for detailed info/warnings (e.g., parameter application, skipped unknown params). + +## Design Highlights + +- **Extensibility**: New parameters can be added via `library_parameter` map. +- **Robustness**: Handles BOM, line continuation (`ReadLines`), CSV quirks (lazy quotes, comments). +- **Semantic Clarity**: Separates *data* (samples/markers/tags) from *configuration* (parameters). +- **Integration Ready**: Returns a validated `obingslibrary.NGSLibrary` ready for downstream processing. + +> **Use Case**: Enables reproducible, metadata-rich NGS filtering setups in metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obiformats/options.md b/autodoc/docmd/pkg/obiformats/options.md new file mode 100644 index 0000000..356bb51 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/options.md @@ -0,0 +1,14 @@ +# Semantic Description of `obiformats` Package Functionalities + +The `go` package `obiformats` provides a flexible, configuration-driven framework for handling biological sequence data (e.g., FASTA/FASTQ) and associated metadata. Its core component is the `Options` type, which encapsulates user-defined settings via an immutable configuration pattern using functional setters (`WithOption`). + +Key capabilities include: +- **I/O control**: file handling options (e.g., `OptionCloseFile`, `OptionsAppendFile`), compression support (`OptionsCompressed`), and batch processing modes (e.g., `FullFileBatch`, custom `BatchSize`). +- **Parallelism & performance tuning**: configurable number of workers (`OptionsParallelWorkers`) and memory buffer size (via `TotalSeqSize`). +- **Sequence parsing/formatting**: pluggable header parsers/writers for FASTA/FASTQ (e.g., `OptionsFastSeqHeaderParser`, `OptionFastSeqDoNotParseHeader`), with support for quality scores (`OptionsReadQualities`). 
+- **CSV export**: granular control over columns (ID, sequence, quality, taxon, count), separators (`CSVSeparator`), NA values (`CSVNAValue`), and auto-inferred keys (`CSVAutoColumn`). +- **Taxonomic metadata integration**: toggles for taxid, scientific name, rank, path (with/without root), parent relationships (`OptionsWithTaxid`, `OptionWithoutRootPath`), and U→T conversion for ambiguous bases. +- **Advanced features**: feature table inclusion (`WithFeatureTable`), pattern matching support (`OptionsWithPattern`), and paired-end read handling via `WritePairedReadsTo`. +- **Metadata extensibility**: arbitrary metadata fields can be attached via `OptionsWithMetadata`, with automatic cleanup (e.g., removal of `"query"` when pattern mode is active). + +All options are initialized with sensible defaults (e.g., `batch_size`, `parallel_workers`) and can be composed using the `MakeOptions` constructor. This design enables declarative, reusable configuration across sequence processing pipelines in OBITools4. diff --git a/autodoc/docmd/pkg/obiformats/rope_scanner.md b/autodoc/docmd/pkg/obiformats/rope_scanner.md new file mode 100644 index 0000000..90fdb1c --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/rope_scanner.md @@ -0,0 +1,27 @@ +# `ropeScanner` — Line-by-Line Text Scanning over a Rope Data Structure + +The `obiformats` package provides the `ropeScanner`, an efficient line-oriented iterator over a *Rope* (a tree-based immutable string representation, implemented here as `PieceOfChunk`). This scanner supports streaming large texts without full materialization. + +## Core Functionality + +- **`newRopeScanner(rope *PieceOfChunk)`** + Constructs a new scanner starting at the root of the rope. + +- **`ReadLine() []byte`** + Returns the next line (without trailing `\n`, or `\r\n`) as a byte slice. + - Returns `nil` when the end of the rope is reached. + - Reuses internal buffers (`carry`) to handle lines spanning multiple nodes efficiently. 
+ - The returned slice aliases rope data and is only valid until the next call. + +- **`skipToNewline()`** + Advances internal position to just after the next newline (`\n`), discarding content. Useful for skipping unwanted lines or headers. + +## Implementation Highlights + +- **Buffered carry-over**: Lines split across rope nodes are assembled incrementally in the `carry` buffer, which grows dynamically. +- **Cross-platform line endings**: Automatically strips `\r\n`, leaving only the content (no trailing CR). +- **Zero-copy where possible**: When a line fits entirely within one node and no carry exists, it returns a slice directly into the rope’s underlying data. + +## Use Case + +Ideal for parsing large text files or streams (e.g., OBIE/Obi formats) where memory efficiency and streaming behavior are critical—without loading the entire content into RAM. diff --git a/autodoc/docmd/pkg/obiformats/taxonomy_read.md b/autodoc/docmd/pkg/obiformats/taxonomy_read.md new file mode 100644 index 0000000..ad78f73 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/taxonomy_read.md @@ -0,0 +1,34 @@ +# Taxonomy Loading Module (`obiformats`) + +This Go package provides semantic functionality to automatically detect and load taxonomic data from various file formats. It supports flexible, format-agnostic taxonomy ingestion via a unified interface. + +## Core Features + +1. **Format Detection** + - `DetectTaxonomyFormat(path)` identifies the taxonomy source format by inspecting file type (directory, MIME-type), filename patterns, or structure. + - Supports: + • NCBI Taxdump (both directory and `.tar` archive) + • CSV files (`text/csv`) + • FASTA/FASTQ sequences (via `mimetype` detection) + +2. **Modular Loaders** + - Returns a typed `TaxonomyLoader` function, enabling deferred loading with configurable options (`onlysn`, `seqAsTaxa`). + - Each loader abstracts format-specific parsing (e.g., NCBI `nodes.dmp`, FASTA header taxonomy extraction). + +3. 
**Sequence-Based Taxonomy Extraction** + - For sequence files (FASTA/FASTQ), taxonomy is inferred from headers or associated metadata, using `ExtractTaxonomy()`. + +4. **Integration with OBITools Ecosystem** + - Leverages `obitax.Taxonomy` as the canonical output structure. + - Uses custom MIME-type registration (`obiutils.RegisterOBIMimeType()`) for robust detection of bioinformatics formats. + +5. **Error Handling & Logging** + - Graceful failure with descriptive errors; informative logging via `logrus`. + +## Usage Flow + +```go +// Go has no named arguments: onlysn=true, seqAsTaxa=false +tax, err := LoadTaxonomy("path/to/data", true, false) +``` + +The module enables interoperability across taxonomic data sources in metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obiformats/universal_read.md b/autodoc/docmd/pkg/obiformats/universal_read.md new file mode 100644 index 0000000..b5ed840 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/universal_read.md @@ -0,0 +1,26 @@ +# OBIFORMATS Package: Semantic Description + +The `obiformats` package provides robust, format-agnostic sequence reading capabilities for biological data in the OBITools4 ecosystem. + +It supports automatic detection and parsing of common bioinformatics file formats via MIME-type inference: +- **FASTA** (`text/fasta`): identified by lines starting with `>`. +- **FASTQ** (`text/fastq`): detected via leading `@` characters. +- **ecoPCR2**: recognized by the header line `#@ecopcr-v2`. +- **EMBL** (`text/embl`): detected by lines starting with `ID `. +- **GenBank** (`text/genbank`): identified by either `LOCUS ` or legacy `"Genetic Sequence Data Bank"` headers. +- **CSV** (`text/csv`): generic tabular support. + +Core functionality is exposed through: +- `OBIMimeTypeGuesser()`: inspects the first ~1 MiB of an input stream to infer MIME type using `github.com/gabriel-vasile/mimetype`, while preserving unread data for downstream processing. 
+- `ReadSequencesFromFile()`: reads sequences from a file path, infers format via MIME detection, and dispatches to dedicated parsers (e.g., `ReadFasta`, `ReadFastq`). +- `ReadSequencesFromStdin()`: convenience wrapper to read from stdin, treating `"-"` as filename and auto-closing the stream. + +Internally leverages: +- `obiutils.Ropen()` for unified file opening (including stdin handling). +- Path extension stripping and source tagging via `OptionsSource()`. +- Logging (`logrus`) for format diagnostics. +- Iterator interface (`obiiter.IBioSequence`) to abstract sequential access over sequences. + +The package ensures extensibility: new formats can be added by extending the `switch` dispatch in `ReadSequencesFromFile()` and registering corresponding MIME types. + +Error handling covers empty files, invalid streams, and unsupported formats via explicit logging or fatal exits. diff --git a/autodoc/docmd/pkg/obiformats/universal_write.md b/autodoc/docmd/pkg/obiformats/universal_write.md new file mode 100644 index 0000000..a9c4434 --- /dev/null +++ b/autodoc/docmd/pkg/obiformats/universal_write.md @@ -0,0 +1,29 @@ +# `obiformats` Package: Sequence Writing Utilities + +This Go package provides utilities for writing biological sequence data to files or standard output in FASTA/FASTQ formats. + +## Core Functionality + +- **`WriteSequence()`**: + Main dispatcher that detects sequence quality data and writes either FASTQ (if qualities present) or FASTA. + - Accepts an `IBioSequence` iterator, a writable stream (`io.WriteCloser`), and optional configuration. + - Preserves iterator state via `PushBack()` to allow chaining. + +- **`WriteSequencesToStdout()`**: + Convenience wrapper writing sequences to `stdout`. Automatically closes the output stream. + +- **`WriteSequencesToFile()`**: + Writes sequences to a specified file. Supports: + - File creation/truncation or append mode (`OptionAppendFile()`). 
+ - Paired-end output: writes mate pairs to a second file if `OptionSavePaired()` is enabled. + +## Design Highlights + +- **Format-Aware Dispatch**: Automatically selects FASTQ vs. FASTA based on presence of quality scores (`HasQualities()`). +- **Iterator Preservation**: Ensures non-consumed sequences remain available after write operations. +- **Error Handling & Logging**: Uses `logrus` for fatal errors during file I/O; returns structured error codes. +- **Configurable Options**: Extensible via `WithOption` pattern (e.g., append mode, paired-end handling). + +## Integration + +Designed for use within the OBITools4 ecosystem—works with `obiiter.IBioSequence` iterators to support streaming, memory-efficient processing of large sequencing datasets. diff --git a/autodoc/docmd/pkg/obifp/uint128.md b/autodoc/docmd/pkg/obifp/uint128.md new file mode 100644 index 0000000..04ff385 --- /dev/null +++ b/autodoc/docmd/pkg/obifp/uint128.md @@ -0,0 +1,13 @@ +## Uint128 Type in `obifp`: Semantic Overview + +This Go package defines a custom 128-bit unsigned integer type (`Uint128`) composed of two `uint64` limbs (high and low). It provides comprehensive arithmetic, comparison, bitwise operations, and type conversions. + +- **Basic Constructors**: `Zero()`, `MaxValue()` initialize the smallest/largest possible values. +- **State Checks**: `IsZero()`, and equality/comparison methods (`Equals`, `Cmp`, `<`, `>`, etc.) enable conditional logic. +- **Type Casting**: Safe conversions to/from smaller (`Uint64`, `uint64`) and larger (`Uint256`) integer types, with overflow warnings where applicable. +- **Arithmetic**: Full support for addition (`Add`, `Add64`), subtraction (`Sub`), multiplication (`Mul`, `Mul64`) — with panic on overflow. +- **Division & Modulo**: Integer division (`Div`, `Div64`) and remainder (`Mod`, `Mod64`), implemented via optimized quotient-remainder pairs (`QuoRem`, `QuoRem64`) using hardware-assisted 64-bit operations. 
+- **Bit Manipulation**: Left/right shifts (`LeftShift`, `RightShift`), and bitwise logic: AND, OR, XOR, NOT. +- **Utility**: Direct access to low limb via `AsUint64()`. + +All operations preserve 128-bit precision, with strict overflow checking for correctness in high-precision contexts (e.g., bioinformatics counting). diff --git a/autodoc/docmd/pkg/obifp/uint128_test.md b/autodoc/docmd/pkg/obifp/uint128_test.md new file mode 100644 index 0000000..906402a --- /dev/null +++ b/autodoc/docmd/pkg/obifp/uint128_test.md @@ -0,0 +1,17 @@ +# `obifp.Uint128` Package — Semantic Feature Overview + +This Go package provides a 128-bit unsigned integer type (`Uint128`) with comprehensive arithmetic, comparison, and bitwise operations. Internally represented as two `uint64` limbs (`w1`: high, `w0`: low), it supports: + +- **Arithmetic Operations** + - `Add`, `Sub`, `Mul` (128×128), and `Mul64` (scalar multiplication) + - Division: `Div`, `Mod`, and combined quotient/remainder via `QuoRem` (and their 64-bit variants) +- **Comparison & Equality** + - `Cmp`, `Equals`, `LessThan`/`GreaterThan`, and their inclusive variants (`≤`, `≥`) + - Support for comparing against both `Uint128` and native `uint64` values +- **Bitwise Operations** + - Logical AND (`And`), OR (`Or`), XOR (`Xor`) between two `Uint128`s + - Bitwise NOT (`Not`) — inverts all bits of the value +- **Conversion & Utility** + - `AsUint64()` safely truncates to lower 64 bits (assumes upper limb is zero) + +All operations handle overflow/underflow correctly, including carry propagation in addition and borrow handling in subtraction. Tests cover edge cases: zero values, max `uint64` boundaries (e.g., wrapping in addition/subtraction), and large multiplications. Designed for cryptographic or high-precision numeric use where native integer types are insufficient. 
diff --git a/autodoc/docmd/pkg/obifp/uint256.md b/autodoc/docmd/pkg/obifp/uint256.md new file mode 100644 index 0000000..be264d9 --- /dev/null +++ b/autodoc/docmd/pkg/obifp/uint256.md @@ -0,0 +1,30 @@ +# Uint256 Type and Operations — Semantic Overview + +The `obifp` package provides a custom 256-bit unsigned integer type (`Uint256`) implemented in Go, composed of four 64-bit limbs (`w0` to `w3`). It supports arithmetic, comparison, bitwise operations, and safe casting with overflow detection. + +- **Core Representation**: `Uint256` stores values as four 64-bit words, enabling arbitrary-precision unsigned integers up to $2^{256} - 1$. + +- **Utility Methods**: + - `Zero()` / `MaxValue()`: Return the neutral and maximum values. + - `IsZero()`, `Equals(v)`, comparison methods (`LessThan`, etc.): Enable logical and ordering checks. + +- **Casting & Conversion**: + - `Uint64()`, `Uint128()` downcast with warnings on overflow. + - `Set64(v)`: Initializes from a standard `uint64`. + - `AsUint64()`: Direct access to least-significant limb. + +- **Bitwise Operations**: + - `And`, `Or`, `Xor`, `Not`: Standard bitwise logic per limb. + +- **Shifts**: + - `LeftShift(n)` / `RightShift(n)`: Multi-limb shifts with carry propagation. + +- **Arithmetic**: + - `Add(v)`, `Sub(v)` / `Mul(v)`: Use Go’s `math/bits` for carry-aware operations; panic on overflow. + - `Div(v)`: Implements long division via repeated subtraction of shifted multiples; panics on zero divisor. + +- **Safety & Logging**: + - Warnings via `obilog.Warnf` for silent overflows during narrowing casts. + - Panics on arithmetic overflow or division-by-zero using `log.Panicf`. + +This type is suitable for cryptographic, genomic (OBITools), or high-precision counting use cases requiring precise control over large unsigned integers. 
diff --git a/autodoc/docmd/pkg/obifp/uint64.md b/autodoc/docmd/pkg/obifp/uint64.md new file mode 100644 index 0000000..22915c2 --- /dev/null +++ b/autodoc/docmd/pkg/obifp/uint64.md @@ -0,0 +1,34 @@ +# Uint64 Type Functionalities Overview + +The `obifp` package provides a custom `Uint64` type wrapping Go’s native 64-bit unsigned integer (`uint64`) to support arithmetic, bitwise operations, and type conversions in a structured way. + +## Core Operations + +- **`Zero()` / `MaxValue()`**: Returns the zero and maximum representable values, respectively. +- **`IsZero()` / `Equals(v)`**: Checks if the value is zero or equal to another. +- **`Cmp(v)`, `LessThan(v)`**, etc.: Standard comparison operations returning `-1/0/+1` or boolean results. + +## Arithmetic with Overflow Detection + +- **Add/Sub/Mul**: Performs 64-bit addition, subtraction, and multiplication. + - Uses `math/bits` for low-level operations (`bits.Add64`, etc.). + - Panics on overflow (carry ≠ 0), enforcing strict safety. + +## Bitwise Operations + +- **`And`, `Or`, `Xor`, `Not()`**: Standard bitwise logic operations. +- **`LeftShift(n)` / `RightShift(n)`**: + - Shifts bits left/right by *n* positions. + - Uses internal `LeftShift64`/`RightShift64`, supporting *carry-in* for multi-word arithmetic. + +## Extended Precision Conversions + +- **`Uint128()` / `Uint256()`**: Casts the 64-bit value into larger unsigned integer types (zero-extended). +- **`Set64(v)`**: Reassigns the internal value from a raw `uint64`. + +## Utility & Logging + +- **`AsUint64()`**: Extracts the underlying `uint64`. +- **Warning on overflow in shift operations** (e.g., shifts ≥ 128 bits) via `obilog.Warnf`. + +> Designed for use in high-precision or cryptographic contexts where explicit overflow handling and type safety are critical. 
diff --git a/autodoc/docmd/pkg/obifp/unint.md b/autodoc/docmd/pkg/obifp/unint.md new file mode 100644 index 0000000..94d8cc3 --- /dev/null +++ b/autodoc/docmd/pkg/obifp/unint.md @@ -0,0 +1,32 @@ +# Obifp Package: Generic Fixed-Point Unsigned Integer Operations + +This Go package (`obifp`) provides a generic, type-safe interface for fixed-point unsigned integer arithmetic over three size variants: `Uint64`, `Uint128`, and `Uint256`. + +## Core Interface: `FPUint[T]` + +The interface defines a unified API for unsigned integer types, supporting: + +- **Initialization & Conversion**: + - `Zero()`, `Set64(v)`: Create zero or set from a `uint64`. + - `AsUint64()`: Downcast to standard `uint64`. + +- **Logical Operations**: + - Bitwise: `And`, `Or`, `Xor`, `Not`. + - Shifts: `LeftShift(n)`, `RightShift(n)`. + +- **Arithmetic**: + - Addition (`Add`), subtraction (`Sub`), multiplication (`Mul`). Division is commented out—likely reserved for future implementation. + +- **Comparison**: + - Full ordering: `<`, `<=`, `>`, `>=`. + +- **Utility Predicates**: + - `IsZero()` for zero-checking. + +## Helper Functions + +- `ZeroUint[T]`: Returns the neutral element (zero) for type `T`. +- `OneUint[T]`: Constructs value 1 via `Set64(1)`. +- `From64[T]`: Converts a standard Go `uint64` into the generic type. + +All operations are **method-chaining friendly** (return `T`, not pointers), enabling fluent syntax. The design promotes correctness and performance in cryptographic or financial contexts where large, fixed-size integers are required. diff --git a/autodoc/docmd/pkg/obigraph/graph.md b/autodoc/docmd/pkg/obigraph/graph.md new file mode 100644 index 0000000..8de9a1b --- /dev/null +++ b/autodoc/docmd/pkg/obigraph/graph.md @@ -0,0 +1,30 @@ +# `obigraph` Package: Semantic Overview + +The `obigraph` package provides a generic, type-safe undirected/directed graph implementation in Go. 
Its core features include: + +- **Generic Graph Structure**: Parametrized over vertex type `V` and edge data type `T`, enabling flexible use with arbitrary user-defined types. +- **Bidirectional Edge Tracking**: Maintains both forward (`Edges`) and reverse (`ReverseEdges`) adjacency maps for efficient neighbor/parent queries. +- **Edge Management**: + - `AddEdge`: Adds an *undirected* edge (inserted in both directions). + - `AddDirectedEdge`: Adds a *directed* edge (only one direction). + - `SetAsDirectedEdge`: Converts an existing undirected edge into a directed one by removing the reverse link. +- **Graph Queries**: + - `Neighbors(v)`: Returns all adjacent vertices (outgoing in directed case). + - `Parents(v)`: Returns incoming neighbors via reverse adjacency. + - `Degree(v)` / `ParentDegree(v)`: Compute vertex degrees (total or incoming). +- **Customizable Vertex/Edge Properties**: + - `VertexWeight`, `EdgeWeight`: Funcs to assign weights (default: constant weight = 1.0). + - `VertexId`: Custom vertex label generator (default: `"V%d"`). + +- **GML Export**: + - `Gml(...)` / `WriteGml(...)`: Generates or writes a Graph Modelling Language (GML) representation. + - Supports directed/undirected modes, degree-based filtering (`min_degree`), and visual styling: + - Vertex shape: `circle` if weight ≥ threshold, else `rectangle`. + - Size scaled by square root of vertex weight. + - Uses Go’s `text/template` for rendering. + +- **File I/O**: Directly writes GML to file via `WriteGmlFile(...)`. + +- **Logging & Safety**: Uses Logrus for bounds-checking errors; panics on template parsing/writing failures. + +The package is designed for lightweight, high-performance graph modeling and visualization-ready export. 
diff --git a/autodoc/docmd/pkg/obigraph/graphbuffer.md b/autodoc/docmd/pkg/obigraph/graphbuffer.md new file mode 100644 index 0000000..98441b9 --- /dev/null +++ b/autodoc/docmd/pkg/obigraph/graphbuffer.md @@ -0,0 +1,14 @@ +# `obigraph.GraphBuffer` Feature Overview + +The `GraphBuffer[V, T]` type provides a **thread-safe graph construction interface** using buffered edge insertion via Go channels. + +- **Asynchronous Edge Addition**: Edges are enqueued through a `chan Edge[T]`, processed in the background by a goroutine that updates an underlying static graph (`Graph[V, T]`). +- **Non-blocking API**: `AddEdge` and `AddDirectedEdge` are non-synchronous — they send to the channel without waiting for graph mutation, enabling high-throughput edge ingestion. +- **Graph Initialization**: `NewGraphBuffer` initializes both the graph and a dedicated worker goroutine to consume edges. +- **GML Export Support**: Full support for exporting the final graph in [Graph Modelling Language (GML)](https://en.wikipedia.org/wiki/Graph_Modelling_Language), with optional filtering (`min_degree`) and layout parameters (`threshold`, `scale`). +- **File & Stream Output**: Methods `WriteGml` and `WriteGmlFile` allow writing GML to any `io.Writer`, including files. +- **Resource Cleanup**: The explicit `Close()` method terminates the worker goroutine by closing the channel, ensuring clean shutdown. +- **Generic Design**: Fully generic over vertex (`V`) and edge data types (`T`), supporting arbitrary value semantics. + +> ⚠️ **Note**: The buffer is *not* safe for concurrent `AddEdge` calls without external synchronization beyond channel semantics. +> ✅ Ideal for producer-consumer patterns where edges are streamed from multiple goroutines into a single graph. 
diff --git a/autodoc/docmd/pkg/obiiter/batch.md b/autodoc/docmd/pkg/obiiter/batch.md new file mode 100644 index 0000000..2b15b28 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/batch.md @@ -0,0 +1,29 @@ +# BioSequenceBatch: A Container for Ordered Biological Sequences + +`BioSequenceBatch` is a structured data type encapsulating an ordered collection of biological sequences (`obiseq.BioSequenceSlice`) along with metadata: a `source` identifier and an integer `order`. It serves as a lightweight, immutable-friendly container for batch processing in bioinformatics pipelines. + +## Core Properties +- **`source`**: String identifying the origin (e.g., file, pipeline stage). +- **`order`**: Integer defining processing sequence or priority. +- **`slice`**: Holds the actual sequences via `obiseq.BioSequenceSlice`. + +## Key Functionalities +- **Construction**: + `MakeBioSequenceBatch(source, order, sequences)` creates a new batch. +- **Accessors**: + `Source()`, `Order()` return metadata; `Slice()` exposes the sequence slice. +- **Mutation (via copy)**: + `Reorder(newOrder)` returns a new batch with updated order. +- **Size & emptiness**: + `Len()` gives sequence count; `NotEmpty()` checks non-emptiness. +- **Consumption**: + `Pop0()` removes and returns the first sequence (FIFO behavior). +- **Safety**: + `IsNil()` detects uninitialized batches; a global `NilBioSequenceBatch` sentinel exists. + +## Design Notes +- Instances are value types (struct), enabling safe copying. +- Operations follow Go idioms: methods return updated values rather than mutating in place (except internal slice mutation via `Pop0`). +- Designed for interoperability with the OBITools4 ecosystem (`obiseq` package). + +This abstraction supports modular, traceable sequence processing workflows—ideal for pipeline stages where ordering and provenance matter. 
diff --git a/autodoc/docmd/pkg/obiiter/batchiterator.md b/autodoc/docmd/pkg/obiiter/batchiterator.md new file mode 100644 index 0000000..978ce58 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/batchiterator.md @@ -0,0 +1,47 @@ +# `obiiter`: Stream-Based Biosequence Iterator Library + +This Go package provides a concurrent, batch-oriented iterator for processing large collections of biological sequences (`BioSequence`), designed for high-throughput NGS data pipelines. + +## Core Functionality + +- **Batched Streaming**: Reads sequences in configurable batches (`BioSequenceBatch`) via a channel-based iterator. +- **Thread Safety**: Uses `sync.WaitGroup`, RWMutex, and atomic flags for safe concurrent access. +- **Lazy Evaluation**: Iteration is on-demand via `Next()`/`Get()`, supporting memory-efficient processing. + +## Iterator Management + +- **Construction**: `MakeIBioSequence()` initializes a new iterator with default settings. +- **Lifecycle Control**: + - `Add(n)`, `Done()`: Track active workers (like goroutines). + - `Lock/RLock` and `Unlock/RUnlock`: Explicit synchronization. + - `Wait()` / `Close()`, `WaitAndClose()`: Graceful shutdown. + +## Batch Transformation & Reorganization + +- **`Rebatch(size)`**: Redistributes sequences into fixed-size batches (requires sorting). +- **`RebatchBySize(maxBytes, maxCount)`**: Dynamic batching respecting memory and count limits. +- **`SortBatches()`**: Ensures batches are emitted in strict order (by `order` field). +- **Concatenation & Pooling**: + - `Concat(...)`: Sequentially merges multiple iterators. + - `Pool(...)`: Interleaves batches from several sources (preserves order via renumbering). + +## Filtering & Predicate-Based Processing + +- **`FilterOn(pred, size)`**: Applies a sequence predicate in parallel (configurable workers), recycling discarded sequences. +- **`FilterAnd(pred, size)`**: Same as `FilterOn`, but also checks paired-end consistency. 
+- **`DivideOn(pred, size)`**: Splits input into two iterators (`true`, `false`) based on predicate. + +## Utility & Analysis + +- **`Load()`**: Collects all sequences into a single slice (for small datasets). +- **`Count(recycle)`**: Returns `(variants, reads, nucleotides)`. +- **`Consume()` / `Recycle()`**: Drains iterator, optionally triggering sequence recycling. +- **`CompleteFileIterator()`**: Reads entire remaining file as one batch. + +## Additional Features + +- Supports **paired-end data** via `MarkAsPaired()` / `IsPaired()`. +- Batch ordering preserved for downstream reproducibility. +- Integrates with OBITools4’s `obidefault`, `obiutils` for config and resource management. + +> Designed for scalability, low memory footprint, and composability in bioinformatics workflows. diff --git a/autodoc/docmd/pkg/obiiter/distribute.md b/autodoc/docmd/pkg/obiiter/distribute.md new file mode 100644 index 0000000..d9c648c --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/distribute.md @@ -0,0 +1,32 @@ +# `IDistribute`: Semantic Description of Biosequence Distribution Functionality + +The `IDistribute` type implements a thread-safe mechanism for distributing biosequences into classified, batched outputs. + +- **Core Purpose**: Enables concurrent processing of sequences by routing them to dedicated output channels based on classification keys. + +- **Key Fields**: + - `outputs`: A map from integer class codes to output streams (`IBioSequence`). + - `news`: An unbuffered channel emitting class codes when new output streams are created. + - `classifier`: A pointer to a sequence classifier used to assign sequences to keys during distribution. + +- **Thread Safety**: All access to shared state (`outputs`, `slices`) is synchronized via a mutex. + +- **Batching Strategy**: + - Sequences are accumulated per class key until either `BatchSizeMax()` sequences or `BatchMem()` bytes (per key) are reached. + - Batches are flushed automatically and on finalization. 
+ +- **Asynchronous Processing**: + - The `Distribute()` method launches a goroutine that consumes the input iterator, classifies each sequence, and feeds batches to per-key outputs. + - Outputs are closed only after all sequences have been processed. + +- **Notifications**: + - The `News()` channel allows consumers to be notified of newly created output streams (i.e., when a new class key appears). + +- **Error Handling**: + - `Outputs(key)` returns an error if the requested key has no associated output. + +- **Integration**: + - Leverages `obidefault.BatchSizeMax()` and `BatchMem()` for configurable batch limits. + - Uses `SortBatches()` on the input iterator to ensure ordered processing. + +In summary, `IDistribute` provides a scalable, concurrent pipeline for classifying and batching biosequences based on user-defined classification logic. diff --git a/autodoc/docmd/pkg/obiiter/extract_taxonomy.md b/autodoc/docmd/pkg/obiiter/extract_taxonomy.md new file mode 100644 index 0000000..679fbc0 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/extract_taxonomy.md @@ -0,0 +1,24 @@ +# `ExtractTaxonomy` Function — Semantic Description + +The `ExtractTaxonomy` method is a core utility in the `obiiter` package, designed to aggregate taxonomic information across biological sequences processed by an iterator. + +- **Input**: + - A pointer to `IBioSequence`, representing a sequence iterator over biological data. + - A boolean flag `seqAsTaxa`: if true, each full sequence is treated as a single taxonomic unit; otherwise, individual elements within slices are processed separately. + +- **Process**: + - Iterates through all sequences via `iterator.Next()` and retrieves each current slice using `Get().Slice()`. + - For every slice, it calls the underlying `.ExtractTaxonomy()` method (from `obitax`), progressively building or updating a shared `*obitax.Taxonomy` object. + - Stops and returns immediately upon encountering the first error during taxonomy extraction. 
+ +- **Output**: + - Returns a fully populated `*obitax.Taxonomy` object (or partial result if early failure occurs). + - Returns `nil` error on success; otherwise, returns the first encountered error. + +- **Semantic Role**: + Enables scalable taxonomic profiling of high-throughput sequencing data by delegating per-slice extraction logic to the `obitax` module, while ensuring robust iteration and error handling. + +- **Dependencies**: + Relies on `obitax.Taxonomy` for structured taxonomic representation and assumes slices implement the `.ExtractTaxonomy()` interface. + +This function exemplifies a *map-reduce*-style pattern: mapping taxonomy extraction over slices, and reducing results into a unified taxonomic summary. diff --git a/autodoc/docmd/pkg/obiiter/fragment.md b/autodoc/docmd/pkg/obiiter/fragment.md new file mode 100644 index 0000000..2a7e490 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/fragment.md @@ -0,0 +1,28 @@ +# `IFragments` Functionality Overview + +The `IFragments()` function in the `obiiter` package implements a parallelized sequence fragmentation pipeline for biological sequences. It is designed to split long nucleotide or protein sequences into smaller, overlapping fragments while preserving metadata and enabling concurrent processing. + +## Core Parameters +- `minsize`: Minimum sequence length to skip fragmentation. +- `length`: Desired fragment size (in bases/amino acids). +- `overlap`: Number of overlapping residues between consecutive fragments. +- `size`, `nworkers`: Batch size and number of worker goroutines (currently unused in active logic). + +## Workflow +1. **Batch Sorting**: Input sequences are batched and sorted for efficient processing. +2. **Parallel Fragmentation**: + - Each worker processes a subset of batches independently using goroutines. + - For each sequence longer than `minsize`, it is split into overlapping fragments of length `length` with step size = `length - overlap`. 
+ - The final fragment is extended to cover the remainder (fusion mode), avoiding tiny trailing pieces. +3. **Resource Management**: + - Original sequences are recycled (`s.Recycle()`) to optimize memory usage. + - Fragments are reassembled into batches, sorted by source and order, then rebatched to respect memory/size limits. + +## Key Features +- **Overlap handling**: Ensures contiguous coverage without gaps. +- **Memory efficiency**: Uses recycling and batched output. +- **Scalability**: Leverages Go concurrency via `nworkers`. +- **Error safety**: Panics on subsequence errors (e.g., invalid indices). + +## Use Case +Ideal for preparing long-read sequencing data (e.g., PacBio, Nanopore) or assembled contigs for downstream analysis requiring fixed-length inputs (e.g., k-mer indexing, ML inference). diff --git a/autodoc/docmd/pkg/obiiter/limitmemory.md b/autodoc/docmd/pkg/obiiter/limitmemory.md new file mode 100644 index 0000000..6f95cd2 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/limitmemory.md @@ -0,0 +1,29 @@ +# Memory-Limited Biosequence Iterator + +This Go function extends an `IBioSequence` iterator with memory-aware throttling to prevent excessive heap allocation during data processing. + +## Core Functionality + +- **`LimitMemory(fraction float64)`** + Returns a new iterator that respects an upper bound on heap usage relative to total system memory. + +- **Memory Monitoring** + Uses `runtime.ReadMemStats()` and `github.com/pbnjay/memory.TotalMemory()` to compute the current heap fraction (`Alloc / TotalMemory`) dynamically. + +- **Backpressure Mechanism** + While the memory fraction exceeds `fraction`, the producer goroutine yields control (`runtime.Gosched()`) until sufficient memory becomes available. + +- **Logging** + Warns via `obilog.Warnf` when: + - Memory pressure persists (every ~1000 yields), + - Or wait duration becomes unusually long (>10,000 yielding cycles). 
+ +- **Concurrency Model** + - A producer goroutine consumes from the original iterator and pushes items to `newIter`, pausing as needed. + - A dedicated consumer goroutine calls `WaitAndClose()` to ensure graceful termination and resource cleanup. + +## Semantic Behavior + +- **Non-blocking consumer**: Downstream consumers are not stalled; they read from an internal buffered channel (`newIter`). +- **Adaptive rate control**: The iterator automatically slows down when memory pressure rises, avoiding OOM conditions. +- **Predictable resource use**: Ensures heap usage stays below the specified `fraction` (e.g., 0.5 → ≤ 50% of total RAM). diff --git a/autodoc/docmd/pkg/obiiter/merge.md b/autodoc/docmd/pkg/obiiter/merge.md new file mode 100644 index 0000000..8b7882b --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/merge.md @@ -0,0 +1,19 @@ +# Semantic Description of `IMergeSequenceBatch` and `MergePipe` + +This code defines two related functions in the `obiiter` package for batch-wise merging of biological sequences during iteration. + +- **`IMergeSequenceBatch(na, statsOn, sizes...) IBioSequence → IBioSequence`** + - Consumes an input sequence iterator (`IBioSequence`) and returns a new one. + - Groups incoming sequences into batches (default size: `100`, configurable via variadic argument). + - For each batch: + - Collects up to `batchsize` sequences via the input iterator. + - Applies `.Merge(na, statsOn)` on each sequence group (presumably merging reads based on `na`, e.g., nucleotide alignment or overlap). + - Wraps merged results into a `BioSequenceBatch` with ordering metadata. + - Emits batches asynchronously via goroutines; the output iterator is closed when input finishes. + +- **`MergePipe(na, statsOn, sizes...) Pipeable → func(IBioSequence) IBioSequence`** + - A *pipeline combinator* (higher-order function), enabling functional composition. + - Returns a `Pipeable` — i.e., a transformation function compatible with iterator pipelines. 
+ +**Semantic Purpose**: +Enables efficient, memory-smoothed merging of biological sequence reads (e.g., paired-end merges) in streaming fashion, with optional statistics tracking (`statsOn`) and configurable batching. diff --git a/autodoc/docmd/pkg/obiiter/numbering.md b/autodoc/docmd/pkg/obiiter/numbering.md new file mode 100644 index 0000000..139a942 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/numbering.md @@ -0,0 +1,35 @@ +# `NumberSequences` Function — Semantic Description + +The `NumberSequences` method assigns a unique sequential identifier (`seq_number`) to each biological sequence in an `IBioSequence` iterator, preserving consistency for paired-end reads. + +## Core Functionality + +- **Sequential numbering**: Assigns integers (starting from `start`, defaulting to 0 or user-defined) incrementally across sequences. +- **Thread-safe**: Uses `sync.Mutex` and `atomic.Int64` to safely manage the global counter during concurrent processing. +- **Paired-read support**: When input is paired (`IsPaired()`), both reads in a pair receive the *same* `seq_number`, ensuring alignment between mates. + +## Parallelization Strategy + +- **Default mode**: Uses multiple workers (`ParallelWorkers()`) for performance; batches are processed concurrently. +- **Reordering mode**: If `forceReordering` is true: + - Input iterator is batch-sorted (`SortBatches()`). + - Parallelism disabled (1 worker) to ensure deterministic numbering order. + +## Implementation Details + +- Each goroutine processes its own split of the input iterator. +- A shared `next_first` counter tracks the next available sequence number globally. +- Locking ensures atomic increment and assignment, preventing race conditions. + +## Output + +Returns a new `IBioSequence` iterator: +- Contains the same sequence batches (possibly reordered if sorted). +- Each `BioSequence` object now carries a `"seq_number"` attribute. +- Paired sequences are co-numbered and marked accordingly. 
+ +## Use Cases + +- Preparing data for downstream tools requiring unique sequence IDs. +- Maintaining cross-read identity in paired-end workflows (e.g., assembly, mapping). +- Reproducible numbering across pipeline stages or restarts. diff --git a/autodoc/docmd/pkg/obiiter/paired.md b/autodoc/docmd/pkg/obiiter/paired.md new file mode 100644 index 0000000..e4a285b --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/paired.md @@ -0,0 +1,17 @@ +# Paired-End Sequence Handling in `obiiter` + +This Go package provides semantic functionality for managing **paired-end biological sequences** within batched iterators. + +- `BioSequenceBatch` methods: + - **`IsPaired()`**: Checks whether the batch contains paired reads. + - **`PairedWith()`**: Returns a new batch containing only the mate (partner) of each read in the current batch. + - **`PairTo(*BioSequenceBatch)`**: Synchronizes and pairs reads between two batches *of identical order*; fails if orders differ. + - **`UnPair()`**: Removes pairing metadata, treating reads as unpaired. + +- `IBioSequence` (iterator) methods: + - **`MarkAsPaired()`**: Marks the iterator as producing paired-end data. + - **`PairTo(IBioSequence)`**: Combines two iterators into a new paired-end iterator by aligning corresponding batches and calling `PairTo` on each pair. + - **`PairedWith()`**: Generates a new iterator yielding only the mate reads (i.e., second ends) from an existing paired-end stream. + - **`IsPaired()`**: Returns whether the iterator was explicitly marked as paired. + +All operations preserve batched processing and concurrency via goroutines, ensuring efficient handling of large NGS datasets while maintaining semantic correctness for paired-end workflows. 
diff --git a/autodoc/docmd/pkg/obiiter/pipe.md b/autodoc/docmd/pkg/obiiter/pipe.md new file mode 100644 index 0000000..862aa65 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/pipe.md @@ -0,0 +1,17 @@ +# Semantic Description of `obiiter` Package Features + +This Go package provides functional-style utilities for processing biological sequence data (e.g., FASTQ/FASTA), modeled via the `IBioSequence` interface. + +- **`Pipeable`**: A function type representing a unary transformation on an `IBioSequence`. +- **`Pipeline(start, parts...)`**: Composes a sequence of `Pipeable` operations into a single executable pipeline. It applies transformations sequentially: input → start → part₁ → … → output. + +- **`(IBioSequence).Pipe(start, parts...)`**: A convenience method enabling fluent chaining of transformations directly on a sequence object. + +- **`Teeable`**: A function type for operations that split input into two independent output streams (e.g., filtering + logging). + +- **`(IBioSequence).CopyTee()`**: A high-level tee operation that duplicates the input stream into two identical, concurrently readable `IBioSequence` instances. + - Uses goroutines to ensure non-blocking parallel consumption. + - Ensures proper lifecycle management: closing the second stream when the first is closed. + - Preserves paired-end status (`MarkAsPaired`) if applicable. + +Together, these features support modular, composable, and concurrent biosequence processing pipelines—ideal for scalable NGS data workflows. diff --git a/autodoc/docmd/pkg/obiiter/sequence_workers.md b/autodoc/docmd/pkg/obiiter/sequence_workers.md new file mode 100644 index 0000000..34619ac --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/sequence_workers.md @@ -0,0 +1,28 @@ +# `MakeSetAttributeWorker` Functionality Overview + +The function `MakeSetAttributeWorker(rank string) obiiter.SeqWorker` constructs a reusable sequence-processing worker for taxonomic annotation. 
+ +- **Input validation**: It first verifies that the provided `rank` is part of a predefined taxonomic hierarchy (`taxonomy.RankList()`). If invalid, it terminates execution with an informative error. + +- **Worker construction**: It returns a closure (`obiiter.SeqWorker`) — essentially a function that transforms biological sequences. + +- **Core behavior**: For each input `*obiseq.BioSequence`, it calls `taxonomy.SetTaxonAtRank(sequence, rank)`. This likely assigns or updates the taxonomic label (e.g., species, genus) at the specified rank in the sequence’s metadata. + +- **Purpose**: Enables modular, pipeline-friendly taxonomic annotation — e.g., in bioinformatics workflows where sequences must be annotated hierarchically (e.g., from phylum down to species). + +- **Design pattern**: Follows the *functional factory* and *worker interface* patterns, promoting composability in sequence processing pipelines. + +- **Side effects**: Modifies the input `BioSequence` *in-place* (via mutation of its taxonomic metadata), then returns it. + +- **Use case example**: + ```go + worker := MakeSetAttributeWorker("species") + seq = worker(seq) // annotates `seq` with species-level taxon + ``` + +- **Assumptions**: + - `taxonomy.SetTaxonAtRank` exists and handles rank-specific taxon assignment. + - Taxonomic ranks are ordered, finite, and validated (e.g., `["domain", "phylum", ..., "species"]`). + - Sequences carry mutable taxonomic metadata. + +- **Error handling**: Fails fast on invalid rank input, preventing silent misannotation. diff --git a/autodoc/docmd/pkg/obiiter/speed.md b/autodoc/docmd/pkg/obiiter/speed.md new file mode 100644 index 0000000..a3fb79d --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/speed.md @@ -0,0 +1,31 @@ +# `Speed` Functionality Description + +The provided Go code defines a method and helper function to add **real-time progress tracking** to biosequence iterators in the OBITools4 framework. 
+ +## Core Features + +- **Non-intrusive progress bar**: + The `Speed()` method wraps an existing iterator and displays a visual progress indicator on stderr, using the [`progressbar`](https://github.com/schollz/progressbar) library. + +- **Conditional rendering**: + The progress bar is only shown when: + - `--no-progressbar` flag is *not* set (via `obidefault.ProgressBar()`), + - stderr is connected to a terminal (`os.ModeCharDevice`), + - stdout is *not* piped (to avoid interfering with file output). + +- **Batch-aware counting**: + Progress is updated per batch (`batch.Len()`), not item-by-item, for efficiency and smoother UI updates (throttled to ≥100ms). + +- **Paired-end support**: + If the input iterator is paired (`IsPaired()`), this property is preserved in the returned iterator. + +- **Pipeable wrapper**: + `SpeedPipe()` enables integration into functional pipelines (e.g., `.Map(...).Filter(...)`) by returning a `Pipeable` function. + +## Implementation Highlights + +- Uses goroutines to decouple iteration and progress updates. +- Automatically closes the output iterator when input ends (`WaitAndClose()`). +- Prints a final newline to stderr upon completion. + +This utility enhances user experience during long-running sequence processing (e.g., FASTQ parsing, alignment), without affecting correctness or performance in non-interactive contexts. diff --git a/autodoc/docmd/pkg/obiiter/workers.md b/autodoc/docmd/pkg/obiiter/workers.md new file mode 100644 index 0000000..c0f20f8 --- /dev/null +++ b/autodoc/docmd/pkg/obiiter/workers.md @@ -0,0 +1,20 @@ +# Semantic Description of `obiiter` Package Functionalities + +This Go package (`obiiter`) provides utilities for applying functional transformations to biological sequence iterators, supporting parallel execution and modular piping. + +- **`MakeIWorker(worker, breakOnError bool, sizes ...int)`**: + Applies a `SeqWorker` (sequence-to-sequence transformation) to each sequence in the iterator. 
Supports configurable parallelism (`nworkers`) and optional channel buffering via `sizes`. Uses internal conversion to slice-based workers. + +- **`MakeIConditionalWorker(predicate, worker, breakOnError bool, sizes ...int)`**: + Applies a `SeqWorker` only to sequences satisfying a given boolean `predicate`. Enables conditional, parallelized processing while preserving iterator semantics. + +- **`MakeISliceWorker(worker, breakOnError bool, sizes ...int)`**: + Core method applying a `SeqSliceWorker` (batch-level transformation) across slices of sequences. Implements multi-goroutine parallelism using `nworkers`. Handles errors optionally via fatal logging (`breakOnError`). Preserves paired-end metadata. + +- **`WorkerPipe(worker, breakOnError bool, sizes ...int)`**: + Returns a `Pipeable` closure wrapping `MakeIWorker`, enabling composition in pipeline chains (e.g., for CLI or DSL-style workflows). + +- **`SliceWorkerPipe(worker, breakOnError bool, sizes ...int)`**: + Similar to `WorkerPipe`, but for slice-level workers (`SeqSliceWorker`). Facilitates modular, reusable pipeline stages. + +All methods support optional size arguments to override default parallelism (from `obidefault`). Internally, they rely on Go concurrency primitives (`go`, channels) and structured batch processing via `IBioSequence` interface. diff --git a/autodoc/docmd/pkg/obiitercsv/csv.md b/autodoc/docmd/pkg/obiitercsv/csv.md new file mode 100644 index 0000000..f7f80ef --- /dev/null +++ b/autodoc/docmd/pkg/obiitercsv/csv.md @@ -0,0 +1,33 @@ +# `obiitercsv`: CSV Record Iterator for Streaming and Batch Processing + +This Go package provides a thread-safe, channel-based iterator (`ICSVRecord`) for streaming and processing CSV records in batches. It supports ordered batch handling, concurrent access via mutexes, and dynamic header management. + +## Core Types + +- **`CSVHeader`**: A slice of strings representing column names. 
+- **`CSVRecord`**: A map from field name to value (`map[string]interface{}`). +- **`CSVRecordBatch`**: A batch of records with metadata: `source`, `order`, and the actual data slice. + +## Key Features + +- **Streaming via Channels**: Records are consumed as `CSVRecordBatch` items through a channel, enabling asynchronous producers/consumers. +- **Ordered Processing**: Batches include an `order` field, used by `SortBatches()` to reconstruct sequential order even when received out-of-order. +- **Thread Safety**: Uses `sync.RWMutex`, atomic operations (`batch_size`), and `abool.AtomicBool` for flags like `finished`. +- **Iterator Protocol**: Implements standard methods: + - `Next()` to advance, + - `Get()` to retrieve current batch, + - `PushBack()` for re-queuing the last record. +- **Batch Management**: + - `SetHeader()` / `AppendField()`: dynamic header updates. + - `Split()`: creates a new iterator sharing the same channel but with independent locking. +- **Lifecycle Control**: + - `Add()` / `Done()`: track active goroutines (via `sync.WaitGroup`). + - `WaitAndClose()` ensures all data is flushed before closing the channel. + +## Utility Methods + +- **`NotEmpty()`, `IsNil()`**: Check batch validity. +- **`Consume()`**: Drains the iterator (e.g., for side-effect processing). +- **`SortBatches()`**: Reorders batches by `order`, buffering out-of-sequence ones. + +Designed for bioinformatics pipelines (e.g., OBITools4), it enables scalable, memory-efficient CSV processing with strict ordering guarantees. diff --git a/autodoc/docmd/pkg/obikmer/counting.md b/autodoc/docmd/pkg/obikmer/counting.md new file mode 100644 index 0000000..9b9b6e6 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/counting.md @@ -0,0 +1,36 @@ +# Semantic Description of `obikmer` Package + +This Go package provides utilities for **k-mer (specifically 4-mer) counting and comparison** of biological sequences. + +## Core Functionalities + +1. 
**`Count4Mer(seq, buffer, counts)`** + Counts occurrences of all 256 possible 4-mer (4-nucleotide) subsequences in a `BioSequence`. + - Encodes each 4-mer into an integer (0–255) using `Encode4mer`. + - Populates a fixed-size `[256]uint16` table (`Table4mer`) with counts. + - Reuses or allocates the `counts` buffer as needed. + +2. **`Common4Mer(count1, count2)`** + Computes the *intersection* of two 4-mer frequency profiles: sum over all k-mers of `min(count1[k], count2[k])`. + Used to measure shared content between sequences. + +3. **`Sum4Mer(count)`** + Returns the total number of 4-mers in a profile (i.e., sum over all entries). + +## Distance & Similarity Bounds + +4. **`LCS4MerBounds(count1, count2)`** + Estimates bounds for the *Longest Common Subsequence* (LCS) length between two sequences based on 4-mer profiles: + - **Lower bound**: `common_kmers + (3 if common > 0 else 0)` + - **Upper bound**: `min(total1, total2) + 3 − ceil((min_total – common)/4)` + Leverages the fact that overlapping k-mers constrain possible alignments. + +5. **`Error4MerBounds(count1, count2)`** + Estimates bounds for *alignment errors* (e.g., mismatches + indels): + - **Upper bound**: `max_total − common_kmers + 2 * floor((common_kmers + 5)/8)` + - **Lower bound**: `ceil(upper_bound / 4)` + Provides fast, approximate error estimates without full alignment. + +## Use Case + +Designed for **high-performance comparison of NGS reads** (e.g., in metabarcoding), where exact alignment is too costly, and k-mer-based heuristics enable scalable similarity estimation. 
diff --git a/autodoc/docmd/pkg/obikmer/debruijn.md b/autodoc/docmd/pkg/obikmer/debruijn.md new file mode 100644 index 0000000..1df62f5 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/debruijn.md @@ -0,0 +1,44 @@ +# Semantic Description of the `obikmer` Package + +This Go package implements a **De Bruijn graph** for efficient k-mer manipulation and sequence assembly, primarily used in bioinformatics (e.g., metagenomic read error correction or consensus building). + +### Core Functionalities + +- **K-mer Encoding**: K-mers are encoded as `uint64` using 2 bits per nucleotide (A=0, C=1, G=2, T=3), supporting IUPAC ambiguity codes via the `iupac` map. +- **Reverse Complement Handling**: The `revcompnuc` table enables nucleotide-wise reverse complementation. +- **Graph Construction**: The `DeBruijnGraph` struct maintains a map from k-mer hashes to integer weights (e.g., observed counts), with helper masks for bit manipulation (`kmermask`, `prevc/g/t`). + +### Graph Operations + +- **Node Queries**: + - `Previouses()` / `Nexts()`: Return predecessor/successor k-mers in the graph. + - `MaxNext()` / `MaxHead()`: Find neighbors or heads (sources) with maximum weight. +- **Path Exploration**: + - `MaxPath()`: Greedily traces the highest-weight path from a head. + - `LongestPath()`: Explores all heads to find the path with maximum cumulative weight (optionally bounded in length). + - `HaviestPath()`: Uses Dijkstra-like priority queue to find the *heaviest* (sum-weight) path, with cycle detection via DFS (`HasCycle()`). + +### Consensus & Filtering + +- **Consensus Generation**: + - `BestConsensus()` returns a sequence from the greedy max-weight path. + - `LongestConsensus(id, min_cov)` trims low-coverage ends using a coverage threshold (mode-based). +- **Weight Statistics**: + - `MaxWeight()`, `WeightMean()`, `WeightMode()` provide distribution summaries. + - `FilterMinWeight(min)` removes low-count nodes. 
+- **Decoding**: + - `DecodeNode()` converts a k-mer index to its DNA string. + - `DecodePath()` reconstructs the full consensus from a path. + +### I/O & Diagnostics + +- **GML Export**: `WriteGml()` outputs a directed graph in Graph Modelling Language (for visualization), with edge thickness and labels reflecting weights. +- **Hamming Distance**: `HammingDistance()` computes edit distance between two encoded k-mers using bit operations. +- **Sequence Insertion**: `Push()` adds a biosequence (with count weight) to the graph, expanding all IUPAC variants recursively. + +### Dependencies & Design + +- Leverages `obiseq` for sequence representation and `logrus`/`slices`/`heap` from Go’s stdlib. +- Designed for scalability and speed, using bit-level operations to minimize memory footprint. + +Overall: a robust k-mer graph engine for *de novo* assembly, error correction, and consensus recovery in high-throughput sequencing data. diff --git a/autodoc/docmd/pkg/obikmer/encodefourmer.md b/autodoc/docmd/pkg/obikmer/encodefourmer.md new file mode 100644 index 0000000..5b30fee --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/encodefourmer.md @@ -0,0 +1,35 @@ +# Semantic Description of `obikmer` Package + +The `obikmer` package provides efficient k-mer encoding and comparison utilities for biological sequences, optimized for DNA analysis. + +## Core Functionalities + +1. **Nucleotide Encoding** + - `EncodeNucleotide(b byte)`: Maps DNA bases (A, C, G, T/U) to 2-bit values: + `0→A`, `1→C`, `2→G`, `3→T/U`. + Ambiguous or non-standard characters (e.g., N, R, Y) default to `A` (`0`). + Uses a lookup table for O(1) performance. + +2. **4-mer Encoding** + - `Encode4mer(seq, buffer)`: Converts a biological sequence into overlapping 4-mers. + Each k-mer is encoded as an unsigned byte (0–255), where each nucleotide contributes 2 bits. + Supports optional buffer reuse for memory efficiency. + +3. 
**4-mer Indexing** + - `Index4mer(seq, index, buffer)`: Builds an inverted index mapping each 4-mer code (0–255) to all its occurrence positions in the sequence. + Returns `[][]int`, where rows correspond to k-mer codes and columns list positions. + +4. **Fast Sequence Comparison** + - `FastShiftFourMer(...)`: Compares two sequences using a FASTA-like shift-scoring algorithm. + - Uses precomputed 4-mer index of a reference sequence and encodes the query. + - Counts co-occurring 4-mers across all possible shifts (`refpos − queryPos`). + - Computes raw and relative scores (normalized by alignment length). + - Returns optimal shift, count of matching 4-mers, and maximum score (raw or relative). + +## Design Highlights + +- **Memory-aware**: Supports buffer reuse to minimize allocations. +- **Robustness**: Non-canonical bases handled gracefully (defaulting to A). +- **Performance-oriented**: O(n) encoding and indexing; efficient hash-based shift counting. + +Intended for rapid alignment-free sequence comparison in metabarcoding or metagenomic workflows. diff --git a/autodoc/docmd/pkg/obikmer/encodekmer.md b/autodoc/docmd/pkg/obikmer/encodekmer.md new file mode 100644 index 0000000..6a58b21 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/encodekmer.md @@ -0,0 +1,39 @@ +# Semantic Description of `obikmer` Package + +The `obikmer` package provides high-performance, zero-allocation utilities for **k-mer manipulation** in DNA sequences (A/C/G/T/U), targeting bioinformatics applications like genome indexing, assembly, and error correction. + +## Core Encoding & Decoding + +- **`EncodeKmer`, `DecodeKmer`**: Convert between DNA sequences and compact 62-bit uint64 representations (2 bits/base), preserving top 2 bits for optional error markers. +- **`EncodeCanonicalKmer`, `CanonicalKmer`**: Encode or normalize k-mers to their *biological canonical form* — the lexicographically smaller of a k-mer and its reverse complement. 
+ +## Iterators (Memory-Efficient Streaming) + +- **`IterKmers`, `IterCanonicalKmers`**: Stream all overlapping k-mers from a sequence without allocating intermediate slices — ideal for large-scale processing (e.g., inserting into Roaring Bitmaps). +- **`IterCanonicalKmersWithErrors`**: Same as above, but detects ambiguous bases (N/R/Y/W/S/K/M/B/D/H/V) and encodes their count in the top 2 bits (error code: 0–3). Only valid for **odd k ≤ 31**. + +## Error Handling & Markers + +- `SetKmerError`, `GetKmerError`, and `ClearKmerError` manipulate the top 2 bits of a uint64 to store error metadata (e.g., ambiguous base count), enabling downstream filtering or correction. + +## Reverse Complement & Circular Normalization + +- **`ReverseComplement`, `CanonicalKmer`**: Compute biological reverse complement and canonical form. +- **`NormalizeCircular`, `EncodeCircularCanonicalKmer`**: Compute *circular canonical form* — the lexicographically smallest rotation (used for low-complexity masking). +- Distinction: `CanonicalKmer` uses **reverse complement**, while `NormalizeCircular` uses **rotation**. + +## Counting & Math Utilities + +- **`CanonicalCircularKmerCount`, `necklaceCount`, etc.**: Compute exact counts of unique circular k-mer equivalence classes using **Moreau’s necklace formula**, with Euler's totient function and divisor enumeration. + +## Performance & Safety + +- All functions avoid heap allocations where possible (reusing buffers). +- Panics on invalid `k` or length mismatches for correctness. +- Supports case-insensitive input (A/a, T/t…), and ambiguous bases via `__single_base_code_err__`. 
+ +## Use Cases + +- K-mer counting in assemblers (e.g., with Bloom filters or bitmaps) +- Error-aware k-mer filtering in sequencing pipelines +- Low-complexity region detection via circular entropy normalization diff --git a/autodoc/docmd/pkg/obikmer/encodekmer_test.md b/autodoc/docmd/pkg/obikmer/encodekmer_test.md new file mode 100644 index 0000000..14f008e --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/encodekmer_test.md @@ -0,0 +1,36 @@ +# Obikmer: Efficient K-mer Encoding and Manipulation in Go + +This package provides high-performance utilities for DNA sequence analysis using *k*-mers—contiguous substrings of length `k`. It supports encoding, canonicalization (forward/reverse-complement normalization), minimizer-based super-*k*-mer extraction, and error tagging—all optimized for 64-bit integer arithmetic. + +## Core Functionalities + +### K-mer Encoding (`EncodeKmers`, `IterKmers`) +Encodes DNA sequences (A/C/G/T/U, case-insensitive) into `uint64` using 2 bits per nucleotide (A=00, C=01, G=10, T/U=11). Supports sliding-window extraction and streaming via an iterator. Handles sequences up to 31-mers (62 bits), with validation for invalid `k` values. + +### Reverse Complement (`ReverseComplement`) +Computes the reverse complement of a *k*-mer in constant time using bit manipulation. Preserves error metadata (see below) and satisfies involution: `RC(RC(x)) = x`. + +### Canonical K-mers (`CanonicalKmer`, `EncodeCanonicalKmers`) +Returns the lexicographically smaller of a *k*-mer and its reverse complement—enabling strand-agnostic analysis. Supports both single-kmer normalization (`CanonicalKmer`) and full-sequence canonical encoding. + +### Super *k*-mers Extraction (`ExtractSuperKmers`) +Groups overlapping *k*-mers sharing the same minimizer (minimal *m*-mer in sliding window) into contiguous regions ("super *k*-mers"). Output includes start/end positions and minimizer values, all canonicalized. + +### Error Marking (`SetKmerError`, `GetKmerError`, etc.) 
Uses the top 2 bits of a `uint64` to tag error states (e.g., sequencing errors), leaving 62 bits for sequence data. Error operations preserve the underlying *k*-mer and work seamlessly with canonicalization/RC. + +## Key Features + +- **Memory Efficiency**: Reusable buffers via optional `*[]uint64` or `*[]SuperKmer` parameters. +- **Edge Case Handling**: Gracefully handles empty sequences, `k > len(seq)`, invalid parameters (`m ≥ k`), and max-length constraints. +- **Performance**: Optimized for speed—benchmarks included for all major functions (e.g., `BenchmarkEncodeKmers`, `BenchmarkExtractSuperKmers`). +- **Comprehensive Testing**: Covers basic cases, boundary conditions (e.g., 31-mers), symmetry properties (canonical/RC invariance), and error resilience. + +## Use Cases + +- Genome assembly & DBG construction +- Minimizer-based sketching (e.g., *Mash*, *Sourmash*) +- Error-aware k-mer counting & filtering +- Strand-unbiased sequence comparison + +All functions operate on `[]byte` DNA sequences and return canonicalized, efficient representations suitable for hashing or indexing. diff --git a/autodoc/docmd/pkg/obikmer/entropy.md b/autodoc/docmd/pkg/obikmer/entropy.md new file mode 100644 index 0000000..79ce01b --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/entropy.md @@ -0,0 +1,31 @@ +# Semantic Description of `obikmer` Entropy Functions + +The `obikmer` package provides high-performance tools to compute **Shannon entropy** for DNA *k*-mers, with a focus on detecting low-complexity sequences via sub-word repetition analysis. + +## Core Functionality + +- **`KmerEntropy(kmer, k, levelMax)`**: + Computes the *minimum normalized Shannon entropy* across all sub-word sizes from `1` to `levelMax`. + - Decodes the encoded *k*-mer (2 bits/base) into a DNA string. + - For each word size `ws`, extracts all overlapping substrings, normalizes them to their **circular canonical form**, and counts frequencies. 
+ - Normalized entropy = `(log(N) − Σ(nᵢ log nᵢ)/N) / emax`, where `emax` is the theoretical max entropy given sequence length and alphabet constraints. + - Returns min entropy across `ws ∈ [1, levelMax]`. Values near **0** indicate repeats (e.g., `AAAAA…`); values near **1** suggest high complexity. + +- **`KmerEntropyFilter`**: + A reusable, precomputed filter for batch processing millions of *k*-mers efficiently: + - Pre-builds normalization tables (for circular canonical forms), entropy lookup values (`emax`, `logNwords`), and frequency tables. + - Avoids repeated allocations — critical for performance in pipelines (e.g., read filtering). + - **Not goroutine-safe** — each thread must instantiate its own filter. + +- **`NewKmerEntropyFilter(k, levelMax, threshold)`**: + Initializes a filter with precomputed tables and sets the entropy rejection `threshold`. + +- **`Accept(kmer)` / `Entropy(kmer)`**: + - `Accept()` returns `true` if entropy > threshold (i.e., *k*-mer is complex enough to pass). + - `Entropy()` computes entropy using precomputed tables — ~10× faster than standalone calls. + +## Design Highlights + +- **Circular canonical normalization** ensures symmetry (e.g., `AT` ≡ `TA`). +- **Sub-word-level entropy** captures local repetitiveness better than global *k*-mer uniqueness. +- Optimized for **speed and memory reuse**, suitable for large-scale genomic data filtering. diff --git a/autodoc/docmd/pkg/obikmer/kdi_merge.md b/autodoc/docmd/pkg/obikmer/kdi_merge.md new file mode 100644 index 0000000..092e18d --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kdi_merge.md @@ -0,0 +1,37 @@ +# K-Way Merge for Sorted k-mer Streams + +This Go package implements a **k-way merge** over multiple sorted streams of *k*-mer values (`uint64`). It leverages a **min-heap** to efficiently produce the globally sorted sequence while aggregating duplicate counts across input streams. 
+ +## Core Components + +- **`mergeItem`**: Stores a value and its source reader index for heap operations. +- **`mergeHeap`** & `heap.Interface`: Implements a min-heap for efficient retrieval of smallest values. +- **`KWayMerge`**: Main struct managing the heap and input readers. + +## Key Functionality + +- **Initialization (`NewKWayMerge`)**: + - Takes a slice of `*KdiReader`, each expected to yield sorted values. + - Preloads the heap with one value from each reader. + +- **Streaming Output (`Next`)**: + - Returns the next smallest *k*-mer, its frequency across readers (i.e., how many input streams contained it), and a success flag. + - Handles duplicates: pops *all* items equal to the current minimum before advancing readers. + +- **Cleanup (`Close`)**: + - Closes all underlying `KdiReader`s and returns the first encountered error. + +## Use Case + +Ideal for merging sorted *k*-mer databases (e.g., from multiple files or processes), enabling: +- Efficient deduplication with multiplicity tracking. +- Scalable union/intersection operations on large *k*-mer sets. + +## Complexity + +| Operation | Time | +|-----------|------------| +| `Next()` | *O(log k)* (heap ops per unique value) | +| Init | *O(k)* | + +Where `k` = number of input readers. diff --git a/autodoc/docmd/pkg/obikmer/kdi_merge_test.md b/autodoc/docmd/pkg/obikmer/kdi_merge_test.md new file mode 100644 index 0000000..556a759 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kdi_merge_test.md @@ -0,0 +1,27 @@ +# K-Way Merge Functionality in `obikmer` + +This Go package provides utilities for merging sorted k-mer streams stored in `.kdi` files. Its core component is the `KWayMerge`, which performs a k-way merge of multiple sorted input streams, aggregating duplicate k-mers by counting their occurrences. + +## Key Features + +- **Sorted K-Mer Input**: Reads k-mers from `.kdi` files via `KdiReader`, assuming each file contains *sorted* 64-bit unsigned integers (`uint64`). 
+- **K-Way Merge**: Merges multiple sorted streams into a single globally sorted stream using an efficient priority queue (min-heap) internally. +- **Count Aggregation**: When identical k-mers appear across multiple streams, the merge counts how many times each unique k-mer occurs. +- **Memory-Efficient Streaming**: Processes data incrementally, avoiding full loading of all streams into memory. +- **Robust Test Coverage**: Includes unit tests for: + - Basic merging with overlapping and non-overlapping values. + - Single-stream input (degenerate case). + - Empty streams handling. + - All identical k-mers across inputs. + +## API Highlights + +- `NewKdiReader(path)` — opens a `.kdi` file for reading. +- `writeKdi(...)` (test helper) — writes sorted k-mers to a `.kdi` file. +- `NewKWayMerge([]*KdiReader)` — constructs the merger from multiple readers. +- `.Next()` → `(kmer uint64, count int, ok bool)` — yields next merged k-mer and its frequency; `ok=false` signals end-of-stream. +- `.Close()` — cleans up resources. + +## Use Case + +Ideal for aggregating k-mer counts across multiple sequencing samples (e.g., in bioinformatics), where each sample’s k-mers are pre-sorted and persisted, enabling scalable distributed counting without full in-memory deduplication. diff --git a/autodoc/docmd/pkg/obikmer/kdi_reader.md b/autodoc/docmd/pkg/obikmer/kdi_reader.md new file mode 100644 index 0000000..b3e83f3 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kdi_reader.md @@ -0,0 +1,27 @@ +# KDI Reader: Streaming Delta-Varint Decoding for k-mers + +The `obikmer` package provides a high-performance, streaming reader for `.kdi` files—binary containers storing *sorted* k-mers (typically DNA substrings encoded as 64-bit integers). It supports both sequential and indexed access. + +## Core Features + +- **Streaming decoding**: K-mers are read incrementally using delta-varint compression to minimize I/O and memory footprint. 
+- **Delta encoding**: After the first absolute `uint64`, subsequent values are stored as *deltas* (difference from previous), encoded via custom `DecodeVarint`. +- **Magic & format validation**: A 4-byte magic header ensures file integrity; Little Endian `uint64` stores total count. +- **Sparse indexing**: When paired with a `.kdx` index, `SeekTo(target)` enables fast forward-only jumps to positions ≥ target k-mer. +- **Graceful fallback**: If `.kdx` is missing or invalid, the reader automatically degrades to sequential mode. + +## Key API + +- `NewKdiReader(path)` → opens `.kdi` for streaming (no index). +- `NewKdiIndexedReader(path)` → opens with optional `.kdx` for random access. +- `Next()` → returns `(nextKmer, true)` or `(0, false)` when exhausted. +- `SeekTo(target uint64) error` → jumps to first k-mer ≥ target using index (no backward seek). +- `Count()` / `Remaining()` → total and unread k-mers. +- `Close()` → releases file handle. + +## Design Highlights + +- Uses 64 KB buffer for efficient I/O. +- Index entries record `(kmer, byteOffset)` at fixed strides (e.g., every 1024 k-mers). +- `SeekTo` is idempotent and safe: no-op if target ≤ current position or index unavailable. +- Designed for large-scale genomic k-mer catalogs (e.g., from minimizers or de Bruijn graphs). diff --git a/autodoc/docmd/pkg/obikmer/kdi_test.md b/autodoc/docmd/pkg/obikmer/kdi_test.md new file mode 100644 index 0000000..1253302 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kdi_test.md @@ -0,0 +1,34 @@ +# KDI File Format and API + +The `obikmer` package implements a compact, sorted k-mer storage format (`.kdi`) with delta compression for efficient disk persistence and retrieval. + +## Core Features + +- **Sorted k-mer serialization**: K-mers (as `uint64`) are written in ascending order. +- **Delta encoding**: Consecutive differences (deltas) between k-mers are stored using variable-length integers (`varint`), drastically reducing size for dense sequences. 
+- **Round-trip integrity**: Full write/read cycles preserve exact k-mer values and counts. + +## File Structure + +A `.kdi` file contains: +1. **Magic header** (4 bytes): Identifies the format. +2. **Count field** (8 bytes, `uint64`): Number of stored k-mers. +3. **First value** (8 bytes, `uint64`): Initial k-mer. +4. **Delta-encoded tail**: `(n−1)` deltas, each encoded as a `varint`. + +## API + +- **`NewKdiWriter(path string)`**: Creates a writer; `Write(v uint64)` appends k-mers. +- **`Writer.Count()`**: Returns the number of written items before closing. +- **`NewKdiReader(path string)`**: Opens a reader; `Next() (uint64, bool)` yields k-mers in order. +- **`Reader.Count()`**: Returns total stored count. + +## Tests Validate + +1. Basic round-trip with diverse values (including large `uint64`s). +2. Empty and single-k-mer files. +3. Exact file size for minimal cases (e.g., 20 bytes for one k-mer). +4. Delta compression efficiency on dense sequences (e.g., 10k even numbers → ~9,999 extra bytes). +5. Real-world usage: extracting canonical k-mers from DNA sequences, sorting/deduplicating, and persisting them. + +The format is optimized for memory-mapped access or streaming traversal in bioinformatics pipelines. diff --git a/autodoc/docmd/pkg/obikmer/kdi_writer.md b/autodoc/docmd/pkg/obikmer/kdi_writer.md new file mode 100644 index 0000000..f4eedd7 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kdi_writer.md @@ -0,0 +1,38 @@ +# KDI File Format and Writer + +The `obikmer` package implements a compact, sorted sequence storage format for 64-bit k-mers using delta encoding and sparse indexing. + +## Core Format (`.kdi`) + +- **Magic header**: `KDI\x01` (`4 bytes`) identifies the file type. +- **Count field**: `uint64 LE`, total number of k-mers (patched at close). +- **First value**: `uint64 LE`, the initial k-mer stored as an absolute integer. 
+- **Deltas**: Subsequent values encoded via *delta-varint* (difference from previous k-mer), enabling high compression for sorted sequences. + +## Writer (`KdiWriter`) + +- **Strict ordering**: K-mers must be written in *strictly increasing order*. +- Efficient buffering via `bufio.Writer` (64 KB buffer). +- Internally tracks: + - Current k-mer count, + - Previous value (for delta computation), + - Bytes written in data section. +- **Sparse indexing**: Every `defaultKdxStride` k-mers, an entry is recorded in memory for random access. + +## Companion Index (`.kdx`) + +- Written automatically on `Close()` if indexing entries exist. +- Stores `(kmer, file_offset)` pairs for fast seek-to-position lookups (e.g., binary search on k-mer range). +- Enables efficient random access without full file scan. + +## Usage Pattern + +```go +w, _ := obikmer.NewKdiWriter("data.kdi") +for _, kmer := range sortedKMers { + w.Write(kmer) +} +w.Close() // finalizes header, writes .kdx index +``` + +The format is optimized for memory-efficient storage and fast retrieval of sorted uint64 k-mers in genomic or sequence analysis pipelines. diff --git a/autodoc/docmd/pkg/obikmer/kdx.md b/autodoc/docmd/pkg/obikmer/kdx.md new file mode 100644 index 0000000..df8f4f1 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kdx.md @@ -0,0 +1,29 @@ +# KDX Index Format and Functionality + +The `obikmer` package provides a sparse indexing mechanism for `.kdi` files (likely storing sorted k-mers with delta encoding). The **`.kdx` file** serves as a fast lookup table to accelerate k-mer searches. + +## Core Concepts + +- **Magic bytes**: `KDX\x01` validates file integrity. +- **Stride-based sparsity**: One index entry every *N* k-mers (default: 4096), balancing memory vs. search speed. +- **Entry structure**: Each entry stores: + - `kmer`: the k-mer value at that index position. + - `offset`: absolute byte offset in the corresponding `.kdi` file. 
+ +## Key Operations + +- **Loading**: `LoadKdxIndex()` reads and validates a `.kdx` file; returns `(nil, nil)` if missing (graceful degradation). +- **Searching**: `FindOffset(target uint64)` performs binary search over index entries to find the *best jump point*: + - Returns `offset`, `skipCount` (k-mer count already passed), and a boolean success flag. + - Enables efficient seeking: after `offset`, only up to *stride* k-mers need linear scanning. +- **Writing**: `WriteKdxIndex()` serializes an in-memory index to disk (for building indexes). +- **Helper**: `KdxPathForKdi()` derives the `.kdx` path from a given `.kdi` file. + +## Performance + +- Search complexity: **O(log M)** for the binary search (where *M* = #index entries), plus ≤ stride linear steps. +- Memory footprint: Linear in index size (16 bytes per entry), highly scalable for large k-mer sets. + +## Design Philosophy + +Minimalist, binary-safe format with explicit endianness (little-endian), no external dependencies beyond `encoding/binary`, and robust error handling. diff --git a/autodoc/docmd/pkg/obikmer/kmer_match.md b/autodoc/docmd/pkg/obikmer/kmer_match.md new file mode 100644 index 0000000..9f41001 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmer_match.md @@ -0,0 +1,14 @@ +# Semantic Description of `obikmer` Package + +The `obikmer` package implements efficient k-mer matching between query sequences and an indexed reference using **canonical k-mers** partitioned by minimizer-based hashing. + +- `QueryEntry` represents a single canonical k‑mer with its origin: sequence index and 1-based position. +- `PreparedQueries` groups queries into sorted buckets per partition, enabling batched and parallelized matching. +- `PrepareQueries` scans input sequences using *super-kmers* (with minimizer size `m`) to compute minimizers, assigns each k‑mer to a partition via modulo hashing, and sorts buckets by k‑mer value. 
+- `MergeQueries` combines two sets of prepared queries across batches using a merge-sort strategy, correctly offsetting sequence indices to preserve global ordering. +- `MatchBatch` performs parallel matching per partition: each goroutine runs a **merge-scan** between sorted queries and the corresponding KDI (K-mer Disk Index) stream. + - Efficient seeking is used only when beneficial, avoiding costly syscalls for small skips. + - Matches are recorded with thread-safe per-sequence mutexes; final positions within each sequence are sorted post-match. +- `matchPartition` implements the core merge-scan: it opens a KDI reader, seeks to relevant regions of the index, and walks both query list and k‑mer stream in lockstep. + +The design supports **large-scale batch processing**, incremental query accumulation, and high-performance parallel lookup—ideal for metagenomic or biodiversity sequencing workflows. diff --git a/autodoc/docmd/pkg/obikmer/kmer_set_builder.md b/autodoc/docmd/pkg/obikmer/kmer_set_builder.md new file mode 100644 index 0000000..8196638 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmer_set_builder.md @@ -0,0 +1,49 @@ +# `obikmer` K-mer Set Group Builder — Functional Overview + +The `KmerSetGroupBuilder` enables scalable construction of k-mer indexes from biological sequences, supporting both new and incremental (append) workflows. It operates in two phases: **collection** of super-kmers into partitioned temporary files (`.skm`), and **finalization**, where partitions are processed in parallel into final k-mer indexes (`.kdi`). + +## Core Features + +- **K-mer & Minimizer Configuration**: + Supports `k ∈ [2,31]`; auto-computes optimal minimizer size (`m ≈ k/2.5`) and partition count (up to `4^m`, capped at 4096). + +- **Functional Options for Filtering**: + - `WithMinFrequency(n)`: Keep only k-mers with frequency ≥ *n* (enables deduplication). + - `WithMaxFrequency(n)`: Discard k-mers with frequency > *n*. 
+ - `WithEntropyFilter(threshold, levelMax)`: Remove low-complexity k-mers (entropy ≤ threshold). + - `WithSaveFreqKmers(n)`: Save top-*n* most frequent k-mers per set to `top_kmers.csv`. + +- **Concurrent & Pipeline-Aware Processing**: + Uses a two-stage pipeline: *I/O-bound readers* (2–4 goroutines) feed k-mers to *CPU-bound workers*, one per core, maximizing throughput. + +- **Partitioned I/O & Thread Safety**: + Super-kmers are written to per-partition `.skm` files using mutex-protected writers, enabling safe concurrent `AddSequence()` calls. + +## Workflow + +1. **Build Phase**: + - Input sequences → super-kmers extracted via minimizer-based partitioning. + - Super-kmers written to `.build/set_*/part_*.skm`. + +2. **Finalization (`Close()`)**: + - `.skm` files loaded → canonical k-mers extracted. + - K-mers sorted, counted (frequency spectrum), and filtered per config. + - Final `.kdi` files written; `spectrum.bin`, and optionally `top_kmers.csv`. + - Metadata (`metadata.toml`) generated; `.build/` cleaned. + +3. **Append Mode**: + `AppendKmerSetGroupBuilder()` extends an existing group, inheriting its parameters and appending new sets. + +## Output Artifacts + +- `.kdi`: Sorted, deduplicated (and optionally filtered) k-mers. +- `spectrum.bin`: Per-set frequency spectrum (`count → #k-mers`). +- `top_kmers.csv` (optional): Top *N* k-mers per set with counts. +- `metadata.toml`: Global and per-set metadata (k, m, partitions, counts). + +## Design Highlights + +- **Memory-efficient**: Streams large `.skm` files; reuses slices to minimize GC pressure. +- **Scalable**: Parallel finalization scales with CPU cores and I/O bandwidth. +- **Robust error handling**: Early termination on first failure; cleanup of partial state. 
+ diff --git a/autodoc/docmd/pkg/obikmer/kmer_set_builder_test.md b/autodoc/docmd/pkg/obikmer/kmer_set_builder_test.md new file mode 100644 index 0000000..931d0da --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmer_set_builder_test.md @@ -0,0 +1,44 @@ +# K-mer Set Group Builder — Semantic Description + +This Go module (`obikmer`) provides a **disk-backed builder and accessor** for managing *k-mer sets* across multiple biological sequence datasets. It supports efficient construction, persistence, and querying of canonical *k*-mers (accounting for DNA reverse-complement symmetry), with optional frequency filtering. + +### Core Functionalities + +- **K-mer Set Group Construction**: + `NewKmerSetGroupBuilder` creates a builder configured with: + - *k* (k-mer length), + - *m* (minimal unique substring for partitioning), + - number of sets (`nSets`), + - and optional parameters like `WithMinFrequency`. + +- **Sequence Ingestion**: + Sequences are added per set via `AddSequence(setID, bioseq)`. Internally: + - Canonical *k*-mers are extracted (using `IterCanonicalKmers`), + - Deduplicated and optionally filtered by occurrence frequency. + +- **Persistence & Round-Trip**: + `builder.Close()` materializes the *k*-mer sets to disk (in temp or specified directory). + `OpenKmerSetGroup(dir)` reloads them — preserving all metadata and structure. + +- **Metadata & Attributes**: + Supports custom identifiers (`SetId`) and key-value attributes (e.g., `"organism": "test"`), saved to disk via `SaveMetadata`. + +- **Efficient Iteration**: + The iterator (`ksg.Iterator(setID)`) yields *sorted*, deduplicated canonical *k*-mers — using a k-way merge across internal partitions. + +- **Frequency Filtering**: + `WithMinFrequency(n)` ensures only *k*-mers appearing ≥*n* times across inputs survive — enabling noise suppression (e.g., in error correction or abundance-based filtering). 
+ +- **Multi-set Support**: + Handles multiple independent *k*-mer sets (e.g., per sample or taxonomic group), verified via `Size()` and indexed access (`Len(setID)`). + +### Testing Coverage + +Comprehensive unit tests validate: +- Basic construction & correctness, +- Multi-sequence ingestion and deduplication, +- Frequency-based inclusion/exclusion logic, +- Cross-set isolation (`nSets > 1`), +- Metadata round-trip integrity. + +This module is designed for scalable, reproducible *k*-mer indexing in metagenomic or amplicon analysis pipelines (e.g., OBITools4 ecosystem). diff --git a/autodoc/docmd/pkg/obikmer/kmer_set_disk.md b/autodoc/docmd/pkg/obikmer/kmer_set_disk.md new file mode 100644 index 0000000..e8b38e0 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmer_set_disk.md @@ -0,0 +1,44 @@ +# `obikmer` Package: Disk-Based K-mer Set Group Management + +The `obikmer` package provides a streaming, disk-backed implementation for managing collections of *k*-mer sets (called **K-mer Set Groups**), optimized for large-scale metagenomic or genomic analyses. + +### Core Concepts +- A **KmerSetGroup** stores *N* disjoint sets of sorted *k*-mers, partitioned into *P* files per set. +- Each group is defined by immutable parameters: `k` (k-mer size), `m` (minimizer size), and *P* partitions. +- Data is stored on disk as `.kdi` files (sorted k-mers) with optional sparse indices (`.kdx`) for fast lookup. +- Metadata is serialized in TOML format (`metadata.toml`), supporting both group-level and per-set attributes. + +### Key Functionalities + +#### 1. **Lifecycle Management** +- `OpenKmerSetGroup(directory)` loads an existing index in read-only mode. +- `NewFilteredKmerSetGroup(...)` constructs a new group (e.g., after filtering). +- `SaveMetadata()` persists metadata changes to disk. + +#### 2. **Accessors & Metadata** +- Basic properties: `K()`, `M()`, `Partitions()`, `Size()` (i.e., *N*), and group ID. 
+- Attribute API: get/set/delete user-defined metadata (group-level or per-set). + - Supports type coercion (`GetIntAttribute`, `GetStringAttribute`). + +#### 3. **Membership & Iteration** +- `Contains(setIndex, kmer)` checks presence using indexed binary search + linear scan across all partitions (parallelized). +- `Iterator(setIndex)` yields sorted *k*-mers via k-way merge of partition readers. + +#### 4. **Similarity & Distance Metrics** +- `JaccardDistanceMatrix()` and `JaccardSimilarityMatrix()`: compute pairwise metrics in a streaming fashion. + - Per-partition processing with parallel goroutines and sorted merge for accurate set intersection/union counts. + +#### 5. **Set Management** +- `CopySetsByIDTo(ids, destDir)` copies selected sets (with metadata) to another group. + - Supports compatibility checks and optional overwriting (`force`). +- `RemoveSetByID(id)` deletes a set, renumbers remaining sets for contiguous indices. +- Glob pattern matching: `MatchSetIDs(patterns)` resolves IDs like `"sample_*"`. + +#### 6. **Compatibility & Utility** +- `IsCompatibleWith(other)` verifies same `(k, m, partitions)`. +- Helper methods: `PartitionPath`, `Spectrum(...)`, and spectrum file I/O. + +### Design Highlights +- **Streaming**: Operations avoid loading full datasets into memory. +- **Immutability after creation** ensures consistency; modifications require explicit save operations. +- Thread-safe for concurrent partition processing (via `sync.Mutex`/`WaitGroup`). diff --git a/autodoc/docmd/pkg/obikmer/kmer_set_disk_ops.md b/autodoc/docmd/pkg/obikmer/kmer_set_disk_ops.md new file mode 100644 index 0000000..2d564d4 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmer_set_disk_ops.md @@ -0,0 +1,26 @@ +# Semantic Description of `obikmer` Set Operations + +This Go package implements scalable set operations over collections of *k*-mers stored in disk-backed, sorted structures (`.kdi` files). 
A `KmerSetGroup` represents a group of *N* disjoint sets (e.g., per-sample or per-partition), each containing sorted unique *k*-mers. + +## Core Set Operations + +- **`Union()`**: Computes the union across all *N* sets — a k-mer appears in output if present in ≥1 input set. +- **`Intersect()`**: Computes the intersection — a k-mer appears only if present in *all* sets. +- **`Difference()`**: Computes `set₀ \ (set₁ ∪ … ∪ setₙ₋₁)` — keeps k-mers unique to the first set. +- **`QuorumAtLeast(q)`**: Returns k-mers present in ≥ *q* sets. +- **`QuorumExactly(q)`**: Returns k-mers present in exactly *q* sets. +- **`QuorumAtMost(q)`**: Returns k-mers present in ≤ *q* sets. + +## Pairwise Group Operations + +- **`UnionWith(other)` / `IntersectWith(other)`**: Performs *per-set* binary operations between two compatible groups (same k, m, partitions, size). Result has *N* sets: `setᵢ = this.setᵢ ⊕ other.setᵢ`, where ⊕ is union or intersection. + +## Implementation Highlights + +- **Partitioned & Parallelized**: Each operation processes partitions in parallel using `runtime.NumCPU()` workers. +- **Streaming K-way Merge**: Uses efficient sorted-stream merging (via `KWayMerge`) to avoid loading full sets into memory. +- **Quorum Filtering**: Counts occurrences per k-mer across partitions by merging sorted streams and tallying hits. +- **Compatibility Check**: Ensures groups share metadata (k, m, partitions) before pairwise operations. +- **Disk Output**: All results materialized as new `KmerSetGroup` in a directory, with per-partition `.kdi` files and metadata. + +All operations preserve sorted order and support large-scale genomic datasets via streaming, partitioning, and minimal memory footprint. 
diff --git a/autodoc/docmd/pkg/obikmer/kmer_set_disk_ops_test.md b/autodoc/docmd/pkg/obikmer/kmer_set_disk_ops_test.md new file mode 100644 index 0000000..3751098 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmer_set_disk_ops_test.md @@ -0,0 +1,28 @@ +# Semantic Description of `obikmer` Package Functionalities + +The `obikmer` package provides disk-backed operations on *k*-mer sets derived from biological sequences. It supports scalable set algebra and similarity computations via the `KmerSetGroup` type. + +## Core Features + +- **Sequence-to-*k*-mer Indexing**: Sequences are converted into *k*-mers (of length `k`) and stored in a group of sets (`KmerSetGroup`), with one set per sequence. Minimizer-based sampling (parameter `m`) reduces redundancy. + +- **Set Operations on Disk**: Efficient disk-resident implementations of standard set operations: + - `Union`: Merges all *k*-mers from selected sets. + - `Intersect`: Retains only *k*-mers present in all input sets. + - `Difference` (`A \ B`): Keeps *k*-mers present in set A but not in B. + - `QuorumAtLeast(r)`: Returns *k*-mers appearing in ≥`r` sets (generalizes union (`r=1`) and intersection (`r=n`)). + +- **Consistency Guarantees**: Operations obey mathematical identities (e.g., `|A ∪ B| = |A| + |B| − |A ∩ B|`), validated via unit tests. + +- **Similarity & Distance Metrics**: + - `JaccardDistanceMatrix()`: Computes pairwise Jaccard *distances* (1 − similarity) between all sets. + - `JaccardSimilarityMatrix()`: Computes pairwise Jaccard *similarities* (`|A ∩ B| / |A ∪ B|`). + - Identical sets yield distance = `0.0`, disjoint ones give `1.0`; similarity is complementary. + +## Design Principles + +- **Temporary Directory Usage**: All operations use OS temp dirs for isolation and cleanup. +- **Testing-Focused API**: Helper functions (`buildGroupFromSeqs`, `collectKmers`) simplify test setup. +- **Scalability**: Disk-backed design avoids memory overflow for large sequence collections. 
+ +This package enables robust, reproducible *k*-mer set analysis in bioinformatics pipelines—especially useful for metagenomic binning, error correction, or read clustering. diff --git a/autodoc/docmd/pkg/obikmer/kmermap.md b/autodoc/docmd/pkg/obikmer/kmermap.md new file mode 100644 index 0000000..bea5a41 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/kmermap.md @@ -0,0 +1,37 @@ +# Semantic Description of `KmerMap` Functionality + +The provided Go package implements a **k-mer indexing and matching system** for biological sequences (`BioSequence`). It supports both standard and *sparse* k-mer representations (where one position is masked, typically for handling ambiguous bases or symmetry). + +### Core Data Structures +- `KmerMap[T]`: A generic hash map associating *normalized* k-mers (type `T`, e.g., uint64 encoded in 2 bits per base) to lists of sequences containing them. +- `KmerMatch`: A map from sequence pointers to k-mer match counts, used for query results. + +### Key Features +1. **K-mer Normalization** + - Handles both forward and reverse-complement k-mers. + - Selects the lexicographically smaller representation (canonical form). + - Supports *sparse* k-mers: when `SparseAt ≥ 0`, the central base is ignored (replaced by `#` in string view), and k-mers are symmetrically normalized. + +2. **Efficient Indexing (`Push`)** + - Builds an index of all canonical k-mers from a set of sequences. + - Optionally limits per-k-mer storage (`maxocc`), useful for filtering high-frequency k-mers (e.g., contaminants). + +3. **Querying (`Query`)** + - Given a query sequence, returns all sequences in the index sharing k-mers with it. + - Counts per-sequence how many shared k-mers exist (used for similarity estimation or clustering). + +4. **Result Utilities (`KmerMatch`)** + - `FilterMinCount`: Remove low-count matches. + - `Max()`, `Sequences()`: Retrieve best match or all matched sequences. + +5. 
**Construction (`NewKmerMap`)** + - Automatically adjusts k-mer size: odd for sparse mode, even otherwise. + - Precomputes bitmasks for efficient k-mer manipulation (masking, shifting). + - Integrates progress bar during indexing. + +### Use Cases +- Read clustering (e.g., OTU/ASV picking). +- Error correction via k-mer abundance. +- Sequence similarity search or contamination screening. + +The implementation leverages low-level bit operations for performance and memory efficiency, especially critical in large-scale NGS data processing. diff --git a/autodoc/docmd/pkg/obikmer/minimizer_utils.md b/autodoc/docmd/pkg/obikmer/minimizer_utils.md new file mode 100644 index 0000000..4736742 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/minimizer_utils.md @@ -0,0 +1,27 @@ +# Minimizer Size Utilities in `obikmer` + +This Go package provides helper functions to compute and validate the **minimizer size** `m` in k-mer-based genomic algorithms (e.g., minimizer schemes for sequence comparison or indexing). + +## Core Functions + +- **`DefaultMinimizerSize(k)`** + Returns a *recommended* minimizer size: `ceil(k / 2.5)`, clamped to `[1, k−1]`. + → Ensures `m` is reasonably large for uniqueness while keeping window size (`k − m + 1`) manageable. + +- **`MinMinimizerSize(nworkers)`** + Computes the *minimum* `m` such that there are ≥ `nworkers` distinct minimizers: + solves `4^m ≥ n_workers`, i.e., `ceil(log₄(nworkers))`. + → Guarantees enough diversity for parallelization (e.g., hashing-based distribution across workers). + +- **`ValidateMinimizerSize(m, k, nworkers)`** + Enforces constraints on `m`: + - Lower bound: ≥ `MinMinimizerSize(nworkers)` (warns & adjusts if violated) + - Hard bounds: `1 ≤ m < k` + → Prevents invalid or inefficient parameter choices. 
+ +## Semantic Purpose + +These functions ensure that minimizer-based workflows are: +- **Theoretically sound** (sufficient entropy for parallelism), +- **Practically viable** (avoiding degenerate cases like `m = 0` or `m ≥ k`), +- **User-friendly** (providing sensible defaults + clear warnings on adjustment). diff --git a/autodoc/docmd/pkg/obikmer/skm_reader.md b/autodoc/docmd/pkg/obikmer/skm_reader.md new file mode 100644 index 0000000..63c588c --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/skm_reader.md @@ -0,0 +1,24 @@ +# SKM File Reader for Super-Kmers + +This Go package provides a binary file reader (`SkmReader`) for `.skm` files, which store *super-kmers* — compact representations of DNA sequences using 2-bit encoding. + +## Core Functionality + +- **Binary Format Parsing**: Reads structured data from `.skm` files, where each record contains: + - A 2-byte little-endian integer specifying the sequence length. + - Packed nucleotide data, where every byte encodes up to four bases (2 bits per base). + +- **Decoding Logic**: Converts packed 2-bit codes (`00`, `01`, `10`, `11`) to nucleotide characters using the mapping: + `{ 'a', 'c', 'g', 't' }`. + +- **Memory-Efficient Reading**: Uses buffered I/O (64 KiB buffer) for fast sequential access. + +- **Streaming Interface**: `Next()` returns the next super-kmer as a struct with: + - `Sequence`: decoded nucleotide byte slice. + - `Start`, `End`: positional metadata (currently fixed to full length). + +- **Resource Management**: Provides a clean `.Close()` method for file handle cleanup. + +## Use Case + +Designed for high-performance processing of large genomic datasets (e.g., in k-mer analysis or sequence indexing), where storage size and read speed are critical. 
diff --git a/autodoc/docmd/pkg/obikmer/skm_test.md b/autodoc/docmd/pkg/obikmer/skm_test.md new file mode 100644 index 0000000..c406749 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/skm_test.md @@ -0,0 +1,23 @@ +# SKM File Format Specification + +This Go package implements a binary format for storing *super-kmers*—compact representations of DNA sequences used in bioinformatics. The tests validate reading/writing, padding behavior, and file size correctness. + +## Core Functionalities + +- **SuperKmer Structure**: Each super-kmer stores a DNA sequence (as bytes), likely padded to 4-base boundaries for efficient storage. +- **SkmWriter**: Serializes super-kmers into a binary file. Each entry writes: + - A 2-byte little-endian length (number of bases), + - Then `ceil(length/4)` bytes encoding nucleotides in 2 bits each (A=0, C=1, G=2, T=3). +- **SkmReader**: Parses the binary format back into memory. Returns `(SuperKmer, bool)` via `Next()`, with EOF signaled by `ok = false`. +- **Case Handling**: Writes preserve original case; reads normalize to lowercase (via `| 0x20` in tests), ensuring robust comparison. + +## Test Coverage + +- **Round-trip integrity**: Verifies exact sequence recovery after write/read. +- **Empty file handling**: Confirms reader returns `ok = false` immediately on empty files. +- **Variable-length padding**: Validates correct encoding/decoding for sequences of length 1–5. +- **Size validation**: Confirms file size = `2 + ceil(L/4)` bytes for a sequence of length *L*. + +## Use Case + +Efficient, lossless storage and retrieval of super-kmers for downstream genomic analysis (e.g., assembly or alignment acceleration). 
diff --git a/autodoc/docmd/pkg/obikmer/skm_writer.md b/autodoc/docmd/pkg/obikmer/skm_writer.md new file mode 100644 index 0000000..9fb098c --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/skm_writer.md @@ -0,0 +1,24 @@ +# `.skm` File Format and `SkmWriter` Functionality + +The Go package `obikmer` provides a binary writer for `.skm` (super-kmer) files, optimized for compact storage of DNA sequences. + +- **Purpose**: Efficiently serialize *super-kmers* (long k-mers) into a binary format. +- **Format per super-kmer**: + - `len: uint16 LE` — length of the sequence in bases (little-endian, 2 bytes). + - `data: ⌈len/4⌉ bytes` — nucleotide sequence encoded as **2 bits per base**, packed tightly. + +- **Encoding scheme**: + - `A → 00`, `C → 01`, `G → 10`, `T → 11`. + - Padding: trailing bits in the final byte are zeroed if `len % 4 ≠ 0`. + +- **Implementation details**: + - Uses buffered I/O (`bufio.Writer` with 64 KiB buffer) for performance. + - `NewSkmWriter(path)` opens/creates the file and returns a writer instance. + - `Write(sk SuperKmer)` encodes sequence length, then packs bases using a lookup (`__single_base_code__[seq[pos]&31]`). + - `Close()` flushes buffers and closes the file handle. + +- **Use case**: Ideal for high-throughput genomic preprocessing (e.g., indexing, sketching), where space and I/O speed matter. + +- **Assumptions**: `SuperKmer` type exposes a `.Sequence []byte`; bases are ASCII (`A,C,G,T,a,c,g,t`) — `&31` normalizes to lowercase index. + +- **Efficiency**: 4× compression vs. ASCII (1 byte/base → ~0.25 bytes/base), minimal overhead. diff --git a/autodoc/docmd/pkg/obikmer/spectrum.md b/autodoc/docmd/pkg/obikmer/spectrum.md new file mode 100644 index 0000000..9bc00dc --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/spectrum.md @@ -0,0 +1,35 @@ +# K-mer Spectrum Analysis Package (`obikmer`) + +This Go package provides tools for analyzing k-mer frequency distributions in biological sequences. 
+ +## Core Data Structures + +- **`SpectrumEntry`**: Represents a bin in the k-mer frequency spectrum: + `Frequency`: how often a k-mer was observed; `Count`: number of distinct k-mers with that frequency. + +- **`KmerSpectrum`**: A sorted list of non-zero `SpectrumEntry`s (ascending by frequency), enabling efficient statistics and serialization. + +## Key Functionalities + +### Spectrum Management +- `MapToSpectrum()` / `ToMap()`: Convert between map and structured spectrum representations. +- `MergeSpectraMaps()` / `MergeTopN()`: Combine spectral or top-k data from multiple sources. +- `MaxFrequency()` returns the highest observed k-mer count. + +### I/O & Persistence +- Binary format (`KSP\x01` magic header) with varint encoding for compact storage: + - `WriteSpectrum()` / `ReadSpectrum()`: Save/load full spectra to disk. +- CSV export: + - `WriteTopKmersCSV()`: Outputs top-k k-mers with their sequences (decoded from uint64) and frequencies. + +### Top-N K-mer Tracking +- Uses a **min-heap** to efficiently maintain the *N most frequent* k-mers in streaming scenarios: + - `NewTopNKmers(n)`: Initialize collector. + - `Add(kmer, freq)`: Insert/update while respecting capacity *n*. + - `Results()`: Return top-kmers sorted descending by frequency. + +## Design Highlights +- Memory-efficient: Uses `uint64` for k-mers (suitable up to *k* ≤ 32). +- Streaming-friendly: Top-N collector supports incremental updates. +- Thread-safety note: External synchronization required for concurrent access. + diff --git a/autodoc/docmd/pkg/obikmer/superkmer.md b/autodoc/docmd/pkg/obikmer/superkmer.md new file mode 100644 index 0000000..5ab418b --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/superkmer.md @@ -0,0 +1,48 @@ +# SuperKmer and Minimizer-Based Sliding Window Analysis + +This Go package provides functionality for extracting *super k-mers* from DNA sequences using a minimizer-based sliding window approach. 
+ +## Core Concepts + +- **K-mers**: Substrings of length `k` from a DNA sequence. +- **Minimizer**: The lexicographically smallest canonical *m*-mer (substring of length `m`) among all `(k − m + 1)` overlapping *m*-mers in a given k-mer. +- **Super K-mer**: A maximal contiguous subsequence where *every* consecutive k-mer shares the **same minimizer**. + +## Data Structures + +### `SuperKmer` +Represents a maximal region with uniform minimizer: +- `Minimizer`: Canonical 64-bit hash of the shared m-mer. +- `Start`, `End`: Slice-style bounds (0-indexed, exclusive end). +- `Sequence`: Raw byte slice of the DNA subsequence. + +### `dequeItem` +Used internally to maintain a monotone deque: +- `position`: Index of the m-mer in the sequence. +- `canonical`: Canonical hash value (e.g., lexicographically smallest of forward/reverse-complement). + +## Main Function + +### `ExtractSuperKmers(seq, k, m, buffer)` +- Extracts all maximal super k-mers from `seq`. +- Parameters validated: + - `1 ≤ m < k`, + - `2 ≤ k ≤ 31`, + - sequence length ≥ `k`. +- Uses an efficient **O(n)** time algorithm via internal iteration. +- Supports optional preallocation (`buffer`) to reduce memory allocations. + +## Algorithm Highlights + +- Maintains a sliding window of size `k − m + 1` over *m*-mers. +- Tracks the current minimizer using a monotone deque for O(1) updates per step. +- Detects *minimizer transitions* to delimit super k-mer boundaries. + +## Complexity + +| Aspect | Bound | +|---------------|-------------------------------| +| Time | **O(n)** (linear in sequence length) | +| Space | **O(k − m + 1)** for deque + output size | + +Useful in genome compression, read clustering, and minimizer-based alignment acceleration. 
diff --git a/autodoc/docmd/pkg/obikmer/superkmer_iter.md b/autodoc/docmd/pkg/obikmer/superkmer_iter.md new file mode 100644 index 0000000..59d740c --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/superkmer_iter.md @@ -0,0 +1,32 @@ +# Super K-mers Extraction Module (`obikmer`) + +This Go package provides efficient tools for extracting **super k-mers** from DNA sequences using *minimizer-based sliding windows*. Super k-mers are maximal contiguous subsequences sharing the same minimal canonical minimizer in a window of size `k`. + +## Core Functionality + +- **`IterSuperKmers(seq, k, m)`** + Returns an iterator over `SuperKmer` structs. Each struct contains: + - `Start`, `End`: genomic positions of the super k-mer in the original sequence + - `Minimizer`: canonical minimizer value (uint64) for that segment + - `Sequence`: the actual DNA subsequence + +- **`SuperKmer.ToBioSequence(...)`** + Converts a raw `SuperKmer` into an enriched `obiseq.BioSequence`, embedding metadata: + - ID: `{parentID}_superkmer_{start}_{end}` + - Attributes: minimizer sequence (`minimizer_seq`), value, `k`, `m`, positions, and parent ID + +- **`SuperKmerWorker(k, m)`** + A `SeqWorker` adapter for pipeline integration (e.g., with `obiiter`). Processes a full BioSequence and returns all extracted super k-mers as a slice of `BioSequence`s. 
+ +## Algorithm Highlights + +- Uses **canonical minimizers** (forward/reverse-complement minimum) to ensure strand-invariance +- Maintains a monotonic deque for efficient *sliding-window minimizer* tracking (O(n) time complexity) +- Supports DNA bases `A/C/G/T/U` case-insensitively via bitmasking (`seq[i] & 31`) +- Enforces parameter constraints: `1 ≤ m < k ≤ 31`, sequence length ≥ `k` + +## Use Cases + +- Read partitioning in metagenomics (e.g., for error correction or clustering) +- Efficient k-mer space segmentation without storing all individual kmers +- Integration into modular bioinformatics pipelines via `SeqWorker` interface diff --git a/autodoc/docmd/pkg/obikmer/superkmer_iter_test.md b/autodoc/docmd/pkg/obikmer/superkmer_iter_test.md new file mode 100644 index 0000000..1f0e67b --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/superkmer_iter_test.md @@ -0,0 +1,39 @@ +# Semantic Description of `obikmer` Package Functionalities + +The `obikmer` package provides tools for **super k-mer extraction and minimizer-based sequence analysis** in bioinformatics. + +## Core Concepts + +A **super k-mer** is a maximal contiguous subsequence of DNA where *all* embedded *k*-mers share the **same minimizer**—a compact representative (typically lexicographically minimal) of *m*-mers, considering both forward and reverse-complement strands. + +## Key Functions & Features + +- **`IterSuperKmers(seq, k, m)`**: + An iterator over all super *k*-mers in input sequence `seq`, parameterized by: + - `k`: length of embedded *k*-mers, + - `m`: size of minimizer window (`m ≤ k`). + Yields structured objects with: + - `Sequence`: the super *k*-mer substring, + - `Start`/`End`: genomic coordinates (0-based half-open), + - `Minimizer`: canonical hash of the shared minimizer. + +- **`ExtractSuperKmers(...)`**: + Synchronous counterpart returning a slice of all super *k*-mers. + +## Verified Properties (via Tests) + +1. **Boundary correctness**: Extracted subsequences match `seq[start:end]`. 
+2. **Consistency between iterator and slice versions**: Both APIs produce identical results. +3. **Bijection property**: + - Each unique super *k*-mer sequence maps to exactly one minimizer. + - All embedded *k*-mers within a super *k-mer* share the same minimizer. + +## Implementation Notes + +- Minimizers are computed canonically (min of forward and reverse-complement encodings). +- Uses base encoding via `__single_base_code__` (assumed helper mapping A/C/G/T → 0/1/2/3). +- Tests cover simple, homopolymer-rich, and complex genomic patterns. + +## Design Rationale + +Super *k*-mers enable efficient compression, indexing (e.g., in minimizer spaces), and alignment-free comparisons—crucial for scalable genomic analysis. diff --git a/autodoc/docmd/pkg/obikmer/varint.md b/autodoc/docmd/pkg/obikmer/varint.md new file mode 100644 index 0000000..88c030a --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/varint.md @@ -0,0 +1,33 @@ +# Variable-Length Integer Encoding/Decoding Utility + +This Go package (`obikmer`) provides efficient serialization of `uint64` integers using **protobuf-style variable-length encoding (varint)**. + +## Core Features + +- ✅ `EncodeVarint(io.Writer, uint64) (n int, err error)` + Writes a `uint64` as a compact varint to any `io.Writer`. Uses **7 bits per byte**, with the MSB as a continuation flag. Max 10 bytes for `uint64`. + +- ✅ `DecodeVarint(io.Reader) (val uint64, err error)` + Reads and decodes a varint from any `io.Reader`. Handles multi-byte sequences safely; returns error on malformed input or overflow (>70 bits). + +- ✅ `VarintLen(uint64) int` + Computes the exact byte length required to encode a value *without* performing I/O — useful for buffer preallocation or size estimation. + +## Encoding Scheme + +- Each byte holds 7 bits of data; bit 8 (MSB) = `1` if more bytes follow, else `0`. 
+- Example: + - `0x7F` → `1 byte`: `0111_1111` + - `0x80` → `2 bytes`: `1000_0000 0000_0001` + +## Use Cases + +- Network protocols & binary file formats requiring compact integer representation +- Serialization frameworks (e.g., custom protobuf-like codecs) +- Embedded systems or bandwidth-constrained environments where space efficiency matters + +## Design Notes + +- No external dependencies; uses only `io` from the standard library. +- Thread-safe *per call* (no shared state), but `io.Reader`/`Writer` concurrency must be handled externally. +- Compatible with standard protobuf varint format (e.g., interoperable with `encoding/binary` or gRPC). diff --git a/autodoc/docmd/pkg/obikmer/varint_test.md b/autodoc/docmd/pkg/obikmer/varint_test.md new file mode 100644 index 0000000..626e817 --- /dev/null +++ b/autodoc/docmd/pkg/obikmer/varint_test.md @@ -0,0 +1,37 @@ +# Varint Encoding and Decoding Module (`obikmer`) + +This Go package implements **variable-length integer encoding/decoding**, commonly used in binary protocols (e.g., Protocol Buffers, SQLite) to efficiently store small integers using fewer bytes. + +## Core Features + +- **`EncodeVarint(w io.Writer, v uint64) (n int, err error)`** + Encodes a `uint64` value into the minimal number of bytes (1–10) using **LEB128-style varint**, writing the result to a writer. Returns bytes written and any I/O error. + +- **`DecodeVarint(r io.Reader) (uint64, error)`** + Reads and decodes a varint from an `io.Reader`, reconstructing the original `uint64`. Fails on malformed or incomplete data. + +- **`VarintLen(v uint64) int`** + Computes the exact number of bytes required to encode `v`, without performing I/O. + +## Test Coverage + +- **Round-trip correctness**: All test values (including edge cases like `0`, powers of two, and max `uint64`) encode → decode back identically. +- **Length validation**: Encoded length matches `VarintLen` predictions exactly (e.g., 127 → 1 byte; 16384 → 3 bytes). 
+- **Sequence handling**: Multiple varints can be concatenated and decoded in order, preserving data integrity. + +## Efficiency & Design + +- Uses **7-bit groups per byte**, with the MSB as a continuation flag (`1` = more bytes follow). +- Minimal memory footprint — no allocations beyond buffer I/O. +- Designed for streaming use (e.g., network or file serialization). + +## Edge Cases Verified + +| Value | Encoded Length | +|----------------|---------------| +| `0` | 1 byte | +| `2⁷−1 = 127` | 1 byte | +| `2⁷ = 128` | 2 bytes | +| `2¹⁴−1 = 16383`| 2 bytes | +| `^uint64(0)` | **10 bytes** | + diff --git a/autodoc/docmd/pkg/obilog/warning.md b/autodoc/docmd/pkg/obilog/warning.md new file mode 100644 index 0000000..0bfcc25 --- /dev/null +++ b/autodoc/docmd/pkg/obilog/warning.md @@ -0,0 +1,30 @@ +# `obilog` Package — Semantic Overview + +The `obilog` package provides a lightweight, conditional logging interface for the OBItools4 ecosystem. It wraps `logrus`, a structured logger, to emit warnings only when explicitly allowed by application-wide settings. + +## Core Functionality + +- **`Warnf(format string, args ...interface{})`** + A convenience wrapper around `logrus.Warnf`, enabling formatted warning messages. It respects a global "silent warnings" toggle defined in `obidefault.SilentWarning()`. + +## Design Intent + +- **Conditional Warning Output**: + Warnings are suppressed when `obidefault.SilentWarning()` returns `true`, supporting quiet or batch execution modes (e.g., CI pipelines, automated runs). + +- **Consistency & Integration**: + Leverages `obidefault` to enforce centralized control over verbosity, aligning logging behavior with higher-level application configuration. + +- **Minimal Abstraction**: + Keeps the interface simple and idiomatic, avoiding over-engineering while preserving flexibility for future extensions (e.g., adding `Debugf`, `Infof` wrappers). 
+ +## Use Case + +Ideal for non-fatal issues in command-line tools or libraries—where warnings should be visible by default but suppressible on demand, without altering core logic. + +## Dependencies + +- `logrus`: Structured logging backend +- `obidefault`: Configuration layer for global behavior (e.g., silence mode) + +> **Note**: This package is *not* a full logging subsystem—it’s a targeted, policy-aware warning emitter. diff --git a/autodoc/docmd/pkg/obilua/lua.md b/autodoc/docmd/pkg/obilua/lua.md new file mode 100644 index 0000000..da07ef8 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/lua.md @@ -0,0 +1,33 @@ +# Obilua: Lua-Based Sequence Processing Framework + +The `obilua` package provides a bridge between Go and the Lua scripting language for high-performance, parallelizable biological sequence processing. It enables users to write custom analysis logic in Lua while leveraging Go’s concurrency and I/O capabilities. + +## Core Features + +- **Lua Interpreter Initialization**: `NewInterpreter()` creates an isolated Lua state preloaded with Obi-specific types (`BioSequence`, etc.). +- **Compilation Support**: `Compile()` and `CompileScript()` parse and compile Lua code into efficient function prototypes. +- **Worker Conversion**: `LuaWorker(proto)` wraps a compiled Lua script as a Go-compatible `SeqWorker`, allowing seamless integration into sequence pipelines. +- **Pipeline Integration**: + - `LuaProcessor()` executes a Lua script over an iterator of sequences using configurable parallelism. + - It supports optional `begin()` and `finish()` hook functions in Lua for initialization/cleanup. + - Errors can be handled either by halting (`breakOnError=true`) or logging warnings. + +- **Pipeable Interface**: + - `LuaPipe()` and `LuaScriptPipe()` expose Lua scripts as reusable, chainable pipeline stages (`obiiter.Pipeable`), supporting both inline programs and external `.lua` files. 
+ +## Lua API Contract + +Scripts must define a global `worker(sequence)` function returning either: +- A single `BioSequence` +- A list (`BioSequenceSlice`) +Or return nothing (interpreted as filtered out). + +Optionally, `begin()` and `finish()` functions may be defined for lifecycle management. + +## Parallel Execution + +Uses Go routines to run multiple workers concurrently, with batched input and output management. Default worker count falls back to system-wide parallelism settings if `nworkers ≤ 0`. + +## Logging & Error Handling + +Uses Logrus for structured logging; fatal errors are logged during setup, while runtime issues respect the `breakOnError` flag. diff --git a/autodoc/docmd/pkg/obilua/lua_obicontext.md b/autodoc/docmd/pkg/obilua/lua_obicontext.md new file mode 100644 index 0000000..015aad7 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/lua_obicontext.md @@ -0,0 +1,29 @@ +# `obilua` Module: Lua-Accessible Shared Context with Thread Safety + +This Go package exposes a thread-safe, shared key-value context to Lua scripts via the Gopher-Lua interpreter. + +## Core Features + +- **Global `obicontext` Table**: Registered in Lua with the following methods: + - `obicontext.item(key [, value])`: + Get or set a context variable. Supports types: `bool`, number, string, tables (converted via helper), and user data. + - `obicontext.lock()`: Acquire exclusive lock on the context (blocking). + - `obicontext.unlock()`: Release the global lock. + - `obicontext.trylock()`: Attempt to acquire non-blocking lock; returns boolean success. + - `obicontext.inc(key)` / `dec(key)`: Atomically increment/decrement numeric values (float64 only), with lock protection. + +## Thread Safety + +- Uses `sync.Mutex` for serializing write operations (e.g., inc/dec, lock/unlock). +- `sync.Map` for concurrent-safe read/write of key-value pairs. +- Critical sections (e.g., increment/decrement) are explicitly wrapped with locks to ensure atomicity. 
+ +## Lua Integration + +- Values stored in the context persist across script calls. +- Type coercion is handled explicitly: Lua types map directly to Go equivalents, with fallback logging on unsupported types. +- Errors (e.g., incrementing non-number) trigger fatal logs—suitable for controlled environments. + +## Use Case + +Ideal for embedding Lua logic in Go applications requiring shared state (e.g., config, counters), with explicit locking for race-free updates. diff --git a/autodoc/docmd/pkg/obilua/lua_push_interface.md b/autodoc/docmd/pkg/obilua/lua_push_interface.md new file mode 100644 index 0000000..926aa5e --- /dev/null +++ b/autodoc/docmd/pkg/obilua/lua_push_interface.md @@ -0,0 +1,31 @@ +# Semantic Description of `obilua` Package + +The `obilua` package provides utilities for **bi-directional data marshaling between Go and Lua**, specifically focusing on converting native Go values into equivalent `lua.LValue` types for use in a Lua state (`*lua.LState`). This enables Go applications to expose structured data (e.g., maps, slices) or synchronization primitives (`*sync.Mutex`) directly to Lua scripts. + +## Core Functionality + +- **`pushInterfaceToLua(L, val)`**: + Main dispatcher that inspects the type of a Go `interface{}` value and routes it to specialized conversion functions. Supported types include: + - Basic scalar types: `string`, `bool`, `int`, `float64` + - Collections: + - Maps: `map[string]{string,int,bool,float64,interface{}}` + - Slices/arrays: `[]{string,int,byte,float64,bool,interface{}}` + - Special cases: + - `nil` → Lua’s `LNil` + - `*sync.Mutex` (via dedicated handler) + +- **Type-Specific Pushers**: + Each helper function (`pushMapStringIntToLua`, `pushSliceBoolToLua`, etc.) constructs a new Lua table and populates it with converted elements using appropriate `lua.LValue` constructors (`LString`, `LNumber`, `LBool`). + - Maps are converted as associative tables (keyed by string). + - Slices become indexed Lua arrays (`1..n`). 
+ +- **Generic Slice Support**: + `pushSliceNumericToLua[T]()` uses Go generics to handle numeric slices (`int`, `float64`, `byte`) uniformly. + +## Design Notes + +- **No reverse conversion** (Lua → Go) is included — only *pushing* to Lua. +- **Strict typing**: Unsupported types trigger a fatal log (`log.Fatalf`), enforcing explicit type handling. +- **Lua semantics respected**: Tables are 1-indexed, and numeric types map to `lua.LNumber`. + +This package is ideal for embedding Lua in Go services where dynamic configuration, rule evaluation, or scripting requires safe and predictable data injection. diff --git a/autodoc/docmd/pkg/obilua/lua_table.md b/autodoc/docmd/pkg/obilua/lua_table.md new file mode 100644 index 0000000..fb431a1 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/lua_table.md @@ -0,0 +1,28 @@ +# Semantic Description of `obilua` Package + +This Go package provides utilities for converting Lua tables—used in a Gopher-Lua environment—to native Go data structures. + +- **`Table2Interface`**: + Converts a Lua `*lua.LTable` into either: + - A Go slice (`[]interface{}`) if the table is array-like (keys are numeric, starting at 1), preserving order and type coercion (`nil`, `bool`, `float64`, `string`). + - A Go map (`map[string]interface{}`) if the table contains string keys (i.e., a hash/dictionary). + +- **`Table2ByteSlice`**: + Specifically converts an array-like Lua table into a `[]byte`, assuming all values are numeric and ≤ 255. + - Fails with a fatal log if non-numeric or out-of-range values are encountered. + - Also fails fatally for hash-like (non-array) tables. + +- **Key Design Notes**: + - Type coercion is explicit and safe: only `LTNil`, `LTBool`, `LTNumber`, `LTString` are supported. + - Array detection relies on key type: if *all* keys are `LNumber`, the table is treated as an array. + - Uses [`logrus`](https://github.com/sirupsen/logrus) for fatal error reporting. 
+ - No dependency on external serialization (e.g., JSON); conversions are direct and lightweight. + +- **Use Cases**: + - Bridging Lua scripting layers with Go backends (e.g., embedded config parsing, plugin systems). + - Efficiently extracting structured data from Lua state into idiomatic Go types. + +> ⚠️ **Limitations**: +> - No support for nested tables or custom types. +> - Array indexing assumes 1-based Lua semantics (converted to 0-indexed Go slices). +> - No error handling: misuse triggers `log.Fatalf`. diff --git a/autodoc/docmd/pkg/obilua/mutex.md b/autodoc/docmd/pkg/obilua/mutex.md new file mode 100644 index 0000000..ffaabec --- /dev/null +++ b/autodoc/docmd/pkg/obilua/mutex.md @@ -0,0 +1,30 @@ +# `obilua.Mutex`: Thread-Safe Synchronization in Lua via Go's sync.Mutex + +This package exposes **Go’s `sync.Mutex`** to the Lua environment using [gopher-lua](https://github.com/yuin/gopher-lua), enabling safe concurrent access from Lua scripts. + +## Key Features + +- **Custom userdata type**: Registers a new metatable `"Mutex"` in the Lua state. +- **Constructor function**: + - ` Mutex.new() → mutex userdata` + Creates and returns a new Go-backed mutex instance. +- **Instance methods**: + - `mutex:lock()` — Acquires the lock (blocks until available). + - `mutex:unlock()` — Releases the lock. +- **Type safety**: Validates that only valid mutex userdatas are passed to `lock`/`unlock`. +- **Integration**: Designed for embedding Lua in Go applications requiring synchronization (e.g., multi-threaded scripting). + +## Usage Example + +```lua +local m = Mutex.new() +m:lock() -- Acquire lock (safe across goroutines) +-- critical section +m:unlock() +``` + +## Implementation Notes + +- Mutex state is stored in a Go `*sync.Mutex` inside Lua userdata. +- No reference counting or finalizers — user must manually manage lock/unlock lifecycle to avoid deadlocks. 
+- Thread-safe *from Go side only*; Lua calls must respect goroutine safety (e.g., avoid calling from multiple VMs concurrently). diff --git a/autodoc/docmd/pkg/obilua/obilib.md b/autodoc/docmd/pkg/obilua/obilib.md new file mode 100644 index 0000000..6ffd5a7 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/obilib.md @@ -0,0 +1,30 @@ +# Obilib Module Overview + +The `obilua` package provides Lua bindings for core OBIL (Ontology-Based Information Library) functionality, enabling scripting and extension of ontological data processing within a Lua environment. + +## Core Components + +- **`RegisterObilib(luaState *lua.LState)`** + Main registration function; initializes and exposes OBIL modules to a given Lua state. + +- **`RegisterObiSeq(luaState *lua.LState)`** + Registers sequence-related operations (e.g., parsing, manipulation, and analysis of biological sequences like DNA/RNA/proteins). + +- **`RegisterObiTaxonomy(luaState *lua.LState)`** + Registers taxonomy utilities (e.g., classification, lineage lookup, and hierarchical navigation of taxonomic trees). + +## Semantic Capabilities + +- Enables *semantic querying* over structured biological data via Lua scripts. +- Supports integration of ontological reasoning (e.g., using GO, NCBI Taxonomy) in dynamic workflows. +- Provides extensibility: new modules can be added by implementing `Register*` functions. + +## Design Principles + +- Minimal, non-intrusive API: only exposes essential high-level operations. +- Leverages `gopher-lua` for seamless interoperability between Go and Lua. + +## Use Cases + +- Custom annotation pipelines in bioinformatics. +- Interactive exploration of ontologies and sequences (e.g., via REPL or embedded Lua engines). 
diff --git a/autodoc/docmd/pkg/obilua/obiseq.md b/autodoc/docmd/pkg/obilua/obiseq.md new file mode 100644 index 0000000..add9cbb --- /dev/null +++ b/autodoc/docmd/pkg/obilua/obiseq.md @@ -0,0 +1,34 @@ +# `obilua` Package: Biosequence Lua Bindings + +The `obilua` Go package provides **Lua bindings** for biological sequence objects (`obiseq.BioSequence`) used in the OBITools4 ecosystem. It enables scripting and automation of sequence analysis directly from Lua. + +## Core Functionality + +- **Type Registration**: Registers a new userdata type `BioSequence` in the Lua state, exposing methods and constructors. +- **Constructor**: + ```lua + BioSequence.new(id, sequence[, definition]) → BioSequence``` +- **Accessors & Mutators**: + - `id()`, `sequence()`, `definition()` – get/set identifiers and sequence data. + - `qualities([table])` – handle PHRED-quality scores (as Lua table or string). + - `count()`, `taxid()` – numeric abundance and taxonomic ID. +- **Taxonomy Integration**: + - `taxon([Taxon])` – get/set taxonomic assignment via integrated taxonomy engine. +- **Attributes**: + - `attribute(name[, value])` – arbitrary metadata storage (supports tables, strings, numbers). +- **Sequence Operations**: + - `len()` – length of the sequence. + - `has_sequence()`, `has_qualities()` – boolean checks for presence of data. +- **Computation & Transformation**: + - `subsequence(start, end)` – extract a region. + - `reverse_complement()` → BioSequence. + - `md5()`, `md5_string()` – compute sequence checksums (raw bytes or hex string). +- **Serialization**: + - `fasta([format])`, `fastq([format])` – output in FASTA/FASTQ, supporting `"json"` or `"obi"` header formats. + - `string([format])` – smart formatting: FASTQ if qualities present, else FASTA. + +## Implementation Notes + +- Uses `gopher-lua` for interpreter integration. +- UserData wrapping ensures type safety and GC management of Go-backed objects. +- Error handling via Lua `ArgError` or `RaiseError`. 
diff --git a/autodoc/docmd/pkg/obilua/obiseqslice.md b/autodoc/docmd/pkg/obilua/obiseqslice.md new file mode 100644 index 0000000..342e1b8 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/obiseqslice.md @@ -0,0 +1,31 @@ +# `obilua` Package: Lua Bindings for BioSequence Slicing + +This Go module provides **Lua scripting support** for biological sequence manipulation via the `obilua` package. It exposes a custom Lua type, `"BioSequenceSlice"`, wrapping Go’s `*obiseq.BioSequenceSlice` to enable high-level sequence operations in Lua. + +## Core Features + +- **Type Registration**: Registers `BioSequenceSlice` as a userdata type in Lua with metatable support. +- **Constructor**: `new([capacity])` creates a new slice (optionally pre-sized). +- **Indexing & Assignment**: `slice[i] = seq` or `seq = slice[i]`, with bounds checking. +- **Dynamic Operations**: + - `push(seq)`: Append a sequence. + - `pop()`: Remove and return the last sequence. +- **Length Query**: `len()` returns number of sequences in slice. + +## Output Formatting + +Provides multiple export methods to format all contained sequences: + +- `fasta([format])`: Returns FASTA string (supports `"json"` or `"obi"` headers). +- `fastq([format])`: Returns FASTQ string (same format options as above). +- `string([format])`: Smart formatter: + - Uses FASTQ if *all* sequences have quality scores. + - Falls back to FASTA otherwise. + +## Design Notes + +- All methods validate input types and indices. +- Format selection is optional; defaults to `"obi"` header style unless specified as `"json"`. +- Integrates with `obiseq.BioSequence` and formatting utilities from the OBItools4 ecosystem. + +This enables Lua users to process NGS data (e.g., FASTA/FASTQ) interactively within pipelines, leveraging Go’s performance and Lua’s expressiveness. 
diff --git a/autodoc/docmd/pkg/obilua/obitaxon.md b/autodoc/docmd/pkg/obilua/obitaxon.md new file mode 100644 index 0000000..da92147 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/obitaxon.md @@ -0,0 +1,30 @@ +# Lua Bindings for Taxonomic Operations in `obilua` + +This Go package provides a set of **Lua-accessible functions** for manipulating taxonomic data through the `obitax` library. It exposes a custom Lua type, `"Taxon"`, enabling users to create and query hierarchical taxonomic entities directly from Lua scripts. + +## Core Features + +- **Taxon Type Registration**: + A new userdata type `Taxon` is registered in the Lua state, with methods exposed via a metatable and `"__index"` delegation. + +- **Taxon Creation**: + The `Taxon.new(taxid, parent, sname, rank[, isroot])` constructor creates a new taxon node in the taxonomy. It supports optional root flag and raises errors on failure. + +- **Scientific Name Management**: + `taxon:scientific_name([newname])` gets or sets the scientific name of a taxon. + +- **Taxonomic Navigation**: + Methods allow upward/downward traversal: + - `taxon:parent()` → returns the parent taxon (or nil if root). + - `taxon:species()`, `.genus()`, `.family()` → return the nearest taxon at that rank. + - `taxon:taxon_at_rank(rank)` → returns the ancestor taxon at a given rank (e.g., `"order"`, `"class"`). + +- **String Representation**: + `taxon:string()` returns a human-readable string (typically the scientific name). + +- **Integration with Taxonomy Context**: + All operations assume an active taxonomy context (enforced via `checkTaxonomy`), and taxon instances are wrapped as Lua userdata with proper type checking. + +## Use Case + +Ideal for scripting biodiversity pipelines (e.g., in OBITools), where users need to dynamically inspect or build taxonomies during sequence annotation, filtering, or reporting. 
diff --git a/autodoc/docmd/pkg/obilua/obitaxonomy.md b/autodoc/docmd/pkg/obilua/obitaxonomy.md new file mode 100644 index 0000000..e08c3e0 --- /dev/null +++ b/autodoc/docmd/pkg/obilua/obitaxonomy.md @@ -0,0 +1,29 @@ +# ObiTax Lua Module Documentation + +This Go package (`obilua`) provides **Lua bindings** for the `obitax` taxonomy management module of OBItools4, enabling scripting in Lua with rich taxonomic operations. + +## Core Features + +- **Type Registration**: Registers two main types in the Lua state: `Taxonomy` and `Taxon`. +- **Factory Functions**: + - `obitax.Taxonomy.new(name, code [, charset])`: Creates a new taxonomy instance. + - `obitax.Taxonomy.default()`: Returns the globally configured default taxonomy (raises error if none exists). + - `obitax.Taxonomy.has_default()`: Boolean check for existence of a default taxonomy. + - `obitax.Taxonomy.nil`: Represents the nil taxon (used for missing data). + +## Taxonomy Object Methods + +- `name()`: Returns the taxonomy name (e.g., `"NCBI"`). +- `code()`: Returns the internal code used for taxonomic identifiers (e.g., `"txid"`). +- `taxon(id)`: Retrieves a taxonomic node by ID; returns: + - the corresponding *Taxon* object, + - raises an error if not found or on alias resolution when `FailOnTaxonomy()` is enabled. + +## Taxon Object Support + +- A dedicated `registerTaxonType` (not shown here) exposes a Lua-accessible *Taxon* type with methods like `rank`, `parent`, and string representation. + +## Integration + +- Built on top of standard OBItools4 types (`obitax.Taxonomy`, `obiutils.AsciiSetFromString`). +- Leverages GopherLua for seamless interoperability between Go and Lua. 
diff --git a/autodoc/docmd/pkg/obingslibrary/marker.md b/autodoc/docmd/pkg/obingslibrary/marker.md new file mode 100644 index 0000000..6bb2024 --- /dev/null +++ b/autodoc/docmd/pkg/obingslibrary/marker.md @@ -0,0 +1,40 @@ +# Semantic Description of `obingslibrary` Marker Module + +The `Marker` struct defines a molecular biology primer pair (forward/reverse) for PCR-based sample demultiplexing in high-throughput sequencing workflows. It supports flexible configuration of primer binding, tag (barcode) extraction, mismatch tolerance, and indel handling. + +## Core Functionalities + +- **Primer Pattern Compilation**: + `Compile()` and `Compile2()` initialize forward/reverse primer patterns using the underlying `obiapat.ApatPattern`, including reverse-complement variants (`cforward`, `creverse`). They accept parameters for maximum error tolerance and indel allowance. + +- **Sequence Matching & Demultiplexing**: + `Match()` scans a given sequence (`BioSequence`) for primer binding sites. It prioritizes forward-primer detection, then falls back to reverse if needed. For each match: + - Extracts primer region and adjacent tag (barcode). + - Computes mismatches. + - Links to a pre-registered `PCR` object via the tag pair (`TagPair`) key in internal map. + +- **Sample Registration & Lookup**: + `GetPCR()` retrieves or registers a new PCR reaction entry indexed by forward/reverse tag pair (case-insensitive). Enables tracking of sample-specific amplification data. + +- **Tag Length Validation**: + `CheckTagLength()` ensures all registered tags have uniform length for both directions; otherwise, returns an error. 
+ +- **Configurable Parameters**: + Supports tuning of: + - Tag lengths (`Forward_tag_length`, `Reverse_tag_length`) + - Spacer between tag and primer (`SetTagSpacer()`) + - Delimiter for tag-primer boundary (e.g., `a`, `c`, `g`, `t` or none via `'0'`) + - Allowed mismatches and indels per primer (`SetAllowedMismatch()`, `SetTagIndels()`) + - Matching strategy: `"strict"` (exact), `"hamming"`, or `"indel"` + +- **Matching Strategy Enforcement**: + `SetForward/ReverseMatching()` validates and sets matching modes; invalid values raise errors. + +## Design Highlights + +- Uses `log.Fatalf` for critical configuration failures (e.g., invalid delimiter). +- Leverages reference-counted sequences (`Recycle()`) for memory efficiency. +- Prioritizes forward primer match but gracefully handles reverse orientation. +- Fully supports case-insensitive tag comparison and normalization. + +This module serves as the core engine for sample assignment in amplicon-based NGS pipelines, balancing sensitivity (via error/indel tolerance) and specificity (through tag uniqueness). diff --git a/autodoc/docmd/pkg/obingslibrary/match.md b/autodoc/docmd/pkg/obingslibrary/match.md new file mode 100644 index 0000000..2b36726 --- /dev/null +++ b/autodoc/docmd/pkg/obingslibrary/match.md @@ -0,0 +1,32 @@ +# Demultiplexing Functionality in `obingslibrary` + +This package provides tools for **demultiplexing NGS reads** by matching them against known primer pairs and extracting associated barcodes. + +## Core Types + +- `DemultiplexMatch`: Struct holding alignment results for forward/reverse primers, mismatches, barcode coordinates (`BarcodeStart`, `BarcodeEnd`), and metadata (e.g., sample/experiment info via `PCR`). Includes error handling. + +## Key Methods + +- **`Match(sequence)`**: + Scans the input `BioSequence` against all primer pairs in `NGSLibrary.Markers`. Returns a populated `DemultiplexMatch` if any primer pair matches. 
+ +- **`ExtractBarcode(sequence, inplace)`**: + Uses the result of `Match()` to: + - Extract the barcode region (if valid: non-dimer). + - Reverse-complement if read is in reverse orientation (`IsDirect == false`). + - Annotate the sequence with: + - Primer names and match details (positions, mismatches). + - Direction (`direct`/`reverse`). + - Sample/experiment info (if assignment succeeds), or error message. + +## Behavior Notes + +- **Primer dimer detection**: If `BarcodeStart > BarcodeEnd`, the read is flagged as a primer dimer and not extracted. +- **Error handling**: Errors (e.g., no match, failure to assign a sample) are stored in `match.Error` and propagated as annotations. +- **Annotation richness**: Output sequences carry rich metadata (sample, experiment, primers, errors), supporting downstream filtering/analysis. + +## Dependencies + +- Uses `logrus` for fatal logging (e.g., subsequence extraction failure). +- Integrates with `obiseq.BioSequence` for sequence representation and manipulation. diff --git a/autodoc/docmd/pkg/obingslibrary/multimatch.md b/autodoc/docmd/pkg/obingslibrary/multimatch.md new file mode 100644 index 0000000..a662ade --- /dev/null +++ b/autodoc/docmd/pkg/obingslibrary/multimatch.md @@ -0,0 +1,43 @@ +# Semantic Description of `obingslibrary` Package + +The `obingslibrary` package provides core functionality for **multiplexed high-throughput sequencing (HTS) data processing**, specifically designed to extract, validate, and assign biological samples from NGS reads using **dual-indexed barcodes** flanked by primers. + +## Key Functionalities + +1. **Primer & Tag Matching Structures** + - `PrimerMatch`: Encodes location, orientation (`Forward`), mismatch count, and marker identity of primer hits. + - `TagMatcher`: Functional interface for extracting sample-specific tags from sequence regions. + +2. **Distance Metrics** + - `Hamming`: Counts character mismatches between equal-length strings (for strict mismatch tolerance).
+ - `Levenshtein`: Computes edit distance allowing insertions/deletions (for indel-tolerant matching). + +3. **Tag Extraction Strategies** + - `lookForTag`: Extracts delimited tags (e.g., between two identical delimiters). + - `lookForRescueTag`: Robustly extracts tags despite indels or variable delimiter lengths. + - `*Fixed/Delimited/RescueTagExtractor` methods: Support three tag formats per primer direction (fixed-length, delimited with exact delimiters, or rescue-tolerant). + +4. **Marker & Library Abstraction** + - `NGSLibrary`: Holds a map of primer pairs (`PrimerPair`) to `Marker` objects. + - Each `Marker`: Defines forward/reverse primer sequences, tag specifications (length/spacer/delimiter/indels), and sample-to-tag mappings. + +5. **Tag Assignment & Sample Identification** + - `TagExtractor`: Extracts forward/reverse tags from primer-flanked regions and annotates them. + - `SampleIdentifier`: Matches extracted tags to known samples using configurable matching modes: + - `"strict"`: Exact match only. + - `"hamming"`: Closest tag by Hamming distance (substitutions). + - `"indel"`: Closest tag by Levenshtein distance. + - Annotates results with matching mode, distances, and proposed tags. + +6. **Multi-Barcode Extraction** + - `ExtractMultiBarcode`: Scans a full sequence for primer pairs (forward/reverse + their complements), detects valid amplicon intervals, and: + - Extracts the internal barcode region. + - Assigns tags → sample via `SampleIdentifier`. + - Annotates each barcode with primer matches, errors, directionality. + - Handles both orientations (`forward` and `reverse`) of the amplicon. + +7. **Parallel Processing Integration** + - `ExtractMultiBarcodeSliceWorker`: Returns a reusable worker function for batch processing sequences, supporting options like indel tolerance and mismatch limits. 
+ +## Use Case +This package enables **demultiplexing** of NGS reads in amplicon-based workflows (e.g., metabarcoding), where samples are labeled with unique dual barcodes. It ensures robustness against sequencing errors and supports flexible tag design. diff --git a/autodoc/docmd/pkg/obingslibrary/ngslibrary.md b/autodoc/docmd/pkg/obingslibrary/ngslibrary.md new file mode 100644 index 0000000..dafed69 --- /dev/null +++ b/autodoc/docmd/pkg/obingslibrary/ngslibrary.md @@ -0,0 +1,17 @@ +# Semantic Description of `obingslibrary` Package + +The `obingslibrary` package defines core data structures and methods for managing **PCR-based NGS library designs**, particularly in metabarcoding workflows. + +- `PrimerPair` and `TagPair`: Represent forward/reverse primer or tag sequences. +- `PCR`: Encapsulates a single PCR amplification experiment with sample metadata and annotations (via `obiseq.Annotation`). +- `NGSLibrary`: Central struct storing primer definitions (`Primers`) and associated marker specifications (`Markers`), where each `Marker` defines how primers (and attached tags) are processed. + +Key functionality includes: +- **Dynamic marker creation**: `GetMarker()` lazily initializes a new `Marker` for any primer pair if not already present. +- **Compilation**: Two compilation modes (`Compile`, `Compile2`) prepare internal search structures (e.g., error-tolerant index) using user-defined parameters like max errors and indel allowance. +- **Tag configuration**: Methods to set spacer length, delimiter character (e.g., `N` or `X`), and indel tolerance for tags—globally (`SetTagSpacer`, etc.) or per-primer. +- **Matching strategy**: Configure alignment behavior (e.g., `"strict"` vs. `"fuzzy"`) via `SetMatching*`. +- **Unicity & validation**: `CheckPrimerUnicity()` ensures no primer is reused across multiple markers and prevents self-complementary pairs. +- **Error handling**: Supports configurable mismatch/indel budgets per primer direction. 
+ +This library enables flexible, reproducible specification of molecular identifiers (tags) and amplification primers—essential for accurate demultiplexing and sequence assignment in high-throughput sequencing pipelines. diff --git a/autodoc/docmd/pkg/obingslibrary/worker.md b/autodoc/docmd/pkg/obingslibrary/worker.md new file mode 100644 index 0000000..1b12dc7 --- /dev/null +++ b/autodoc/docmd/pkg/obingslibrary/worker.md @@ -0,0 +1,31 @@ +# PCR Simulation and Barcode Extraction Module + +This Go package (`obingslibrary`) provides configuration-driven tools for **PCR simulation and barcode extraction** from NGS libraries. + +## Core Concepts + +- `Options`: A fluent configuration object for customizing behavior via functional setters. +- Default options are defined in `MakeOptions`, supporting: + - Error handling (`discardErrors`) + - Mismatch/indel tolerance via `allowedMismatch` and `allowsIndel` + - Parallelization (`parallelWorkers`) and batching control (`batchSize`) + - Optional progress tracking (`withProgressBar`) + +## Key Functionalities + +- **Barcode Extraction**: + - `ExtractBarcodeSlice`: Extracts barcodes from a slice of sequences using the NGS library, applying configured error handling and alignment parameters. + - `ExtractBarcodeSliceWorker`: Returns a reusable worker function for batch processing (e.g., in pipelines or parallel workers). + +- **Compilation Step**: + - `ngslibrary.Compile(...)` prepares internal indexing based on mismatch/indel settings before extraction. + +- **Error Handling**: + - If `discardErrors` is true (default), sequences causing extraction errors are filtered out. + - Alternatively, error-containing reads can be retained or logged via `OptionUnidentified`. + +## Design Highlights + +- Uses the *option pattern* for extensibility and clean API. +- Integrates with default settings from `obidefault` (e.g., parallelism, batch size). +- Designed for both direct use and integration into concurrent workflows. 
diff --git a/autodoc/docmd/pkg/obioptions/options.md b/autodoc/docmd/pkg/obioptions/options.md new file mode 100644 index 0000000..3655522 --- /dev/null +++ b/autodoc/docmd/pkg/obioptions/options.md @@ -0,0 +1,47 @@ +# OBIOptions Package: Global Command-Line Interface Utilities + +The `obioptions` package provides shared command-line argument parsing and runtime configuration for OBITools4 commands. It centralizes common options, logging setup, profiling controls, and taxonomy handling. + +## Core Functionalities + +- **Global Option Registration**: `RegisterGlobalOptions()` defines shared flags such as: + - `--version`, `--debug` + - CPU/thread control (`--max-cpu`) + - Batch processing parameters: `--batch-size`, `--size-max`, `--batch-mem` + - Quality encoding (`--solexa`) + - Warning suppression (`--silent-warning`) + +- **Option Processing**: `ProcessParsedOptions()` handles post-parsing logic: + - Prints help/version and exits on request + - Loads default taxonomy via `obiformats.LoadTaxonomy()` + - Configures log level (debug/info) + - Starts `pprof` HTTP servers for performance profiling: + - Generic (`/debug/pprof`) + - Mutex contention (`--pprof-mutex`, `runtime.SetMutexProfileFraction()`) + - Goroutine blocking (`--pprof-goroutine`, `runtime.SetBlockProfileRate()`) + +- **Parser Generator**: `GenerateOptionParser()` builds a reusable argument parser with: + - Bundled short options (`-abc`) + - Strict unknown-option handling + - Automatic `--help` support + +## Taxonomy Integration + +- `LoadTaxonomyOptionSet()` registers taxonomy-specific flags: + - Required/optional path to DB (`--taxonomy`, `-t`) + - Alternative names search (`--alternative-names`) + - Taxonomic validation: `--fail-on-taxonomy`, automatic updates via `--update-taxid` + - Raw taxID output (`--raw-taxid`) + - Leaf sequences inclusion via `--with-leaves` + +## Runtime Accessors + +- `CLIIsDebugMode()`, `SeqAsTaxa()` → read current state +- `SetDebugOn/Off()` → programmatic debug toggling + 
+## Design Principles + +- Environment variable support (`OBIMAXCPU`, `OBIWARNING`, etc.) +- Thread-safe taxonomy loading with mutex +- Graceful error handling (parse errors → help + exit) +- Integration with `logrus` and Go’s standard profiling tools diff --git a/autodoc/docmd/pkg/obioptions/subcommand.md b/autodoc/docmd/pkg/obioptions/subcommand.md new file mode 100644 index 0000000..dca7e6f --- /dev/null +++ b/autodoc/docmd/pkg/obioptions/subcommand.md @@ -0,0 +1,23 @@ +# Semantic Description of `GenerateSubcommandParser` + +The function `GenerateSubcommandParser` constructs a command-line argument parser with support for **subcommands**, leveraging the `go-getoptions` library. + +- It accepts: + - `program`: The program name (used for help/version). + - `documentation`: A top-level description of the tool. + - `setup`: A callback to register subcommands and their options. + +- Internally: + - Initializes a `GetOpt` instance with bundling mode (`-abc`) and strict unknown-option handling. + - Registers **global options** (e.g., `--debug`, `--verbose`) that are inherited by all subcommands. + - Invokes the user-provided `setup` function to define **subcommand-specific options and commands**. + - Automatically adds a built-in `help` subcommand for command-level documentation. + +- Returns: + - The root `*GetOpt`, required to invoke `.Dispatch()`. + - An `ArgumentParser` function (signature: `func([]string) (*GetOpt, []string)`), which: + - Parses command-line arguments (skipping `args[0]`, typically the binary name), + - Handles errors via `ProcessParsedOptions`, + - Returns parsed state and remaining positional arguments. + +This design enables a clean, hierarchical CLI structure: global flags → subcommands → per-command options/positional args. 
diff --git a/autodoc/docmd/pkg/obioptions/version.md b/autodoc/docmd/pkg/obioptions/version.md new file mode 100644 index 0000000..8543c60 --- /dev/null +++ b/autodoc/docmd/pkg/obioptions/version.md @@ -0,0 +1,35 @@ +# OBIOptions Package – Semantic Description + +The `obioptions` package provides a lightweight, version-aware utility for the OBITools suite. Its core functionality is centered around exposing runtime version information in a standardized and programmatic way. + +## Key Features + +- **Version Exposure**: + Exposes the current version of OBITools via a simple, read-only function `VersionString()`. This allows other modules or external tools to query the package version at runtime. + +- **Automated Versioning**: + The `_Version` variable is automatically populated from an external `version.txt` file during the build process (via Makefile), ensuring consistency between source metadata and compiled artifacts. + +- **Patch-Level Tracking**: + The version follows semantic conventions (`MAJOR.MINOR.PATCH`), with the patch number incremented automatically on each repository push—enabling precise tracking of development iterations. + +- **No Side Effects**: + The `VersionString()` function is pure: it takes no parameters and performs only a string return, making it safe for use in logging, diagnostics, or compatibility checks. + +- **Documentation Ready**: + Includes inline GoDoc comments for clarity and tooling support (e.g., `go doc`), improving maintainability. + +## Use Cases + +- Debugging and logging (e.g., including version in error reports). +- Conditional logic based on OBITools compatibility. +- CI/CD validation (e.g., verifying deployed version matches expectations). + +## Version Format + +`"Release X.Y.Z"` (e.g., `"Release 4.4.29"`), where: +- `X` = Major release (breaking changes), +- `Y` = Minor release (new features, backward-compatible), +- `Z` = Patch level (incremented per push for hotfixes/bug fixes). 
+ +No external dependencies or configuration required. diff --git a/autodoc/docmd/pkg/obiphylo/tree.md b/autodoc/docmd/pkg/obiphylo/tree.md new file mode 100644 index 0000000..5894f22 --- /dev/null +++ b/autodoc/docmd/pkg/obiphylo/tree.md @@ -0,0 +1,30 @@ +# `obiphylo` Package: Semantic Description + +The `obiphylo` package provides a minimal yet expressive data structure and utilities for representing **phylogenetic trees** in Go. + +## Core Type: `PhyloNode` +- Represents a node (taxon or internal branch point) in a phylogeny. +- Fields: + - `Name`: Optional label (e.g., species name, OTU ID). + - `Children`: A map of child nodes to **branch lengths** (evolutionary distances). + - `Attributes`: A flexible key-value store for metadata (e.g., bootstrap support, posterior probability). + +## Key Functionalities +- **Tree Construction**: + - `NewPhyloNode()`: Instantiates an empty node. + - `AddChild(child, distance)`: Appends a child with associated branch length (supports NaN for unlabeled branches). +- **Metadata Access**: + - `SetAttribute(key, value)` / `GetAttribute(key)`: Enables extensible node annotation. + - Supports arbitrary types (via `any`), ideal for dynamic metadata. + +## Output: Newick Format +- Recursive method `Newick(level int)` generates a **human-readable, standard phylogenetic tree string**: + - Properly indented for readability. + - Supports branch lengths (`:distance`) on edges (skips if `NaN`). + - Terminates with semicolon (`;`) at root level. +- Designed for interoperability (e.g., export to tools like RAxML, FigTree). + +## Design Notes +- Lightweight and dependency-free. +- Uses Go’s idiomatic maps for efficient child lookup (O(1) average). +- Recursive Newick generation ensures correct nesting and formatting. 
diff --git a/autodoc/docmd/pkg/obiseq/attributes.md b/autodoc/docmd/pkg/obiseq/attributes.md new file mode 100644 index 0000000..12a510a --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/attributes.md @@ -0,0 +1,22 @@ +# BioSequence Attribute Management API + +This Go package (`obiseq`) provides a rich set of methods for managing metadata and structural attributes associated with biological sequences (`BioSequence`). Below is a semantic overview of the core functionalities: + +- **Key Discovery & Existence Checks**: + - `Keys()` and `AttributeKeys()` return all attribute names (optionally excluding container/statistics fields or the `"definition"` key). + - `HasAttribute(key)` verifies presence of a given attribute (including standard fields: `"id"`, `"sequence"`, `"qualities"`). + +- **Generic Attribute Access**: + - `GetAttribute(key)` retrieves any attribute value (as `interface{}`), with thread-safe locking. + - `SetAttribute(key, value)` assigns values to attributes (including automatic conversion for `"id"`, `"sequence"` and `"qualities"`). + +- **Typed Attribute Retrieval**: + - Type-specific getters (`GetIntAttribute`, `GetFloatAttribute`, `GetStringAttribute`, etc.) ensure safe conversion and *auto-upgrade* of stored values (e.g., string `"42"` → integer `42`). + - Supports maps (`GetIntMap`, `GetStringMap`) and slices (`GetIntSlice`). + +- **Convenience & Domain-Specific Helpers**: + - `Count()` / `SetCount()`: manage observation frequency (default = 1). + - OBITag indexing: `OBITagRefIndex()` / `SetOBITagRefIndex()`, and geometry variants (`geomref`). Supports flexible input map types with dynamic conversion. + - Coordinate & landmark support: `GetCoordinate()` / `SetCoordinate()`, and `landmark_id`-based operations (`IsALandmark()`, `GetLandmarkID()`). + +All methods are designed for robustness: they handle type conversions gracefully, use locking to ensure concurrency safety, and provide fallbacks (e.g., default count = 1). 
The API abstracts internal storage (`annotations` map) while exposing a clean, consistent interface for sequence annotation manipulation. diff --git a/autodoc/docmd/pkg/obiseq/biosequence.md b/autodoc/docmd/pkg/obiseq/biosequence.md new file mode 100644 index 0000000..8c78f49 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/biosequence.md @@ -0,0 +1,41 @@ +# BioSequence: A High-Performance Biological Sequence Representation + +The `obiseq` package defines the `BioSequence` struct, a memory-efficient and thread-safe container for biological DNA sequences. Beyond raw sequence data (`[]byte`), it supports rich metadata and operations essential for NGS pipelines. + +## Core Features + +- **Metadata Fields**: + - `id`: Unique sequence identifier. + - `source`: Filename (without path/extension) of origin. + - `definition`: Optional descriptive text, stored in annotations. + +- **Sequence & Quality Support**: + - Stores sequence as lowercase `[]byte` (normalized via in-place lowercasing). + - Quality scores (`Quality = []uint8`) with fallback to default Phred+40 values when missing. + - Methods for incremental writing (`Write`, `WriteByte`) and clearing. + +- **Annotations & Features**: + - Generic `Annotation` map (`map[string]interface{}`) for flexible metadata. + - Thread-safe access via `annot_lock` mutex (explicit locking/unlocking methods). + - Raw feature table storage (`[]byte`, e.g., EMBL/GenBank features). + +- **Biological Relationships**: + - `paired`: Pointer to mate/read-pair sequence. + - `revcomp`: Pointer to reverse-complement variant (lazy or precomputed). + +- **Introspection & Utility**: + - `Len()`, `HasSequence()`, `Composition()` (nucleotide counts: a,c,g,t,o). + - MD5 checksums (`MD5()` and `MD5String()`) for deduplication. + - Memory footprint estimation (`MemorySize()`), critical for streaming/batching. + +- **Efficiency Optimizations**: + - `NewBioSequenceOwning`/`TakeQualities`: Zero-copy slice adoption (caller must not reuse input). 
+ - `Recycle()`: Reuses slices via pool-aware functions (`RecycleSlice`, etc.). + - Global counters track creation/destruction/in-memory sequences for diagnostics. + +- **Safety & Compatibility**: + - Copy semantics via `Copy()` (deep copy of slices + annotations). + - Validation: `HasValidSequence` enforces allowed characters (`a-z`, `-`, `.`, `[`, `]`). + - Uses unsafe string conversion for quality ASCII output (Phred shift configurable via `obidefault`). + +Designed for scalability in large-scale metabarcoding workflows (e.g., OBITools4), balancing performance, correctness, and extensibility. diff --git a/autodoc/docmd/pkg/obiseq/biosequence_test.md b/autodoc/docmd/pkg/obiseq/biosequence_test.md new file mode 100644 index 0000000..4aa36e1 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/biosequence_test.md @@ -0,0 +1,35 @@ +# `obiseq` Package: Semantic Overview + +The `obiseq` package provides a robust, thread-safe implementation of biological sequence objects in Go. It defines the core `BioSequence` type and associated utilities for handling nucleotide sequences (DNA/RNA), quality scores, annotations, features, memory management, and metadata operations. + +### Core Functionalities + +- **Construction & Initialization** + - `NewEmptyBioSequence(cap)` creates an empty sequence with optional preallocated capacity. + - `NewBioSequence(id, seq, def)` builds a basic sequence with ID (case-normalized), byte-level sequence (`[]byte`), and definition. + - `NewBioSequenceWithQualities(...)` extends the above with per-base quality scores (`[]byte` or `Quality`). + +- **Accessors & Properties** + - `Id()`, `Definition()` return metadata fields. + - `Sequence()` returns the normalized (lowercase) sequence as a copy of internal bytes. + - `Len()` returns the length (number of bases). + - `String()` provides a human-readable sequence string. + +- **Quality & Feature Support** + - `HasQualities()` checks if quality scores are present. 
+ - `Qualities()`, `SetQualities(...)` manage per-base quality data (with fallback to default values). + - `Features()` retrieves optional feature annotations as a string. + +- **Annotation System** + - `Annotations()`, `HasAnnotation()` allow inspection of arbitrary metadata (key-value map). + - Thread-safe via internal `sync.Mutex`, exposed through `AnnotationsLock()`. + +- **Utility & Safety** + - `Recycle()` safely resets internal slices and annotations (enables object pooling). Handles nil receivers gracefully. + - `Copy()` performs deep copy of all fields, including annotations and locks (new mutex). + - `MD5()` computes the MD5 hash of the sequence bytes. + +- **Analysis Methods** + - `Composition()` returns a nucleotide count map (`a`, `c`, `g`, `t`, and `'o'` for others), case-insensitive. + +All operations are designed with performance, safety (nil-safety, copy semantics), and extensibility in mind—ideal for bioinformatics pipelines requiring immutable or pooled sequence handling. diff --git a/autodoc/docmd/pkg/obiseq/biosequenceslice.md b/autodoc/docmd/pkg/obiseq/biosequenceslice.md new file mode 100644 index 0000000..5436e15 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/biosequenceslice.md @@ -0,0 +1,37 @@ +# `obiseq` Package: BioSequence Collection Management + +The `obiseq` package provides a high-performance, memory-efficient implementation for managing collections of biological sequences (`BioSequence`) in Go. Its core type is `BioSequenceSlice`, a slice of pointers to `BioSequence` objects, optimized for batch processing in metagenomic pipelines. + +### Key Functionalities + +- **Memory Pooling & Allocation Control**: + `NewBioSequenceSlice` and `MakeBioSequenceSlice` allow creating slices with optional capacity hints. + `EnsureCapacity(capacity)` dynamically grows the underlying slice while logging warnings or panicking on persistent allocation failures. + +- **Efficient Element Management**: + - `Push(sequence)`: Appends a sequence to the end. 
+ - `Pop()`: Removes and returns the last element (nil-safe). + - `Pop0()`: Efficiently removes and returns the first element. + +- **Collection Metadata Queries**: + - `Len()`: Returns number of sequences in the slice. + - `Size()`: Computes total sequence length (summing all `.Len()`). + - `NotEmpty()`: Boolean check for non-empty collections. + +- **Attribute Aggregation**: + `AttributeKeys(skip_map, skip_definition)` aggregates all attribute keys across sequences into a set—useful for schema inference or validation. + +- **Sorting Capabilities**: + - `SortOnCount(reverse)`: Sorts by read count (descending/ascending). + - `SortOnLength(reverse)`: Sorts by sequence length. + +- **Taxonomy Integration**: + `ExtractTaxonomy(taxonomy, seqAsTaxa)` builds or extends a taxonomic tree from sequence paths. + When `seqAsTaxa=true`, it injects pseudo-taxonomic labels for individual sequences (e.g., `OTU:SEQ0000012345 [seqID]@sequence`), enabling unified taxonomic/rarefaction workflows. + +### Design Highlights + +- Minimal allocations via manual slice management and `slices.Grow`. +- Explicit niling of popped elements to aid garbage collection. +- Integrated logging (via `logrus`) for allocation issues—critical in large-scale NGS data processing. +- Designed to support `BioSequenceBatch`, a higher-level abstraction for streaming or parallelizable sequence batches. diff --git a/autodoc/docmd/pkg/obiseq/class.md b/autodoc/docmd/pkg/obiseq/class.md new file mode 100644 index 0000000..b020158 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/class.md @@ -0,0 +1,32 @@ +# BioSequence Classifier Module Overview + +This Go package (`obiseq`) provides a flexible and thread-safe framework for classifying biological sequences using different strategies. Each classifier implements four core methods: +- `Code(sequence) int`: assigns an integer class to a sequence. +- `Value(k) string`: retrieves the original value (or representation) for class index *k*. +- `Reset()`: clears internal state. 
+- `Clone() *BioSequenceClassifier`: creates a fresh copy of the classifier. + +## Supported Classifier Types + +1. **`AnnotationClassifier(key, na)`** + Classifies sequences based on a single annotation field. Missing annotations default to `na`. Internally maps string values → integer codes via a thread-safe dictionary. + +2. **`DualAnnotationClassifier(key1, key2, na)`** + Uses *two* annotation fields. Combines them (as JSON array) to form unique class identifiers, enabling multi-dimensional classification. + +3. **`PredicateClassifier(predicate)`** + Binary classifier: returns `1` if the provided predicate function evaluates to true, else `0`. Useful for rule-based grouping (e.g., length > 200). + +4. **`HashClassifier(size)`** + Assigns sequences to one of `size` buckets via CRC32 hash of the raw sequence. Deterministic and memory-efficient, but may cause collisions. + +5. **`SequenceClassifier()`** + Unique class per *exact* sequence string (case-sensitive). Uses a lock-protected map to deduplicate and index sequences. + +6. **`RotateClassifier(size)`** + Cyclic assignment: sequence *i* → class `i mod size`. No memoization; state resets only manually. + +7. **`CompositeClassifier(...)`** + Combines multiple classifiers: concatenates their integer outputs (e.g., `"3:17:0"`) to form a composite class key. Enables layered or hierarchical classification. + +All classifiers are immutable after creation (state is internal and synchronized), supporting concurrent use in pipelines. diff --git a/autodoc/docmd/pkg/obiseq/compare.md b/autodoc/docmd/pkg/obiseq/compare.md new file mode 100644 index 0000000..02fb472 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/compare.md @@ -0,0 +1,20 @@ +# Semantic Description of `obiseq` Comparison Functions + +The `obiseq` package provides utility functions for comparing biological sequence records (`*BioSequence`) based on different fields. 
These comparators are designed to support sorting, deduplication, or grouping operations in bioinformatics workflows. + +- **`CompareSequence(a, b *BioSequence) int`** + Compares the raw nucleotide or amino acid sequences (`a.sequence`) lexicographically using `bytes.Compare`. Returns: + - `<0` if `a < b`, + - `0` if equal, + - `>0` if `a > b`. + +- **`CompareQuality(a, b *BioSequence) int`** + Compares the base quality scores (`a.qualities`) lexicographically (as byte strings), following same semantics as above. Useful for sorting reads by quality profiles. + +- **Commented-out `CompareAttributeBuilder(key string)`** + A planned higher-order function to generate custom comparators based on sequence attributes (e.g., `RG`, `NM`). It would: + - Extract attribute values using `.GetAttribute(key)`. + - Handle missing attributes (treat absent as "less than" present). + - Eventually support typed comparisons for ordered types. + +These functions assume `BioSequence` implements a consistent internal structure with `.sequence []byte` and `.qualities []byte`. They enable flexible, field-based ordering in collections of sequencing records. diff --git a/autodoc/docmd/pkg/obiseq/eval.md b/autodoc/docmd/pkg/obiseq/eval.md new file mode 100644 index 0000000..2bf1ff5 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/eval.md @@ -0,0 +1,28 @@ +# Semantic Description of `obiseq` Expression-Based Workers + +This module provides **expression-driven transformation workers** for biological sequence objects (`BioSequence`). It leverages a custom expression language (via `OBILang`) to dynamically compute values based on sequence metadata and content. + +## Core Components + +- **`Expression(expression string)`**: + Returns a function that evaluates the given expression in context. The evaluation scope includes: + - `annotations`: sequence annotations (metadata). + - `sequence`: the full `BioSequence` object itself. 
+ +- **`EditIdWorker(expression string)`**: + A sequence worker that updates the *ID* of a `BioSequence` by evaluating the expression. + - On success: sets `sequence.Id()` to string representation of result. + - On failure: logs and returns an error with context. + +- **`EditAttributeWorker(key string, expression string)`**: + A sequence worker that sets a *custom attribute* (identified by `key`) on the sequence, using evaluated expression result. + - Supports arbitrary metadata enrichment. + - Errors are reported with sequence ID and failed expression. + +## Use Cases + +- Generate new IDs from annotation fields (e.g., `"gene_" + annotations["locus_tag"]`). +- Compute and store derived attributes (e.g., GC content, ORF length) as sequence metadata. +- Apply conditional logic or transformations across large sets of sequences in pipelines. + +All workers conform to the `SeqWorker` interface, enabling composition and chaining. diff --git a/autodoc/docmd/pkg/obiseq/iupac_nog.md b/autodoc/docmd/pkg/obiseq/iupac_nog.md new file mode 100644 index 0000000..ad96a69 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/iupac_nog.md @@ -0,0 +1,27 @@ +# Semantic Description of `obiseq` Package + +The `obiseq` package provides utilities for handling **IUPAC nucleotide ambiguity codes** in biological sequences. + +## Core Components + +- `_iupac`: A lookup table mapping lowercase ASCII letters (`a`–`z`) to numeric IUPAC nucleotide codes: + - `A=1`, `C=2`, `G=4`, `T/U=8` (standard bases) + - Ambiguous codes are bitwise OR combinations: + e.g., `R = A|G = 1+4=5`, `Y = C|T = 2+8=10`, etc. +- Invalid or non-nucleotide characters map to `0`. + +## Key Functionality + +### `SameIUPACNuc(a, b byte) bool` +Performs **case-insensitive comparison** of two nucleotide symbols using IUPAC ambiguity rules. + +- Converts uppercase letters to lowercase via bitwise OR (`|= 32`). 
+- For valid nucleotides, checks if their IUPAC codes have **non-zero bitwise AND**: + - Returns `true` only if the symbols share at least one possible base. + *Example*: `'R' & 'A' → (5 & 1) = 1 > 0 ⇒ true` + `'Y' & 'G' → (10 & 4) = 0 ⇒ false` +- For non-IUPAC or invalid characters, falls back to exact equality (`a == b`). + +## Use Case + +Enables robust comparison of DNA/RNA sequences where ambiguity codes (e.g., `N`, `R`, `W`) are used—critical for alignment, variant calling, or primer design tools. diff --git a/autodoc/docmd/pkg/obiseq/join.md b/autodoc/docmd/pkg/obiseq/join.md new file mode 100644 index 0000000..fc4e3d5 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/join.md @@ -0,0 +1,35 @@ +# `obiseq` Package: Sequence Concatenation via `.Join()` + +The `BioSequence.Join()` method enables semantic concatenation of two biological sequences (e.g., DNA, RNA, or protein strings). + +- **Signature**: + ```go + func (sequence *BioSequence) Join(seq2 *BioSequence, inplace bool) *BioSequence + ``` + +- **Purpose**: + Combines the current sequence (`sequence`) with a second one (`seq2`), returning a new or modified `BioSequence`. + +- **Parameters**: + - `seq2`: The sequence to append. Must be a valid `*BioSequence`. + - `inplace`: Boolean flag: if `true`, modifies the receiver in-place; otherwise, operates on a copy. + +- **Semantics**: + - If `inplace == false`, the method first creates a deep copy of the original sequence to avoid side effects. + - It then appends `seq2.Sequence()` (the underlying string/byte representation) to the target sequence using an internal `.Write()` method. + - The final concatenated result is returned as a `*BioSequence`. + +- **Behavioral Guarantees**: + - *Pure operation*: When `inplace = false`, the original sequences remain unaltered. + - *Chaining-friendly*: Returns a pointer, enabling method chaining (e.g., `seq.Join(a, false).Join(b, true)`). + +- **Use Cases**: + - Building multi-domain proteins or gene fusions. 
+ - Merging fragments from sequencing reads. + - Constructing synthetic constructs in silico. + +- **Assumptions**: + - `BioSequence.Sequence()` returns a valid string/byte slice. + - `.Write(...)` handles appending correctly (e.g., no validation of biological compatibility — e.g., frame shifts are not checked). + +This method supports flexible, functional-style sequence manipulation while preserving memory safety via optional in-place mutation. diff --git a/autodoc/docmd/pkg/obiseq/kmers.md b/autodoc/docmd/pkg/obiseq/kmers.md new file mode 100644 index 0000000..09c3944 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/kmers.md @@ -0,0 +1,20 @@ +## BioSequence.Kmers(k int) — Semantic Description + +The `Kmers` method is a generator function that yields all contiguous *k*-length subsequences (called **k-mers**) from a biological sequence (`BioSequence`). + +- It operates on `[]byte` data, assuming the underlying sequence is stored as a byte slice (e.g., DNA bases `A`, `C`, `G`, `T`). +- Uses Go’s new iterator protocol (`iter.Seq[[]byte]`) for memory-efficient, lazy evaluation. +- Validates input: returns an empty iterator if `k ≤ 0` or exceeds sequence length. +- Iterates linearly from index `i = 0` to `len(seq) - k`, extracting slices of length *k*. +- Each yielded value is a **non-copying slice view** (efficient, but mutable if original data changes). +- Supports early termination: the consumer can stop iteration by returning `false` from the yield callback. +- Designed for downstream tasks like sequence analysis, motif discovery, or hashing (e.g., in k-mer counting). +- Does *not* handle reverse-complement or ambiguous bases—assumes raw sequence input. + +Usage example: +```go +for kmer := range seq.Kmers(3) { + fmt.Printf("%s\n", string(kmer)) +} +``` +This yields all 3-mers (e.g., `"ACG"`, `"CGT"`...) in order. 
diff --git a/autodoc/docmd/pkg/obiseq/language.md b/autodoc/docmd/pkg/obiseq/language.md new file mode 100644 index 0000000..6e69544 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/language.md @@ -0,0 +1,41 @@ +# Semantic Description of `obiseq` Language Extensions + +The `package obiseq` extends the [Gval](https://github.com/PaesslerAG/gval) expression language with domain-specific functions tailored for bioinformatics and data processing. It integrates utility helpers from `obiutils` to provide type-flexible, robust operations over sequences and collections. + +## Core Functionalities + +- **Data Inspection**: + `len`, `ismap`, `isvector` — retrieve size and type information. + +- **Aggregation & Comparison**: + `min`, `max` — compute extremal values in slices/maps (via `obiutils.Min/Max`). + *(Note: commented-out helper functions suggest prior attempts at manual implementations.)* + +- **Type Conversion**: + `int`, `numeric` (→ float64), `bool`, `string` — safely coerce arbitrary inputs to target types; fail with fatal logs on invalid data. + +- **String Manipulation**: + `sprintf`, `subspc` (replace spaces with underscores), `replace` (regex-based substitution), and `substr` — support formatting, normalization, and slicing. + +- **Sequence Analysis (Bioinformatics)**: + `gc`, `gcskew`, and `composition` — compute nucleotide composition metrics for DNA/RNA sequences (`BioSequence`). + - `gc`: GC content ratio (excluding ambiguous bases `'o'`) + - `gcskew`: `(G−C)/(G+C)` asymmetry measure + - `composition`: returns a map of base counts (e.g., `"a":20.0`, `"g":15.0`) + +- **Element Access**: + `elementof(seq, idx)` — retrieves item at index/key for slices (`[]interface{}`), maps (`map[string]interface{}`), or strings (by byte position). + +- **Control Flow**: + `ifelse(cond, then_val, else_val)` — conditional branching within expressions. + +- **Quality Support**: + `qualities(seq)` — extracts per-base quality scores as a float slice from sequencing reads. 
+ +## Design Principles + +- **Dynamic Typing**: Accepts `...interface{}` arguments for flexibility. +- **Error Handling**: Uses fatal logging (`log.Fatalf`) on conversion failures; returns typed errors for runtime issues. +- **Extensibility**: Built atop `gval.Language`, enabling custom expression evaluation in pipelines (e.g., filtering reads via GC thresholds). + +This package serves as a bridge between high-level scripting and low-level biosequence computation, ideal for rule-based filtering or annotation in NGS workflows. diff --git a/autodoc/docmd/pkg/obiseq/merge.md b/autodoc/docmd/pkg/obiseq/merge.md new file mode 100644 index 0000000..19afb72 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/merge.md @@ -0,0 +1,39 @@ +# Semantic Description of `obiseq` Statistics and Merging Features + +This package provides infrastructure for **tracking, aggregating, and merging statistical occurrences** of sequence attributes across biological sequences (`BioSequence`). It supports both **count-based and weighted statistics**, with thread-safe operations. + +## Core Components + +- `StatsOnValues`: A concurrent map (`map[string]int`) with R/W locking to store occurrence counts per attribute value (e.g., taxon, primer, quality bin). +- `StatsOnDescription`: Defines *how* to extract and weight statistics from a sequence (e.g., count per read, or sum of quality scores). +- `StatsOnSlotName(key)`: Generates internal annotation keys (e.g., `"merged_taxon"`) to store precomputed statistics. + +## Key Functionalities + +1. **Per-Sequence Statistics Initialization & Update** + - `StatsOn(desc, na)`: Ensures a statistics slot exists for attribute `desc.Key`, initializes if needed. + - `StatsPlusOne(...)`: Adds contribution of a *single* sequence to the statistics (e.g., increment count for its taxon). + +2. **Thread-Safe Aggregation** + - `Merge(*StatsOnValues)`: Safely merges counts from another `StatsOnValues`, used to combine per-sequence stats. + +3. 
**Sequence Merging with Stat Propagation** + - `BioSequence.Merge(...)`: + - Combines two sequences (e.g., consensus/overlap). + - Updates statistics for specified attributes (`statsOn`), preserving or aggregating counts. + - Resolves conflicting annotations by deleting non-merged fields if mismatched. + +4. **Bulk Merging** + - `BioSequenceSlice.Merge(...)`: Efficiently merges *N* sequences into one, recycling inputs and updating statistics incrementally. + +## Use Cases + +- Tracking taxonomic assignments across merged reads. +- Aggregating primer or barcode counts in amplicon merging. +- Summarizing quality scores, abundance weights, or custom metadata during consensus building. + +## Design Notes + +- Uses `sync.RWMutex` for safe concurrent access. +- Supports only JSON-marshalable, serializable statistics (via `MarshalJSON`). +- Enforces type safety: only strings/integers/booleans allowed for attribute values. diff --git a/autodoc/docmd/pkg/obiseq/paired_reads.md b/autodoc/docmd/pkg/obiseq/paired_reads.md new file mode 100644 index 0000000..4a25a3a --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/paired_reads.md @@ -0,0 +1,19 @@ +# BioSequence Pairing Functionality + +This package provides semantic tools for managing biological sequence pairings—typically used in genomics (e.g., paired-end reads). Key features: + +- **Single-sequence pairing**: + - `IsPaired()` checks if a sequence is currently paired. + - `PairedWith()` returns the linked partner, or `nil`. + - `PairTo(p)` establishes a bidirectional link between two sequences. + - `UnPair()` safely severs the pairing on both ends. + +- **Batch (slice) handling**: + - `IsPaired()` and `UnPair()` operate uniformly across all sequences in a slice. + - `PairedWith()` returns the corresponding paired slice (element-wise). + - `PairTo(p)` enforces length compatibility and pairs sequences index-by-index. 
+ +- **Error handling**: + - Mismatched slice lengths during `PairTo` trigger a fatal log (via Logrus), preventing inconsistent pairings. + +Semantically, the API supports both *atomic* and *bulk* pairing operations while preserving consistency through bidirectional references—ideal for processing paired-end sequencing data. diff --git a/autodoc/docmd/pkg/obiseq/pool.md b/autodoc/docmd/pkg/obiseq/pool.md new file mode 100644 index 0000000..6fe8cbe --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/pool.md @@ -0,0 +1,34 @@ +# Semantic Overview of `obiseq` Package Functionalities + +This Go package (`obiseq`) provides memory-efficient utilities for managing slices and annotations—key data structures in biosequence processing. + +## Slice Management + +- **`GetSlice(capacity int) []byte`** + Retrieves a reusable `[]byte` with ≥ requested capacity. For capacities ≤1024 bytes, it pulls from a `sync.Pool` (`_BioSequenceByteSlicePool`). Larger slices are freshly allocated. + +- **`RecycleSlice(s *[]byte)`** + Clears and recycles small slices (≤1024 bytes) back to the pool. For large slices (≥100 KB), it nils them and triggers explicit `runtime.GC()` every ~256 MB of discarded memory to prevent heap bloat. + +- **`CopySlice(src []byte) []byte`** + Efficiently copies a source slice into a pooled or newly allocated destination, preserving semantics without unnecessary allocations. + +## Annotation Management + +- **`BioSequenceAnnotationPool`** + A `sync.Pool` for reusable map-based annotations (`map[string]string`, inferred from usage), initialized with capacity 1. + +- **`GetAnnotation(values ...Annotation) Annotation`** + Fetches an annotation map from the pool, optionally pre-populated via shallow copy of input annotations using `obiutils.MustFillMap`. + +- **`RecycleAnnotation(a *Annotation)`** + Clears all keys from an annotation map and returns it to the pool for reuse. 
+ +## Design Rationale + +The package prioritizes low-latency, high-throughput scenarios (e.g., NGS data pipelines) by minimizing GC pressure via: +- Tiered pooling strategy (`small` vs `large`) +- Explicit garbage collection triggers for large-object churn +- Safe reuse patterns avoiding aliasing or stale references + +All operations are thread-safe via `sync.Pool` and atomic counters. diff --git a/autodoc/docmd/pkg/obiseq/predicate.md b/autodoc/docmd/pkg/obiseq/predicate.md new file mode 100644 index 0000000..0602a28 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/predicate.md @@ -0,0 +1,33 @@ +# Sequence Predicate Framework in `obiseq` + +This Go package provides a flexible and composable predicate system for filtering biological sequences (`BioSequence`) based on diverse criteria. + +## Core Concepts + +- **`SequencePredicate`**: A function type `func(*BioSequence) bool`, enabling conditional logic on sequences. +- **Predicate Composition**: Supports logical operations (`And`, `Or`, `Xor`, `Not`) and chaining. +- **Paired-end Support**: Predicates can be adapted to consider read pairs via `PredicateOnPaired` and `PairedPredicat`, with modes: + - `ForwardOnly`: Only the forward read is evaluated. + - `ReverseOnly`, `And`, `Or`, `AndNot`, `Xor`: Combine forward and reverse evaluations. + +## Built-in Predicates + +| Predicate | Description | +|-----------|-------------| +| `HasAttribute(name)` | Checks if a sequence has an annotation with the given name. | +| `IsAttributeMatch(name, pattern)` | Tests if a named annotation matches the provided regex (case-sensitive). | +| `IsMoreAbundantOrEqualTo(count)` / `IsLessAbundantOrEqualTo(count)` | Filters by sequence abundance (count field). | +| `IsLongerOrEqualTo(length)` / `IsShorterOrEqualTo(length)` | Filters by sequence length. | +| `OccurInAtleast(sample, n)` | Checks if the sequence appears in at least *n* samples (via description stats). 
| +| `IsSequenceMatch(pattern)` | Matches the raw sequence against a regex (case-insensitive). | +| `IsDefinitionMatch(pattern)` | Matches the definition/description line against a regex. | +| `IsIdMatch(pattern)` / `IsIdIn(ids...)` | Filters by sequence ID using regex or explicit set. | +| `ExpressionPredicat(expression)` | Evaluates a custom boolean expression (via OBILang) using annotations and sequence metadata. | + +## Design Highlights + +- **Null-safe**: `nil` predicates are handled gracefully in compositions. +- **Extensible**: Custom predicates can be defined and combined seamlessly. +- **Logging & Safety**: Invalid regex patterns or expression syntax trigger fatal errors; runtime evaluation issues emit warnings. + +This framework enables powerful, declarative filtering pipelines for high-throughput sequencing data analysis. diff --git a/autodoc/docmd/pkg/obiseq/revcomp.md b/autodoc/docmd/pkg/obiseq/revcomp.md new file mode 100644 index 0000000..01b192e --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/revcomp.md @@ -0,0 +1,35 @@ +# BioSequence Reverse Complement Functionality + +This Go package (`obiseq`) provides utilities for computing the reverse complement of biological sequences (e.g., DNA), including support for quality scores and structured metadata. + +## Core Functions + +- **`nucComplement(n byte) byte`** + Returns the nucleotide complement using a lookup table (`_revcmpDNA`). Handles special cases: + - `.` / `-` → unchanged (gaps) + - `[`, `]` → swapped (`[` ↔ `]`) + - A–Z letters → complemented (case-insensitive via bitwise masking) + - Unknown characters → `'n'` + +- **`BioSequence.ReverseComplement(inplace bool) *BioSequence`** + Performs reverse complement on the sequence and (if present) its quality string: + - If `inplace = false`, a copy is made; original preserved. + - Reverses indices and complements each base using `nucComplement`. + - Also reverses the quality array symmetrically. + - Caches result in `sequence.revcomp` for reuse. 
+ +- **`BioSequence._revcmpMutation() *BioSequence`** + Adjusts mutation metadata (e.g., `"pairing_mismatches"`) to reflect the reversed-complement orientation: + - Reverses and complements symbolic mutation strings (e.g., `"A>T"` → `"T>A"`). + - Updates positional indices to match reversed sequence coordinates. + +- **`ReverseComplementWorker(inplace bool) SeqWorker`** + Returns a reusable `SeqWorker` function for batch processing: applies reverse complement to each sequence in a stream. + +## Design Notes + +- Uses ASCII bitwise tricks (`&31`, `|0x20`) for case-insensitive indexing and lowercase output. +- Supports non-standard symbols (e.g., IUPAC ambiguity codes via lookup table). +- Integrates quality scores and structured attributes seamlessly. + +> Ideal for NGS preprocessing pipelines where orientation matters (e.g., paired-end alignment, variant calling). diff --git a/autodoc/docmd/pkg/obiseq/revcomp_test.md b/autodoc/docmd/pkg/obiseq/revcomp_test.md new file mode 100644 index 0000000..163a31d --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/revcomp_test.md @@ -0,0 +1,19 @@ +## Semantic Description of `obiseq` Package Functionality + +The `obiseq` package provides core bioinformatics utilities for nucleic acid sequence manipulation in Go. It centers around two key operations: + +- **Nucleotide Complementation (`nucComplement`)** + Implements standard Watson-Crick base pairing rules: `A↔T`, `C↔G`. It also handles ambiguous or symbolic characters (e.g., `'n' → 'n'`, `'[ ↔ ]'`), preserving non-standard symbols like gaps (`'-'`) and missing data (`'.'`). This function serves as the atomic building block for reverse-complement logic. + +- **Reverse Complementation (`BioSequence.ReverseComplement`)** + A method on the `BioSequence` type that returns a new (or in-place modified) sequence representing: + - The *reverse* of the original nucleotide string, followed by + - Each base replaced with its complement (via `nucComplement`). 
+ + The method supports two modes: + - **Non-destructive (`inplace=false`)**: Returns a new `BioSequence`, leaving the original unchanged. + - **In-place (`inplace=true`)**: Modifies and returns the same object for memory efficiency. + + Crucially, it preserves associated quality scores (e.g., Phred-scaled sequencing qualities), reversing their order to match the reversed sequence—ensuring correctness in downstream analyses like alignment or variant calling. + +Tests validate both functions across edge cases: degenerate bases, ambiguous symbols, and quality-aware sequences—confirming robustness for typical NGS (Next-Generation Sequencing) workflows. diff --git a/autodoc/docmd/pkg/obiseq/subseq.md b/autodoc/docmd/pkg/obiseq/subseq.md new file mode 100644 index 0000000..224ca47 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/subseq.md @@ -0,0 +1,13 @@ +# `obiseq.Subsequence` Functionality Overview + +The `Subsequence()` method extracts a contiguous segment from a biological sequence (`BioSequence`), supporting both linear and circular topologies. + +- **Input validation**: Checks ensure `from < to` (unless circular), positions are non-negative, and bounds respect sequence length. +- **Circular handling**: Positions exceeding the sequence length wrap around using modular arithmetic; debug logs record corrections. +- **Linear extraction**: When `from < to`, it slices the underlying nucleotide/peptide sequence and, if present, its quality scores. +- **Circular extraction**: When `from > to`, it concatenates two linear segments: from `from` → end, and start → `to`. +- **Metadata preservation**: Quality scores (if available) and annotations are copied to the new subsequence. +- **ID formatting**: The resulting sequence ID is suffixed with `[from..to]` (1-based indexing). +- **Mutation tracking**: A private `_subseqMutation()` adjusts stored pairing mismatch positions by subtracting the extraction shift, ensuring coordinate consistency post-extraction. 
+ +This enables robust subsequence generation for genomic analysis workflows involving circular genomes (e.g., plasmids) or fragmented reads. diff --git a/autodoc/docmd/pkg/obiseq/subseq_test.md b/autodoc/docmd/pkg/obiseq/subseq_test.md new file mode 100644 index 0000000..ecd9667 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/subseq_test.md @@ -0,0 +1,29 @@ +# `obiseq` Package: Subsequence Extraction Functionality + +The `Subsequence()` method enables extraction of a contiguous segment from biological sequence data (`BioSequence`). It supports both linear and circular (wrapped) slicing. + +- **Input Parameters**: + - `from`, `to`: 0-based inclusive indices defining the slice range. + - `circular`: boolean flag enabling wrap-around when `from > to`. + +- **Behavior**: + - For linear (`circular = false`), `from ≤ to`, and indices within bounds `[0, len(seq))`. + - For circular (`circular = true`), allows wrap-around (e.g., `from=3, to=2` on a 4-mer yields indices `[3,0,1]`). + - Validates inputs: returns descriptive errors for: + - `from > to` (non-circular), + - out-of-bounds indices (`< 0` or `≥ length`), + - invalid ranges. + +- **Quality Support**: + - When sequence includes base quality scores (`BioSequenceWithQualities`), the method preserves corresponding sub-slice of `Quality[]`. + +- **Return Value**: + - Returns a new `BioSequence` (or subclass) instance containing the extracted subsequence and its optional qualities. + +- **Use Case**: + - Ideal for region-of-interest extraction (e.g., primer binding sites, domain segments), especially in circular genomes or plasmids. + +- **Testing**: + - Unit tests (`TestSubsequence`) cover valid/invalid inputs, circular/non-circular modes, and quality consistency. + +This functionality provides robust, semantics-aware slicing for biosequence manipulation in Go. 
diff --git a/autodoc/docmd/pkg/obiseq/taxonomy_classifier.md b/autodoc/docmd/pkg/obiseq/taxonomy_classifier.md new file mode 100644 index 0000000..92efb1f --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/taxonomy_classifier.md @@ -0,0 +1,26 @@ +# Taxonomic Classification via `TaxonomyClassifier` + +The `obiseq` package provides a taxonomic classification mechanism through the `TaxonomyClassifier` function. + +- **Purpose**: Constructs a reusable classifier for biological sequences based on taxonomic hierarchy. +- **Inputs**: + - `taxonomicRank`: Target rank (e.g., `"species"`, `"genus"`). + - `taxonomy`: Reference taxonomy (`*obitax.Taxonomy`), with fallback via `.OrDefault(true)`. + - `abortOnMissing`: Boolean flag to enforce strict taxon resolution. + +- **Core Logic**: + - For each sequence, retrieves its `Taxon`, then drills down to the requested rank using `.TaxonAtRank()`. + - If `abortOnMissing` is true, exits on failure to resolve the taxon or rank. + - Internally maps `*TaxNode`s to integer codes for efficient storage/comparison. + +- **Returned Object (`BioSequenceClassifier`)**: + - `Code(sequence) int`: Assigns a unique integer code to the taxonomic assignment of a sequence. + - `Value(code) string`: Returns the scientific name corresponding to a code. + - `Reset()`: Reinitializes internal mappings (useful for batch processing). + - `Clone() *BioSequenceClassifier`: Creates a fresh, identical classifier instance. + +- **Design Rationale**: + - Uses integer codes to avoid repeated string operations and enable fast indexing (e.g., for counting). + - Supports both strict (`abortOnMissing=true`) and lenient classification modes. + +This design enables scalable, efficient taxonomic profiling of sequencing datasets. 
diff --git a/autodoc/docmd/pkg/obiseq/taxonomy_lca.md b/autodoc/docmd/pkg/obiseq/taxonomy_lca.md new file mode 100644 index 0000000..343a6be --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/taxonomy_lca.md @@ -0,0 +1,22 @@ +# Taxonomic Analysis Functions in `obiseq` Package + +This module provides tools for assigning taxonomic labels to biological sequences using a reference taxonomy. + +- **`TaxonomicDistribution(taxonomy)`**: + Returns a map from taxonomic nodes to read counts, based on `taxid` annotations in the sequence metadata. It validates taxids against the taxonomy and enforces strict handling of aliases. + +- **`LCA(taxonomy, threshold)`**: + Computes the *Lowest Common Ancestor* (LCA) of all taxonomic assignments for a sequence, weighted by their abundances. + - Iteratively traverses upward from each taxon’s path in the taxonomy tree. + - At each level, computes the relative weight (`rmax`) of the most frequent taxon. + - Stops when `rmax < threshold`, returning: + • the LCA taxon, + • its confidence score (`rans`), and + • total read count used. + +- **`AddLCAWorker(...)`**: + Creates a `SeqWorker` function to annotate sequences with LCA results: + - Sets attributes like `_taxid`, `_name`, and `_error` (rounded to 3 decimals). + - Automatically appends `_taxid` if missing in `slot_name`. + +All functions integrate with the OBITools4 ecosystem, supporting robust taxonomic inference for metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obiseq/taxonomy_methods.md b/autodoc/docmd/pkg/obiseq/taxonomy_methods.md new file mode 100644 index 0000000..efac620 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/taxonomy_methods.md @@ -0,0 +1,41 @@ +# Taxonomic Annotation Features in `obiseq` Package + +This package provides semantic taxonomic annotation capabilities for biological sequences (`BioSequence`). It integrates with a taxonomy database to assign, retrieve, and manage taxonomic identifiers (taxids) and related metadata. 
+ +## Core Functions + +- **`Taxid()`**: Retrieves the taxonomic ID as a string (e.g., `"12345"` or `"NA"`), supporting multiple internal representations (`string`, `int`, `float64`). Returns `"NA"` if no taxid is set. + +- **`Taxon(taxonomy)`**: Returns the corresponding `*obitax.Taxon` object, or `nil` if taxid is `"NA"`. + +- **`SetTaxid(taxid, rank...)`**: Assigns a taxonomic ID to the sequence. Validates against default taxonomy; handles aliases and errors based on configuration flags (`FailOnTaxonomy`, `UpdateTaxid`). Optionally stores taxid under a custom rank (e.g., `"genus_taxid"`). + +- **`SetTaxon(taxon, rank...)`**: Assigns a `*obitax.Taxon` object directly; stores its string representation as taxid. + +## Rank-Specific Annotation + +- **`SetTaxonAtRank(taxonomy, rank)`**: Annotates the sequence with taxid and scientific name at a specified Linnaean rank (e.g., `"species"`, `"genus"`). Sets two attributes: `rank_taxid` and `rank_name`. Returns the taxon at that rank (or `nil`). + +- **Convenience wrappers**: + - `SetSpecies(...)` + - `SetGenus(...)` + - `SetFamily(...)` + All delegate to `SetTaxonAtRank`. + +## Taxonomic Path & Metadata + +- **`SetPath(taxonomy)`**: Computes and stores the full taxonomic lineage (from root to species) as a string slice under attribute `"taxonomic_path"`. + +- **`Path()`**: Retrieves the stored taxonomic path; recomputes it if missing and a default taxonomy exists. + +- **`SetScientificName(taxonomy)`**: Stores the sequence’s species-level scientific name under `"scientific_name"`. + +- **`SetTaxonomicRank(taxonomy)`**: Stores the taxon’s rank (e.g., `"species"`, `"genus"`) under `"taxonomic_rank"`. + +## Error Handling & Configuration + +- Uses `logrus` and custom logging (`obilog`) for warnings/errors. +- Behavior on taxonomy mismatches (e.g., unknown taxid, alias) is configurable via `obidefault` settings. +- Ensures type consistency: taxid must be string, int, or float; invalid types trigger fatal errors. 
+ +All methods are designed for seamless integration into bioinformatics pipelines, enabling robust taxonomic profiling of sequencing data. diff --git a/autodoc/docmd/pkg/obiseq/taxonomy_predicate.md b/autodoc/docmd/pkg/obiseq/taxonomy_predicate.md new file mode 100644 index 0000000..c7026e1 --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/taxonomy_predicate.md @@ -0,0 +1,20 @@ +# Semantic Description of `obiseq` Package Functionalities + +This Go package provides **sequence filtering predicates** for biological sequences, integrated with taxonomic validation and hierarchy analysis. + +- `IsAValidTaxon(taxonomy, ...bool) SequencePredicate`: + Returns a predicate that checks whether a sequence has an associated valid taxon in the given taxonomy. + Optionally supports *auto-correction* of outdated/incorrect `taxid` values to match the current taxonomy node. + +- `IsSubCladeOf(taxonomy, parent) SequencePredicate`: + Filters sequences whose taxonomic assignment is a descendant (sub-clade) of the specified `parent` taxon. + +- `IsSubCladeOfSlot(taxonomy, key) SequencePredicate`: + Enables filtering based on a *sequence attribute* (e.g., `"taxon"` or `"classification"`) that holds a taxonomic label. + Validates the label against the taxonomy, then checks if the sequence’s assigned taxon falls under it. + +- `HasRequiredRank(taxonomy, rank) SequencePredicate`: + Ensures the sequence’s taxon is assigned at or below a specified rank (e.g., `"species"`, `"genus"`). + Validates the requested `rank` against taxonomy’s rank list; exits on invalid input. + +All predicates follow a functional, composable design pattern (`SequencePredicate = func(*BioSequence) bool`), enabling flexible pipeline construction (e.g., filtering, classification validation). 
diff --git a/autodoc/docmd/pkg/obiseq/taxonomy_workers.md b/autodoc/docmd/pkg/obiseq/taxonomy_workers.md new file mode 100644 index 0000000..d129ebd --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/taxonomy_workers.md @@ -0,0 +1,22 @@ +# Taxonomic Annotation Workers in `obiseq` + +This Go package provides functional workers for annotating biological sequences with taxonomic information using a hierarchical taxonomy (e.g., from NCBI or UNITE). Each worker is implemented as a `SeqWorker`—a function that processes one sequence and returns an updated slice of sequences. + +- **`MakeSetTaxonAtRankWorker(taxonomy, rank)`**: + Assigns a taxonomic label at *a specific rank* (e.g., `"genus"`, `"family"`). Validates that the requested `rank` exists in the taxonomy before proceeding. + +- **`MakeSetSpeciesWorker(taxonomy)`**: + Annotates each sequence with its inferred species name using the provided taxonomy. + +- **`MakeSetGenusWorker(taxonomy)`**: + Adds genus-level taxonomic assignment to sequences. + +- **`MakeSetFamilyWorker(taxonomy)`**: + Adds family-level taxonomic assignment. + +- **`MakeSetPathWorker(taxonomy)`**: + Populates the full taxonomic path (e.g., `"Eukaryota;Metazoa;Chordata;..."`) for each sequence. + +All workers rely on methods of `BioSequence` (e.g., `.SetSpecies()`, `.SetPath()`), which internally use the `obitax.Taxonomy` object to resolve taxonomic IDs or names. Errors are logged via `logrus`; invalid ranks cause a fatal exit. + +These utilities support modular, pipeline-friendly taxonomic annotation—ideal for high-throughput metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obiseq/worker.md b/autodoc/docmd/pkg/obiseq/worker.md new file mode 100644 index 0000000..1f0848e --- /dev/null +++ b/autodoc/docmd/pkg/obiseq/worker.md @@ -0,0 +1,18 @@ +# Semantic Description of `obiseq` Package Functionalities + +The `obiseq` package provides composable, higher-order worker functions for processing biological sequence data in Go. 
It defines three core functional types: + +- `SeqAnnotator`: In-place annotation of a single sequence (e.g., adding metadata). +- `SeqWorker`: Processes one sequence and returns zero or more output sequences (1→N transformation). +- `SeqSliceWorker`: Processes a slice of sequences and returns another slice (bulk pipeline stage). + +Key utilities include: + +- **`NilSeqWorker`**: Identity worker—returns the input sequence unchanged. +- **`AnnotatorToSeqWorker`**: Converts an in-place annotator into a `SeqWorker`, preserving compatibility with pipeline interfaces. +- **`SeqToSliceWorker`**: Lifts a `SeqWorker` to operate on slices, with configurable error handling (`breakOnError`). Supports dynamic slice growth and logging via `obilog`. +- **`SeqToSliceFilterOnWorker`**: Filters sequences in a slice using a `SequencePredicate`, preserving order and avoiding unnecessary allocations. +- **`SeqToSliceConditionalWorker`**: Applies a `SeqWorker` only to sequences satisfying a predicate; others pass through unchanged. +- **`.ChainWorkers()`**: Method on `SeqWorker` to compose two workers sequentially (pipeline chaining), enabling modular, reusable workflows. + +All functions emphasize safety: errors are either propagated (`breakOnError = true`) or logged with warnings, ensuring robustness in large-scale sequence processing pipelines. diff --git a/autodoc/docmd/pkg/obistats/algo.md b/autodoc/docmd/pkg/obistats/algo.md new file mode 100644 index 0000000..2ab6a7a --- /dev/null +++ b/autodoc/docmd/pkg/obistats/algo.md @@ -0,0 +1,20 @@ +# `obistats` Package — Semantic Overview + +The `obistats` package provides lightweight, general-purpose numerical utilities in Go. It includes: + +- **Basic arithmetic helpers**: + - `maxint`, `minint`: return the maximum/minimum of two integers. + - `sumint(xs []int) int`: computes the sum over a slice of integers. 
+ +- **Root-finding via bisection**: + - `bisect(...)`: numerically finds a root of a real-valued function within `[low, high]`, using the classical bisection method. Returns `(root, success)`. + - Requires `f(low)` and `f(high)` to have opposite signs; panics otherwise. + +- **Boolean bisection**: + - `bisectBool(...)`: locates the transition point where a boolean function flips (e.g., threshold detection). Returns adjacent points `(x1, x2)` straddling the change. Panics if `f(low) == f(high)`. + +- **Series summation**: + - `series(...)`: computes the infinite sum ∑ₙ₌₀^∞ f(n) by iterating until convergence (i.e., `y == yp` within floating-point precision). + - *Note*: Fast but may suffer from rounding errors for slowly converging or oscillating series. + +All functions are designed for performance and simplicity, with no external dependencies beyond `fmt` (for error messages). The package is a stripped-down copy of internal utilities, likely used in performance-critical or statistical computations. diff --git a/autodoc/docmd/pkg/obistats/beta.md b/autodoc/docmd/pkg/obistats/beta.md new file mode 100644 index 0000000..0660a39 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/beta.md @@ -0,0 +1,33 @@ +# Statistical Functions in `obistats` Package + +This Go package provides high-precision statistical functions for probability distributions, particularly the **regularized incomplete beta function**, used in hypothesis testing and confidence interval calculations. + +## Core Functions + +- **`mathBeta(a, b)`** + Computes the *complete beta function* $ B(a,b) = \frac{\Gamma(a)\Gamma(b)}{\Gamma(a+b)} $ using logarithms of the gamma function (`math.Lgamma`) for numerical stability. + +- **`lgamma(x)`** + Wrapper around `math.Lgamma`, returning the natural logarithm of the absolute value of the gamma function. + +- **`mathBetaInc(x, a, b)`** + Computes the *regularized incomplete beta function* $ I_x(a,b) $. 
This is essential for computing cumulative distribution functions (CDFs) of the beta, F-, and t-distributions. + - Uses *continued fraction evaluation* (via `betacf`) for accuracy. + - Applies symmetry transformation ($ x \to 1-x $) when beneficial (per Numerical Recipes). + - Returns `NaN` for invalid inputs (`x < 0 || x > 1`). + +- **`betacf(x, a, b)`** + Implements the continued fraction expansion of $ I_x(a,b) $. + - Iteratively evaluates recurrence relations for even/odd terms. + - Uses `epsilon = 3e-14` and `maxIterations = 200` for convergence. + - Handles near-zero denominators via `raiseZero`. + +## Use Cases + +- Statistical hypothesis testing (e.g., Fisher’s exact test). +- Beta, binomial proportion confidence intervals. +- F-test and Student's t-distribution CDF computations. + +## Implementation Notes + +Based on *Numerical Recipes in C*, §6.4, with robustness enhancements for floating-point edge cases. diff --git a/autodoc/docmd/pkg/obistats/betabinom.md b/autodoc/docmd/pkg/obistats/betabinom.md new file mode 100644 index 0000000..7381cd8 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/betabinom.md @@ -0,0 +1,39 @@ +# Beta-Binomial Distribution Implementation in `obistats` + +This Go package provides a complete statistical implementation of the **Beta-Binomial distribution**, a compound discrete probability distribution where the success probability of a Binomial distribution follows a Beta distribution. + +## Core Features + +- **Struct Definition**: + `BetaBinomial` encapsulates the distribution parameters: number of trials (`N > 0`) and Beta shape parameters `Alpha` and `Beta`, both strictly positive. Optional random source (`Src`) supports reproducible sampling. + +- **Probability Mass Function (PMF)**: + - `LogProb(x)` computes the natural logarithm of the PMF at integer `x ∈ [0, N]`. + - `Prob(x)` returns the PMF value via exponentiation. 
+ +- **Cumulative Distribution Function (CDF)**: + - `LogCDF(x)` evaluates the log-CDF using an analytical expression involving: + - Log-binomial coefficient (`Lchoose`) + - Log-beta function (`mathext.Lbeta`) + - Generalized hypergeometric function `HypPFQ` (via `scientificgo.org/special`). + - `CDF(x)` returns the standard CDF as `exp(LogCDF(x))`. + +- **Statistical Moments**: + - Mean: $N \cdot \frac{\alpha}{\alpha + \beta}$ + - Variance: $N \cdot \frac{\alpha \beta (N + \alpha + \beta)}{(\alpha+\beta)^2 (\alpha+\beta+1)}$ + - Standard deviation: square root of variance. + +- **Mode**: + Returns the most probable count. Special cases handled: + - `NaN` if both $\alpha, \beta \leq 1$ + - $0$ if only $\alpha \leq 1$ + - $N$ if only $\beta \leq 1$ + +- **Utility Methods**: + - `LogCDFTable(x)` builds a cumulative log-probability table up to `x`, useful for fast lookup or numerical stability. + - `NumParameters()` returns the number of distribution parameters (3: $N$, $\alpha$, $\beta$). + +- **Input Validation**: + Panics on invalid parameters (non-positive `N`, $\alpha$, or $\beta$), ensuring correctness. + +This module supports high-precision statistical computations using specialized mathematical libraries (`gonum.org/v1/gonum/mathext`, `scientificgo.org/special`). diff --git a/autodoc/docmd/pkg/obistats/data.md b/autodoc/docmd/pkg/obistats/data.md new file mode 100644 index 0000000..9055b91 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/data.md @@ -0,0 +1,31 @@ +# `obistats` Package Overview + +The `obistats` package provides data structures and utilities for analyzing benchmark results in Go. It enables aggregation, statistical summarization, and comparison of performance metrics across multiple configurations. + +## Core Types + +- **`Collection`**: Holds benchmark results grouped by configuration, group label (e.g., parameter combinations), and metric unit. It tracks: + - Ordered lists of `Configs`, `Groups`, and `Units`. 
+ - A map from group names to ordered lists of benchmark functions (`Benchmarks`). + - `Metrics`, keyed by `(Config, Group, Benchmark, Unit)`. + - Optional parameters for significance testing (`DeltaTest`, `Alpha`), geometric mean inclusion, and result ordering/splitting. + +- **`Key`**: Uniquely identifies a metric for one benchmark run, combining configuration source (`Config`), group label (`Group`), benchmark name (sans `"Benchmark"` prefix), and unit. + +- **`Metrics`**: Stores raw (`Values`) and cleaned (`RValues`, with outliers removed via IQR) measurements, plus derived statistics: `Min`, `Mean`, and `Max`. + +## Key Functionality + +- **Statistical summarization**: + - Outlier removal using Tukey’s fences (Q1 − 1.5×IQR, Q3 + 1.5×IQR). + - Computation of min/mean/max over cleaned data. + +- **Formatting helpers**: + - `FormatMean()`: Returns formatted mean (e.g., scaled or raw). + - `FormatDiff()`: Computes and formats symmetric deviation as ±% (based on min/max vs. mean). + - `Format()`: Combines both into `"mean ±diff"` style. + +- **Dynamic collection building**: + - `addMetrics()` creates or retrieves metrics for a given key, while maintaining ordered lists of unique configs/groups/units and benchmarks-per-group. + +> ⚠️ *Note*: The file includes commented-out methods (`AddFile`, `AddData`, etc.) referencing an external `benchfmt` package—these are placeholders and not part of the active API in this excerpt. diff --git a/autodoc/docmd/pkg/obistats/delta.md b/autodoc/docmd/pkg/obistats/delta.md new file mode 100644 index 0000000..9183c78 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/delta.md @@ -0,0 +1,25 @@ +# Semantic Description of `obistats` Delta Testing Functionality + +This Go package (`obistats`) provides statistical tools for comparing performance metrics before and after code changes—typically used in benchmarking workflows. + +- **`DeltaTest` type**: A function signature for comparing two `*Metrics` instances (old vs. 
new), returning a *p*-value (`float64`) and an optional error. +- **Purpose**: Determine whether two sets of samples likely originate from the same underlying distribution (i.e., detect significant performance regressions/improvements). + +## Supported Tests + +- **`NoDeltaTest()`**: A no-op test returning `(-1, nil)`, indicating *no statistical comparison* is performed. +- **`TTest()`**: Performs a two-sample Welch’s *t*-test on `RValues`, assessing whether means differ significantly. +- **`UTest()`**: Applies the Mann–Whitney *U* test (non-parametric), comparing distributions without assuming normality. + +## Common Errors + +- `ErrSamplesEqual`: All samples in one or both groups are identical. +- `ErrSampleSize`: Insufficient data points for reliable testing (e.g., < 2). +- `ErrZeroVariance`: One sample set has zero variance (no spread), breaking test assumptions. +- `ErrMismatchedSamples`: Sample lengths differ (not used here but part of the broader API). + +## Design Rationale + +- Built on top of internal benchmarking infrastructure (see `github.com/golang-design/bench`). +- Designed for modularity: callers can plug in different statistical tests as needed. +- Returns *p*-values directly, enabling threshold-based decision logic (e.g., `if p < 0.05 → alert`). diff --git a/autodoc/docmd/pkg/obistats/kmeans.md b/autodoc/docmd/pkg/obistats/kmeans.md new file mode 100644 index 0000000..b0557e9 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/kmeans.md @@ -0,0 +1,34 @@ +# `obistats` Package: K-Means Clustering Implementation + +The `obistats` package provides a concurrent, type-generic implementation of the **K-means clustering algorithm** for numerical datasets. + +## Core Utilities +- `SquareDist` / `EuclideanDist`: Compute squared and Euclidean distances between vectors (generic over `float64` or `int`). +- `DefaultRG`: Returns a seeded random number generator (`*rand.Rand`) for reproducibility control. 
+ +## Data Structure +- `KmeansClustering`: Encapsulates dataset (`*obiutils.Matrix[float64]`), cluster assignments, centers, and metadata (sizes, distances to nearest center). +- Supports dynamic addition of clusters via `AddACenter()`. + +## Initialization & Management +- `MakeKmeansClustering`: Initializes the structure with data, number of clusters *k*, and RNG. +- `SetCenterTo`, `AddACenter`: Assign or grow centers; uses **k-means++**-inspired weighted sampling for new centers. +- `ResetEmptyCenters`: Reinitializes empty clusters using distance-weighted sampling. + +## Core Algorithm Steps +- `AssignToClass`: Parallel assignment of points to nearest centers (uses goroutines + mutex). +- `ComputeCenters`: Computes new cluster centroids *as the closest original data point* to the arithmetic mean (robust for non-Euclidean spaces). +- `Run`: Executes iterative refinement until convergence (`max_cycle` iterations or inertia drop ≤ threshold). + +## Accessors & Diagnostics +- `K()`, `N()`, `Dimension()`: Return number of clusters, dataset size, and feature dimension. +- `Inertia()`: Sum of squared distances to assigned centers (convergence metric). +- `Centers`, `Classes`, `Sizes`: Expose internal clustering state. + +## Design Highlights +- Fully concurrent (goroutine-based) for performance. +- Generic distance functions support both `int` and `float64`. +- Explicit handling of edge cases (empty clusters, convergence). +- Logging via `logrus` for debugging (`obilog.Warnf`). 
+ +> *Note: High-level wrapper functions (e.g., standalone `Kmeans`) are commented out but outline intended API usage.* diff --git a/autodoc/docmd/pkg/obistats/kolmogorovbeta.md b/autodoc/docmd/pkg/obistats/kolmogorovbeta.md new file mode 100644 index 0000000..72f3a79 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/kolmogorovbeta.md @@ -0,0 +1,26 @@ +# `BetaKolmogorovDist` Function — Semantic Description + +The `obistats.BetaKolmogorovDist` function computes a **goodness-of-fit statistic** between an empirical dataset and the *cumulative distribution* (CDF) of a **Beta probability distribution** with specified parameters `α` and `β`. It implements an adapted version of the **Kolmogorov–Smirnov (KS) test**, tailored for Beta-distributed theoretical models. + +### Key Functionalities: +- **Input**: + - `data []float64`: Empirical sample (assumed sorted if `preordered = true`). + - `alpha`, `beta float64`: Shape parameters of the target Beta distribution. +- **Processing**: + - If not pre-sorted, data is copied and sorted ascendingly. + - For each ordered sample point `v_i`, it accumulates the sum `s = Σ_{j≤i} v_j`. + - Evaluates: + `|CDF_Beta(s; α, β) − empirical CDF_i|`, where the *empirical* cumulative probability at rank `i` is approximated as `1/(i+1)` — a common Bayesian/maximum-likelihood estimator (e.g., median-rank). + - Returns the **supremum** of these absolute deviations (i.e., max distance across all points). + +### Interpretation: +- A **small value** indicates the empirical cumulative sums align closely with the theoretical Beta CDF. +- A **large value** suggests significant deviation — poor fit of a Beta(α,β) to the data. +- Unlike standard KS tests (which use `i/n`), this uses `1/(i+1)` — suitable for small samples or Bayesian contexts. + +### Dependencies: +- Uses `gonum.org/v1/gonum/stat/distuv.Beta` for CDF computation. +- Uses `gonum.org/v1/gonum/floats.Max` for distance extremal computation. +- `sort.Float64s` ensures ordered traversal. 
+ +> **Note**: The use of *cumulative sums* (`s`) rather than raw values is unconventional — possibly intended for data representing proportions or waiting times where the *integral* of observations matters. diff --git a/autodoc/docmd/pkg/obistats/mannwhitney.md b/autodoc/docmd/pkg/obistats/mannwhitney.md new file mode 100644 index 0000000..ae0f554 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/mannwhitney.md @@ -0,0 +1,37 @@ +# `obistats` Package: Mann-Whitney U-test Implementation + +The `obistats` package provides a **non-parametric statistical test** for comparing two independent samples: the **Mann–Whitney U-test**, also known as the Wilcoxon rank-sum test. + +## Core Functionality + +- **`MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis)`** + Performs the test between two samples `x1` and `x2`, under a user-specified alternative hypothesis (`LocationLess`, `LocationDiffers`, or `LocationGreater`). + +- Returns a structured result: + - Sample sizes (`N1`, `N2`) + - U statistic (with tie handling: ties contribute 0.5) + - Alternative hypothesis used (`AltHypothesis`) + - Achieved *p*-value (`P`) + +## Key Features + +- **Non-parametric**: No assumption of normality — suitable for ordinal data or non-Gaussian distributions. +- **Exact vs Approximate**: + - Uses *exact U distribution* for small samples (≤50 without ties, ≤25 with ties). + - Falls back to *normal approximation* for larger samples (with tie and continuity corrections). +- **Tie Handling**: + - Ranks averaged for tied values. + - Tie correction applied in variance estimation. +- **Error Handling**: Returns `ErrSampleSize` (empty input) or `ErrSamplesEqual` (all values identical). + +## Implementation Notes + +- Uses labeled merge to interleave sorted samples while preserving origin labels. +- Computes U via rank sums: `U1 = R1 − n₁(n₁+1)/2`. +- Supports one-tailed and two-tailed tests. +- Includes helper functions: `labeledMerge`, `tieCorrection`. 
+ +## References + +Mann & Whitney (1947); Klotz (1966). +Efficiency slightly lower than *t*-test on normal data, but more robust to outliers and distributional assumptions. diff --git a/autodoc/docmd/pkg/obistats/mathx.md b/autodoc/docmd/pkg/obistats/mathx.md new file mode 100644 index 0000000..8d5f227 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/mathx.md @@ -0,0 +1,27 @@ +# `obistats` Package: Semantic Overview + +The `obistats` package provides low-level statistical and combinatorial utilities in pure Go, focusing on numerical robustness and performance. + +- **Sign Function (`mathSign`)** + Returns the sign of a `float64`: `-1`, `0`, or `+1`. Handles NaN by returning NaN. + +- **Precomputed Factorials (`smallFact`)** + Precomputes factorials from `0!` to `20!` (fits in 64-bit signed integer), enabling fast exact binomial coefficient computation for small `n`. + +- **Binomial Coefficient (`mathChoose`)** + Computes $\binom{n}{k}$ efficiently: + - For `n ≤ 20`: uses integer arithmetic (multiplication + division) for exact results. + - For larger `n`: leverages logarithms via `mathLchoose` and exponentiates (`exp(log(Choose))`) to avoid overflow. + +- **Log-Binomial Coefficient (`mathLchoose`)** + Computes $\log \binom{n}{k}$ via the log-gamma function: + $$\log \binom{n}{k} = \ln\Gamma(n+1) - \ln\Gamma(k+1) - \ln\Gamma(n-k+1)$$ + Ensures numerical stability for large `n`, avoiding overflow/underflow. + +- **Internal Helper (`lchoose`)** + Core implementation of log-binomial using `math.Lgamma`, reused by both exact and large-scale paths. + +**Design Notes**: +- Prioritizes correctness (e.g., NaN propagation, edge-case handling). +- Balances speed and precision: exact integer arithmetic for small inputs; log-space computation for scalability. +- Mirrors functionality from an internal benchmarking module, adapted here as a standalone utility. 
diff --git a/autodoc/docmd/pkg/obistats/minmax.md b/autodoc/docmd/pkg/obistats/minmax.md new file mode 100644 index 0000000..865d4f3 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/minmax.md @@ -0,0 +1,29 @@ +# `obistats` Package — Core Statistical Functions + +The `obistats` package provides generic, type-safe implementations of fundamental descriptive statistics for numeric types in Go. + +## Key Functions + +- **`Max[T]()`** + Returns the maximum value in a slice of numeric types (`int`, `int8`–`64`, `float32/64`). + *Implementation*: Iterates once, tracking the largest element. + +- **`Min[T]()`** + Returns the minimum value in a slice of numeric types (including unsigned integers: `uint`, `uint8`–`64`). + *Implementation*: Single-pass scan, comparing each element to the current minimum. + +- **`Mode[T]()`** + Computes the *most frequent* value (mode) for signed integer types only (`int`, `int8`–`64`). + *Implementation*: Builds a frequency map, then selects the value with highest count. + +## Design Notes + +- **Generics**: All functions use Go type parameters (`[T ...]`) for compile-time safety and performance. +- **Type Scope**: + - `Max` supports signed integers + floats (no unsigned). + - `Min` includes all integer variants. + - `Mode` is restricted to signed integers (due to map key constraints and semantics). +- **Assumptions**: Input slices are non-empty; no explicit error handling for edge cases (e.g., empty input). +- **Use Case**: Lightweight, reusable utility functions suitable for statistical pipelines or exploratory data analysis. + +> ⚠️ *Note*: No mean, median, variance, or standard deviation functions are provided in this excerpt. 
diff --git a/autodoc/docmd/pkg/obistats/normaldist.md b/autodoc/docmd/pkg/obistats/normaldist.md new file mode 100644 index 0000000..ccf836f --- /dev/null +++ b/autodoc/docmd/pkg/obistats/normaldist.md @@ -0,0 +1,30 @@ +# `obistats` Package: Normal Distribution Utilities + +The `obistats` package provides a lightweight, efficient implementation of the **normal (Gaussian) distribution**, including core statistical operations. + +## Core Type +- `NormalDist`: Represents a normal distribution with parameters: + - `Mu` (mean) + - `Sigma` (standard deviation) + +## Predefined Constants +- `StdNormal`: A standard normal distribution (`Mu = 0`, `Sigma = 1`). +- `invSqrt2Pi`: Precomputed constant for performance optimization. + +## Key Methods +| Method | Description | +|--------|-------------| +| `PDF(x)` | Computes the **probability density function** at point `x`. | +| `pdfEach(xs [])` | Vectorized PDF evaluation over a slice of values (optimized for standard normal). | +| `CDF(x)` | Computes the **cumulative distribution function** at point `x` via error function (`erfc`). | +| `cdfEach(xs [])` | Vectorized CDF evaluation over a slice. | +| `InvCDF(p)` | Computes the **inverse CDF (quantile function)** using Acklam’s algorithm with refinement. Handles edge cases (`p = 0`, `1`) and numerical stability. | +| `Rand(r *rand.Rand)` | Generates a random sample from the distribution (uses Go’s built-in `NormFloat64`). | +| `Bounds()` | Returns a practical support interval: `[Mu − 3·Sigma, Mu + 3·Sigma]` (≈99.7% coverage). | + +## Implementation Notes +- Optimized paths for standard normal (`Mu = 0`, `Sigma = 1`) reduce computation cost. +- Uses Go’s standard math library (`math.Erfc`, `math.Log`, etc.). +- Designed for performance and numerical accuracy in statistical applications. 
+ +> *Note: Duplicates functionality from an internal module (`bench`), likely for reuse in public packages.* diff --git a/autodoc/docmd/pkg/obistats/random.md b/autodoc/docmd/pkg/obistats/random.md new file mode 100644 index 0000000..df63f03 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/random.md @@ -0,0 +1,31 @@ +# `obistats.SampleIntWithoutReplacement` — Semantic Description + +The function **`SampleIntWithoutReplacement(n, max int) []int`** implements a *random sampling without replacement* algorithm over the integer range `[0, max)`. + +## Core Purpose +Generates **`n` distinct integers**, uniformly at random and *without repetition*, from the interval `[0, max)`. + +## Algorithmic Strategy +Uses an **incremental reservoir-like mapping** (`draw map[int]int`) to maintain uniqueness: +- Iteratively draws `y = rand.Intn(max)` (i.e., uniform in `[0, max)`). +- If `y` is already present (`ok = true`), it retrieves and reuses the stored value (a *swap trick*). +- Then, `draw[y]` is set to the current upper bound (`max - 1`) and `max` decremented — effectively *removing* one value from the future draw space. +- This preserves uniformity while avoiding collisions, in **O(n)** time and memory. + +## Key Properties +- ✅ Guarantees uniqueness: no duplicates in the returned slice. +- ⚖️ Uniform distribution over all possible `n`-element subsets of `[0, max)`. +- 🧠 Space-efficient: uses a map (O(n)) instead of shuffling an array of size `max`. +- 🚀 Efficient for large `max` and moderate `n`, where full-shuffle methods would be wasteful. + +## Return Value +A slice of length `n`, containing the sampled integers (order is *not* sorted or deterministic — reflects insertion order in `draw`). + +## Typical Use Cases +- Random subset selection (e.g., cross-validation folds, bootstrapping indices). +- Shuffling without full permutation. +- Monte Carlo simulations requiring unique random IDs or positions. 
+ +## Limitations / Notes +- Assumes `0 ≤ n ≤ max`; behavior is undefined otherwise. +- Relies on the global `math/rand` source (not seeded here); users should call `rand.Seed()` if reproducibility is needed. diff --git a/autodoc/docmd/pkg/obistats/sample.md b/autodoc/docmd/pkg/obistats/sample.md new file mode 100644 index 0000000..cd5df47 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/sample.md @@ -0,0 +1,22 @@ +# `obistats` Package: Statistical Utilities for Weighted and Unweighted Samples + +The `obistats` package provides a suite of statistical functions for analyzing numeric samples, supporting both unweighted and weighted data. Its core abstraction is the `Sample` struct—encapsulating values (`Xs`), optional weights (`Weights`), and a `Sorted` flag for performance optimization. + +### Key Functionalities: + +- **Bounds**: Computes min/max efficiently—O(1) when sorted and unweighted; otherwise scans the data. +- **Aggregation**: `Sum()` computes weighted/unweighted sums via incremental accumulation; `Weight()` returns total weight (or count if unweighted). +- **Central Tendency**: + - `Mean()` uses incremental weighted mean for numerical stability. + - `GeoMean()` computes geometric means (requires positive values), also supporting weights. +- **Dispersion**: + - `Variance()` and `StdDev()` compute sample variance/standard deviation (unweighted only; weighted versions raise a panic—*TODO*). + - Based on Welford’s online algorithm for numerical robustness. +- **Order Statistics**: + - `Percentile(p)` implements Hyndman & Fan’s R8 interpolation method (default in many tools). Handles weights via linear scan; constant-time if sorted and unweighted. + - `IQR()` returns interquartile range (`P75 − P25`). +- **Utility Methods**: + - `Sort()` sorts in-place (stably for weighted samples) and updates the `Sorted` flag. + - `Copy()` creates a deep copy for independent manipulation. 
+ +Designed with performance in mind, the package exploits sorting and incremental algorithms to minimize numerical error and improve runtime—especially valuable for large or repeated analyses. All functions gracefully handle edge cases (empty samples, zero weights) by returning `NaN` or appropriate bounds. diff --git a/autodoc/docmd/pkg/obistats/scaler.md b/autodoc/docmd/pkg/obistats/scaler.md new file mode 100644 index 0000000..bf9da1f --- /dev/null +++ b/autodoc/docmd/pkg/obistats/scaler.md @@ -0,0 +1,23 @@ +# `obistats` Package: Semantic Description + +The `obistats` package provides utility functions for **formatting and scaling benchmark measurements** in Go, especially tailored for performance benchmarks (e.g., `go test -bench`). Its core component is the **`Scaler` type**, a function that converts raw numeric values into human-readable, unit-aware strings. + +- **`Scaler func(float64) string`**: A function type that formats a numeric measurement (e.g., time, memory usage, throughput) into an appropriately scaled and unit-annotated string. + +- **`NewScaler(val float64, unit string) Scaler`**: Dynamically selects the best scaling strategy based on: + - The measurement value (`val`) + - Its unit (e.g., `"ns/op"`, `"MB/s"`, `"B/op"`) + + It applies **SI prefixes** (`k`/`M`/`G`/`T`) with adaptive precision (0–2 decimal places) to ensure readability and consistency across table rows. + +- **`timeScaler(ns float64)`**: Specialized scaler for time-based units (`ns/op`, `ns/GC`). It selects optimal unit (s, ms, µs, ns) and precision based on magnitude. + +- **`hasBaseUnit(s, unit string) bool`**: Helper to detect if a full unit string (e.g., `"bytes/op"`, `"MB/s"`) includes or matches a base unit. 
+ +Key features: +- Supports common Go benchmark units: time (`ns/op`), memory (`B/op`, `bytes/op`), throughput (`MB/s`) +- Ensures consistent formatting across rows (e.g., all values in a row use same scale) +- Avoids unnecessary trailing zeros and uses SI conventions +- Designed for compatibility with internal benchmarking infrastructure (originally from `golang-design/bench`) + +Intended use: formatting tables of benchmark results where readability and unit consistency are critical. diff --git a/autodoc/docmd/pkg/obistats/sort.md b/autodoc/docmd/pkg/obistats/sort.md new file mode 100644 index 0000000..3057f55 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/sort.md @@ -0,0 +1,24 @@ +# `obistats` Package: Semantic Overview + +This Go package provides utilities for sorting benchmark result tables, derived from an internal module. It focuses on semantic ordering of performance data. + +## Core Concepts + +- **`Order` type**: A function signature defining custom sort logic for table rows (`func(t *Table, i, j int) bool`). +- **Predefined orders**: + - `ByName`: Sorts rows alphabetically by benchmark name. + - `ByDelta`: Orders rows based on magnitude of percentage change (`PctDelta`), adjusted by directionality via `Change`. +- **Helper functions**: + - `Reverse(order Order)`: Returns a new order that inverts the comparison result. +- **Core utility**: + - `Sort(t *Table, order Order)`: Performs an in-place stable sort of table rows using the provided ordering function. + +## Design Intent + +- Enables flexible, domain-aware sorting (e.g., by performance delta or name). +- Supports both ascending and descending sorts via `Reverse`. +- Uses stable sorting (`sort.SliceStable`) to preserve relative order of equal elements. + +## Use Case + +Ideal for benchmark comparison tools where users need intuitive, configurable table layouts—especially when analyzing performance regressions or improvements. 
diff --git a/autodoc/docmd/pkg/obistats/stats.md b/autodoc/docmd/pkg/obistats/stats.md new file mode 100644 index 0000000..876c76f --- /dev/null +++ b/autodoc/docmd/pkg/obistats/stats.md @@ -0,0 +1,25 @@ +# `obistats` Package — Semantic Overview + +The `*obistats*` Go package provides lightweight, type-generic statistical utilities for numerical data. + +## Core Functions + +- **`Median[T Number](data []T) float64`** + Computes the median of a slice. Internally copies and sorts input data to avoid mutation, handling both even- and odd-length slices correctly. Returns `0` for empty input. + +- **`Mean[T Number](data []T) float64`** + Calculates the arithmetic mean by summing all elements (converted to `float64`) and dividing by count. + +## Type Constraints + +- Uses Go generics (`constraints.Float | constraints.Integer`), enabling use with `int`, `float32`, `float64`, etc. + +## Design Notes + +- Non-mutating (`Median` works on a copy). +- Simple, efficient implementations—no external dependencies beyond `golang.org/x/exp/constraints` and `slices`. +- Focused on central tendency measures only—no variance, std dev, or distribution stats. + +## Use Case + +Ideal for small-to-medium numerical datasets where minimal dependencies and clarity are prioritized over advanced statistics. diff --git a/autodoc/docmd/pkg/obistats/table.md b/autodoc/docmd/pkg/obistats/table.md new file mode 100644 index 0000000..6d2d347 --- /dev/null +++ b/autodoc/docmd/pkg/obistats/table.md @@ -0,0 +1,31 @@ +# `obistats` Package: Benchmark Statistics and Comparison + +The `obistats` package provides semantic tools to analyze, compare, and display benchmark results—typically from Go’s `testing.B` benchmarks. It enables structured reporting of performance changes across configurations (e.g., before/after code modifications). + +### Core Concepts +- **`Collection`**: Aggregates benchmark metrics across groups, benchmarks, and configurations. 
+- **`Table` & `Row`**: Represent formatted tabular output for human-readable comparison (e.g., in CLI tools like `benchstat`). +- **Metrics per row**: Include mean, variance, sample size (`n`), and statistical test results. + +### Key Functionalities +- **Statistical summarization**: Computes means, variances, and other stats via `computeStats()`. +- **Delta comparison** (2-config mode): + - Performs statistical tests (`UTest` by default) to assess significance. + - Calculates percent change: `((new/old) − 1) × 100%`. + - Marks improvements (`+1`) or regressions (`−1`), respecting metric semantics (e.g., lower time/op is better; higher MB/s is better). +- **Handling edge cases**: + - Skips rows with missing data (e.g., one config absent). + - Notes issues: zero variance, insufficient samples, or identical values. +- **Geometric mean aggregation**: + - Adds a `[Geo mean]` row summarizing overall performance across benchmarks. + - Excludes zero-mean entries to avoid distortion (e.g., allocations of `0`). +- **Metric normalization**: + - Maps raw units (`ns/op`, `B/op`) to semantic names (e.g., `"time/op"`, `"alloc/op"`). + - Supports prefixed units (`foo-ns/op` → `foo-time/op`). + +### Output Customization +- Supports sorting via user-defined order (`c.Order`). +- Configurable significance level `α` (default: 0.05) for p-value filtering. +- Optional geomean inclusion (`c.AddGeoMean`). + +Designed for integration into benchmark analysis pipelines (e.g., CLI tools), `obistats` focuses on **semantic clarity**, **statistical rigor**, and **actionable insights**. 
diff --git a/autodoc/docmd/pkg/obistats/tdist.md b/autodoc/docmd/pkg/obistats/tdist.md new file mode 100644 index 0000000..8fb4aeb --- /dev/null +++ b/autodoc/docmd/pkg/obistats/tdist.md @@ -0,0 +1,30 @@ +# `obistats.TDist`: Student's *t*-Distribution Implementation + +This Go package provides a lightweight implementation of the **Student’s *t*-distribution**, commonly used in statistical inference (e.g., hypothesis testing, confidence intervals) when sample sizes are small or population variance is unknown. + +## Core Components + +- **`TDist` struct**: + Represents a *t*-distribution parameterized by degrees of freedom `V`. + +- **`PDF(x)` method**: + Computes the *probability density function* at point `x`, using: + $$ + f(x) = \frac{\Gamma\left(\frac{V+1}{2}\right)}{\sqrt{V\pi} \, \Gamma\left(\frac{V}{2}\right)} + \left(1 + \frac{x^2}{V} \right)^{-\frac{V+1}{2}} + $$ + Leverages `lgamma` for numerical stability in Gamma function evaluation. + +- **`CDF(x)` method**: + Computes the *cumulative distribution function*: + - Returns `0.5` at symmetry point (`x == 0`); + - Uses the **regularized incomplete beta function** `mathBetaInc` for `x > 0`; + - Exploits symmetry: `CDF(-x) = 1 − CDF(x)` for `x < 0`. + +- **`Bounds()` method**: + Returns a practical truncation interval `[-4, 4]`, sufficient for most visualizations or numerical integration over the central mass of the distribution. + +## Dependencies & Notes + +- Relies on standard library `math` and custom/internal helpers (`lgamma`, `mathBetaInc`) — likely from a shared internal module. +- Designed for performance and numerical robustness, suitable in statistical tooling or benchmark analysis (as suggested by the `obistats` package name and reference to a bench-related repo). 
diff --git a/autodoc/docmd/pkg/obistats/ttest.md b/autodoc/docmd/pkg/obistats/ttest.md new file mode 100644 index 0000000..8b2706c --- /dev/null +++ b/autodoc/docmd/pkg/obistats/ttest.md @@ -0,0 +1,37 @@ +# Statistical Hypothesis Testing Module (`obistats`) + +This Go package provides implementations of common **t-tests** for comparing sample means under different assumptions. It supports one- and two-sample tests, paired or unpaired designs. + +## Core Types + +- **`TTestResult`**: Encapsulates the outcome of a t-test, including: + - Sample sizes (`N1`, `N2`) + - Test statistic value (`T`) + - Degrees of freedom (`DoF`) + - Alternative hypothesis type (`AltHypothesis`: `LocationDiffers`, `LocationLess`, or `LocationGreater`) + - Computed *p*-value (`P`) + +- **`TTestSample` interface**: Requires methods `Weight()`, `Mean()`, and `Variance()` — enabling reuse with summary statistics. + +## Supported Tests + +1. **`TwoSampleTTest(x1, x2)`** + Standard Student’s *t*-test for two independent samples assuming **equal variances** and normality. + +2. **`TwoSampleWelchTTest(x1, x2)`** + Welch’s *t*-test for two independent samples **without equal-variance assumption**, using Satterthwaite approximation for degrees of freedom. + +3. **`PairedTTest(x1, x2)`** + Paired *t*-test for dependent samples (e.g., before/after), testing mean of differences against `μ0`. + +4. **`OneSampleTTest(x)`** + One-sample *t*-test comparing sample mean to a known population mean `μ0`. + +## Error Handling + +- Returns errors for invalid inputs: zero sample size (`ErrSampleSize`), zero variance (`ErrZeroVariance`), or mismatched paired sample lengths (`ErrMismatchedSamples`). + +## Implementation Notes + +- *p*-values are computed using the cumulative distribution function (CDF) of the Student’s *t*-distribution. +- Designed for statistical rigor and modularity, reusing internal utilities (e.g., `Mean`, `StdDev`) from a shared module. 
diff --git a/autodoc/docmd/pkg/obistats/udist.md b/autodoc/docmd/pkg/obistats/udist.md new file mode 100644 index 0000000..b576dfc --- /dev/null +++ b/autodoc/docmd/pkg/obistats/udist.md @@ -0,0 +1,39 @@ +# Mann-Whitney U Distribution Implementation in `obistats` + +The `obistats` package provides efficient computation of the **Mann-Whitney U distribution**, used in nonparametric hypothesis testing to compare two independent samples. + +## Core Types + +- **`UDist`**: Represents the discrete probability distribution of the U statistic for sample sizes `N1`, `N2`. It optionally handles **ties** via a tie-count vector `T`. + +## Key Features + +- ✅ **Exact distribution computation**, both with and without ties. + - *No ties*: Uses dynamic programming (Mann–Whitney recurrence) in `O(N1·N2·U)` time. + - *With ties*: Implements the linked-list-based algorithm from Cheung & Klotz (1997) via memoization (`makeUmemo`). + +- ✅ **PMF & CDF evaluation**: + - `PMF(U)` returns the probability mass at U. + - `CDF(U)` computes cumulative probabilities using symmetry to minimize computation. + +- ✅ **Support for tied ranks**: + - `T` encodes tie multiplicities per rank; if nil, no ties are assumed. + +- ✅ **Optimized recurrence**: + - Exploits symmetry (`p_{n,m} = p_{m,n}`) and incremental DP to reduce memory/time. + +- ✅ **Boundary handling**: + - `Bounds()` returns support `[0, N1·N2]`. + - `Step() = 0.5`, reflecting U’s discrete unit in tied cases. + +## Algorithm Notes + +- `p(U)` uses a 2D DP table (rows = *n*, columns = U), computing only necessary states. +- `makeUmemo` builds a 3D memoization table (`k`, `n1`, `2U`) for tied distributions. +- Performance bottlenecks noted in comments (e.g., map overhead) suggest future optimization paths. + +## Use Case + +Enables exact *p*-value calculation for the **Mann-Whitney U test**, especially valuable when: + - Sample sizes are small-to-moderate (exact methods needed). + - Data contain ties. 
diff --git a/autodoc/docmd/pkg/obistats/utils.md b/autodoc/docmd/pkg/obistats/utils.md new file mode 100644 index 0000000..581b9ea --- /dev/null +++ b/autodoc/docmd/pkg/obistats/utils.md @@ -0,0 +1,21 @@ +# Semantic Description of `obistats` Package + +The `obistats` package provides numerically stable statistical utilities for combinatorics and log-space arithmetic, primarily intended for use in bioinformatics or probabilistic modeling. + +- **`Lchoose(n, x int) float64`**: + Computes the natural logarithm of the binomial coefficient "n choose x" using the log-gamma function (`math.Lgamma`). This avoids overflow/underflow inherent in direct computation of large factorials. + +- **`Choose(n, x int) float64`**: + Returns the (floating-point approximation of the) binomial coefficient by exponentiating `Lchoose`. *Note*: The argument order in the implementation (`math.Exp(Lchoose(x,n))`) appears reversed—likely a typo; should be `Lchoose(n,x)`. + +- **`LogAddExp(x, y float64) float64`**: + Computes `log(exp(x) + exp(y))` in a numerically stable way. Uses the identity: + `log(eˣ + eʸ) = max(x, y) + log(1 + exp(-|x - y|))`, implemented via `math.Log1p` for precision near zero. + Handles NaNs/infinities with logging and fallback. + +All functions rely on `math` for core operations, and use Logrus (`log.Errorf`) to warn about invalid inputs (e.g., non-finite values). + +Use cases include: +- Exact p-value computation in overrepresentation tests (e.g., hypergeometric), +- Log-probability accumulation in hidden Markov models or Bayesian networks, +- Stable mixture model likelihood evaluations. 
diff --git a/autodoc/docmd/pkg/obisuffix/suffix_array.md b/autodoc/docmd/pkg/obisuffix/suffix_array.md new file mode 100644 index 0000000..f43de13 --- /dev/null +++ b/autodoc/docmd/pkg/obisuffix/suffix_array.md @@ -0,0 +1,23 @@ +# Suffix Array Implementation for Biological Sequences + +This Go package (`obisuffix`) provides a suffix array data structure tailored for biological sequence analysis. It supports efficient lexicographic sorting and common-prefix computation over all suffixes of a set of sequences. + +## Core Types + +- **`Suffix`**: Represents one suffix by storing the sequence index (`Idx`) and starting position (`Pos`). +- **`SuffixArray`**: Holds a collection of `Suffix`, the original sequences (`Sequences`), and cached common-prefix lengths (`Common`). + +## Key Functions + +- **`BuildSuffixArray(data)`**: Constructs a suffix array by enumerating *all* suffixes from all input sequences, then sorts them lexicographically using a custom comparator (`SuffixLess`). +- **`CommonSuffix()`**: Computes the length of shared prefix between each adjacent pair in the sorted suffix array (i.e., `LCP`-like values), caching results for reuse. +- **`String()`**: Returns a human-readable table with columns: `Common`, sequence index, position, and suffix string. + +## Semantic Features + +- **Lexicographic ordering**: Suffixes are sorted by their nucleotide/amino-acid content; ties break first by shorter length, then lower index, finally earlier position. +- **Efficiency**: Avoids redundant comparisons via memoization of `Common` values and stable sorting. +- **Biological relevance**: Designed for use with `obiseq.BioSequenceSlice`, supporting DNA, RNA, or protein sequences. +- **Transparency**: The `String()` method enables quick inspection of suffix relationships and overlaps. + +This structure is foundational for tasks like repeat detection, alignment-free comparison, or pattern mining in multi-sequence datasets. 
diff --git a/autodoc/docmd/pkg/obitable/table.md b/autodoc/docmd/pkg/obitable/table.md new file mode 100644 index 0000000..a1929ae --- /dev/null +++ b/autodoc/docmd/pkg/obitable/table.md @@ -0,0 +1,22 @@ +# `obitable`: Row-Oriented Data Table for Biological Sequences + +The `obitable` package provides a lightweight, row-oriented data table structure (`Table`) for managing biological sequence metadata in Go. + +- **Core Types**: + - `Header`: An ordered column list (alias for `stl4go.Ordered`). + - `Row`: A flexible map from column names to values (`map[string]interface{}`). + - `Table`: Holds schema info via `ColType` (column → Go type) and a slice of rows. + +- **Row Generators**: + - `RowFromMap`: Wraps a generic map into a callable row accessor, substituting missing keys with `navalue`. + - `RowFromBioSeq`: Specialized generator for `obiseq.BioSequence` objects, mapping standard fields (`id`, `sequence`, etc.) and annotations dynamically. + +- **Semantic Features**: + - Supports heterogeneous data types per column (via `reflect.Type`). + - Enables uniform access to sequence metadata and custom annotations. + - Designed for interoperability with `obiseq` (OBITools4’s biological sequence module). + - Facilitates lazy or on-demand row construction—ideal for streaming pipelines. + +- **Use Cases**: + - Converting sequence datasets into tabular formats (e.g., for export, filtering). + - Building intermediate representations in bioinformatics workflows. diff --git a/autodoc/docmd/pkg/obitax/default_taxonomy.md b/autodoc/docmd/pkg/obitax/default_taxonomy.md new file mode 100644 index 0000000..7ab754a --- /dev/null +++ b/autodoc/docmd/pkg/obitax/default_taxonomy.md @@ -0,0 +1,30 @@ +# ObiTax: Default Taxonomy Management + +This Go package (`obitax`) provides utilities for managing a **default taxonomy instance**, enabling centralized configuration and safe fallback behavior. 
+ +## Core Features + +- ✅ **Singleton-style default taxonomy**: A single global `Taxonomy` instance can be designated as *the* default via `.SetAsDefault()`. + +- ✅ **Thread-safe access**: Uses `sync.Mutex` (implicitly via package-level variable usage) to ensure safe concurrent writes when setting the default. + +- ✅ **Graceful fallback with `.OrDefault()`**: + - If a `Taxonomy` receiver is `nil`, the method automatically substitutes it with the default taxonomy. + - Supports optional panic on failure (`panicOnNil`) if no default is defined. + +- ✅ **Utility checks**: + - `HasDefaultTaxonomyDefined()` → returns whether a default is currently set. + - `DefaultTaxonomy()` → retrieves the current global instance (if any). + +## Design Intent + +- Promotes **configuration reuse** and reduces boilerplate in client code. +- Supports robustness: avoids nil dereferences by allowing fallback to a globally configured taxonomy. + +## Usage Pattern + +```go +tax := NewTaxonomy("my-tax") +tax.SetAsDefault() // Now all `nil` receivers will resolve to this instance +result := someNilTax.OrDefault(true) // Uses default; panics only if none exists +``` diff --git a/autodoc/docmd/pkg/obitax/filter_on_name.md b/autodoc/docmd/pkg/obitax/filter_on_name.md new file mode 100644 index 0000000..432be95 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/filter_on_name.md @@ -0,0 +1,28 @@ +# Semantic Description of `IFilterOnName` Functionality in the `obitax` Package + +The `IFilterOnName` method enables filtering taxonomic data (`Taxon`) instances by name, supporting both **exact** and **pattern-based matching**, with optional case-insensitive comparison. + +- Two overloaded versions exist: + - On `*Taxonomy`: delegates to its iterator. + - On `*ITaxon`: performs the actual filtering logic. + +- **Parameters**: + - `name` (`string`) – search term or regex pattern. + - `strict` (`bool`) — if true, performs exact name equality; otherwise treats `name` as a regex. 
+ - `ignoreCase` (`bool`) — when true, performs case-insensitive matching (applies to both modes). + +- **Core behavior**: + - Uses a `map` (`sentTaxa`) to avoid duplicate taxa (based on internal node ID). + - For `strict = true`: compares names using a dedicated equality method (`IsNameEqual`). + - For `strict = false`: compiles and applies a regex pattern (`regexp.MustCompile`) — prepends `(?i)` for case-insensitive matching. + - Filtering runs in a **goroutine**, streaming results into a new `ITaxon` iterator. + - Source channel is properly closed after iteration. + +- **Return value**: a new `*ITaxon` iterator containing only matching taxa — preserving immutability and enabling chaining. + +- **Use cases**: + - Find exact species names (e.g., *Homo sapiens*). + - Search using partial or regex patterns (e.g., `^Pan.*` for *Panthera* and related genera). + - Case-insensitive lookups (e.g., "homo sapiens", "HOMO SAPIENS"). + +The design emphasizes **efficiency**, **correctness** (deduplication), and **flexibility** in taxonomic querying. diff --git a/autodoc/docmd/pkg/obitax/filter_on_rank.md b/autodoc/docmd/pkg/obitax/filter_on_rank.md new file mode 100644 index 0000000..bbd4fe4 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/filter_on_rank.md @@ -0,0 +1,12 @@ +# Semantic Description of `IFilterOnTaxRank` Functionality in the *obitax* Package + +The `IFilterOnTaxRank` method enables semantic filtering of taxonomic data by rank (e.g., `"species"`, `"genus"`). It is implemented across multiple core types—`ITaxon`, `TaxonSet`, `TaxonSlice`, and `Taxonomy`—providing a unified interface for rank-based selection. + +- **Core behavior**: Returns an `*ITaxon` iterator containing only taxa whose node’s rank matches the input string. +- **Rank normalization**: Internally, it resolves the requested `rank` against a taxonomy’s internal rank map via `ptax.ranks.Innerize(rank)`, ensuring consistent mapping and case-insensitive or canonical representation handling. 
+- **Efficiency**: Reuses the resolved rank pointer (`prank`) across consecutive taxa from the same `Taxonomy`, avoiding redundant lookups. +- **Concurrency-safe iteration**: Uses a goroutine to stream filtered results into the new iterator’s channel (`newIter.source`), enabling lazy evaluation and memory-efficient processing of large datasets. +- **Polymorphic dispatch**: Overloaded methods on `TaxonSet`, `TaxonSlice`, and `Taxonomy` delegate to the base iterator implementation, preserving consistency across input types. +- **Non-destructive**: Does not mutate source collections; instead produces a new iterator, supporting functional-style chaining. + +This design supports scalable taxonomic querying in phylogenetic or biodiversity analysis pipelines, where filtering by hierarchical rank is essential. diff --git a/autodoc/docmd/pkg/obitax/filter_on_subclade_of.md b/autodoc/docmd/pkg/obitax/filter_on_subclade_of.md new file mode 100644 index 0000000..b7fe92e --- /dev/null +++ b/autodoc/docmd/pkg/obitax/filter_on_subclade_of.md @@ -0,0 +1,31 @@ +# Semantic Overview of `obitax` Filtering Functionalities + +The `obitax` package provides composable, iterator-based filtering methods for taxonomic data structures. All filters return lazy or buffered iterators (`*ITaxon`) enabling efficient, streaming-style traversal without materializing full collections. + +## Core Filtering Operation: `IFilterOnSubcladeOf` + +- **Purpose**: Filters elements belonging to a specific taxonomic subtree. +- **Behavior**: + - Accepts a `*Taxon` as reference root. + - Yields only taxa for which `IsSubCladeOf(taxon)` returns true (i.e., descendants of the given taxon). +- **Overloads**: + - On `*ITaxon`, `TaxonSet`, `TaxonSlice`, and `Taxonomy` — all delegate to the iterator variant. + - Ensures consistent interface across container types. + +## Composite Filtering: `IFilterBelongingSubclades` + +- **Purpose**: Filters taxa belonging to *any* of a set of specified subclade roots. 
+- **Behavior**: + - Accepts `*TaxonSet` of clades (roots). + - Uses optimized path for single-clade case: reuses `IFilterOnSubcladeOf`. + - For multiple clades, checks via `IsBelongingSubclades(clades)` in a goroutine. + - Returns original iterator unchanged if input set is empty. + +## Design Highlights + +- **Iterator-Centric**: All operations are defined on `ITaxon`, promoting chaining and lazy evaluation. +- **Concurrency Support**: Filtering uses goroutines with buffered channels (`source`), enabling asynchronous stream processing. +- **Type Abstraction**: Unified API across `TaxonSet`, `Slice`, and full `Taxonomy` via delegation. +- **Performance Consideration**: Special handling for single-clade case avoids unnecessary iteration overhead. + +These methods enable expressive, scalable taxonomic queries—ideal for phylogenetic analysis or biodiversity data pipelines. diff --git a/autodoc/docmd/pkg/obitax/inner.md b/autodoc/docmd/pkg/obitax/inner.md new file mode 100644 index 0000000..09fa4e3 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/inner.md @@ -0,0 +1,40 @@ +# `obitax` Package: String Interning with Thread-Safe Storage + +This Go package (`obitax`) provides a **thread-safe string interner**—a data structure that deduplicates identical strings by storing only one copy per unique value and returning shared references. + +## Core Components + +- **`InnerString` struct** + Holds: + - `index`: A map from string values to pointers (ensuring identity via pointer equality). + - `lock`: An embedded `sync.RWMutex` to guarantee safe concurrent access. + +- **Constructor: `NewInnerString()`** + Initializes an empty interner with a preallocated map. + +- **Method: `Innerize(value string) *string`** + - Stores a new unique value (after cloning via `strings.Clone`) if absent. + - Returns the pointer to either: + - The newly interned string, or + - An existing one (if already present). + - Ensures **no duplicate string data** is stored for equal values. 
+ - Fully thread-safe via write lock. + +- **Method: `Slice() []string`** + Returns a snapshot of all interned strings as a slice (copying values, not pointers). + - Not safe for concurrent writes during iteration. + - Suitable for inspection or debugging. + +## Semantic Use Cases + +- **Memory optimization**: Avoid repeated allocation of identical strings (e.g., in parsing, serialization). +- **Pointer-based identity checks**: Use `==` on returned pointers to test string equality efficiently. +- **Concurrent safety**: Designed for use in multi-goroutine environments (e.g., HTTP servers, pipelines). + +## Design Notes + +- Uses `strings.Clone()` to decouple interned strings from original input lifetimes. +- Interning is **append-only**—no removal mechanism provided (implied by semantics of a simple interner). +- Returns `*string` to enable fast equality comparisons and reduce memory footprint. + +> **Note**: This is a minimal, efficient interner—ideal for read-heavy or batched deduplication scenarios. diff --git a/autodoc/docmd/pkg/obitax/issuubcladeof.md b/autodoc/docmd/pkg/obitax/issuubcladeof.md new file mode 100644 index 0000000..4006002 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/issuubcladeof.md @@ -0,0 +1,19 @@ +# Semantic Description of `obitax` Taxonomic Functions + +The `obitax` package provides two core methods for hierarchical taxon relationship analysis: + +- **`IsSubCladeOf(parent *Taxon) bool`** + Determines whether the current taxon is a **descendant** (i.e., subclade) of a given parent taxon. + - Ensures both taxa belong to the *same taxonomy*—fails with a fatal log if not. + - Traverses upward via `taxon.IPath()` (iterative ancestor path) to check if any node matches the parent’s ID. + - Returns `true` iff a match is found, indicating lineage descent. + +- **`IsBelongingSubclades(clades *TaxonSet) bool`** + Checks whether the current taxon—or any of its **ancestors**—belongs to a specified set of clades (`TaxonSet`). 
+ - Starts by testing direct membership via `clades.Contains(taxon.Node.id)`. + - Walks upward through the hierarchy (`taxon = taxon.Parent()`) until either: + - A match is found, or + - The root is reached. + - Final check at the root ensures completeness (e.g., if only root belongs). + +Both functions support **robust phylogenetic queries**, enabling classification validation, filtering by clade membership, and hierarchical consistency checks in taxonomic trees. diff --git a/autodoc/docmd/pkg/obitax/iterator.md b/autodoc/docmd/pkg/obitax/iterator.md new file mode 100644 index 0000000..eecfb9f --- /dev/null +++ b/autodoc/docmd/pkg/obitax/iterator.md @@ -0,0 +1,31 @@ +# Semantic Description of `obitax` Package Functionalities + +The `obitax` package provides a robust iterator-based API for traversing taxonomic data structures in Go. Its core component is the `ITaxon` interface, which implements a lazy, concurrent-safe iterator over taxon instances (`*Taxon`). Key features include: + +- **Iterator Creation**: `ITaxon` can be instantiated via `NewITaxon()` or derived from collections: + - `TaxonSet.Iterator()`, `TaxonSlice.Iterator()` (sorted), and `Taxonomy.nodes.Iterator()` + - Goroutines feed taxa into a channel, enabling non-blocking iteration. + +- **Control Methods**: + - `Next()` advances to the next taxon, returning success/failure. + - `Get()` retrieves the current taxon (must follow a successful `Next`). + - `Finished()` checks if iteration is complete. + +- **Channel Management**: + - `Push(taxon)` sends a taxon into the iterator’s channel. + - `Close()` terminates iteration by closing the source channel. + +- **Iterator Composition**: + - `Split()`: creates a new iterator sharing the same source and termination status (useful for parallel consumption). + - `Concat(...)`: merges multiple iterators sequentially into one. + +- **Metadata Enrichment**: + - `AddMetadata(name, value)` wraps the iterator to inject metadata into each taxon via `SetMetadata`. 
+ +- **Subtree Traversal**: + - `ISubTaxonomy()` (on `*Taxon` or via `Taxonomy.ITaxon(taxid)`) performs a breadth-first traversal of descendant taxa, starting from the current taxon or given ID. It uses parent-child adjacency logic to expand the subtree incrementally. + +- **Consumption Utility**: + - `Consume()` exhausts an iterator without processing (e.g., for side-effect-only pipelines). + +All iterators are designed to be composable, memory-efficient (via channels), and safe for concurrent use. The package integrates with `obiutils` to manage pipeline registration/unregistration during subtree expansion. diff --git a/autodoc/docmd/pkg/obitax/lca.md b/autodoc/docmd/pkg/obitax/lca.md new file mode 100644 index 0000000..f403eb5 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/lca.md @@ -0,0 +1,31 @@ +# Semantic Description of `obitax.LCA()` Functionality + +The `LCA` method computes the **Lowest Common Ancestor (LCA)** of two taxonomic entities (`Taxon` instances) within a shared hierarchical taxonomy. + +- **Input**: A pointer to another `*Taxon` (`t2`) and the receiver taxon (`t1`). +- **Output**: A `*Taxon` representing their LCA, or an error detailing why computation failed. + +### Core Logic +- **Nil Safety**: Handles cases where one or both taxa are `nil`, returning the non-nil taxon (or an error if *both* are nil or lack internal `Node` references). +- **Validation Checks**: + - Ensures both taxa belong to the *same* `Taxonomy`. + - Verifies that the taxonomy is **rooted** (i.e., has a defined root node). +- **Path-Based Traversal**: + - Retrieves the full path from each taxon to the root via `Path()` (assumed to return an ordered list of nodes). + - Traverses both paths *backwards* (from root toward leaves) until divergence is detected. + - The first divergent node marks the boundary; the LCA is the last *common* ancestor (i.e., `slice[i+1]` after loop exit). 
+ +### Semantic Meaning +- The LCA represents the most specific taxonomic node that *contains both taxa* in its subtree. +- This operation is foundational for tasks like: + - Taxonomic classification consistency checks, + - Phylogenetic inference (e.g., computing taxon distances), + - Hierarchical aggregation in biodiversity analyses. + +### Error Handling +Explicit errors cover: +- Invalid inputs (`nil` taxa, missing nodes), +- Cross-taxonomy queries, +- Unrooted taxonomy (undefined root → no unique LCA possible). + +This implementation assumes a **directed acyclic graph** (specifically, a tree) structure for the taxonomy hierarchy. diff --git a/autodoc/docmd/pkg/obitax/string_parser.md b/autodoc/docmd/pkg/obitax/string_parser.md new file mode 100644 index 0000000..a1222c8 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/string_parser.md @@ -0,0 +1,41 @@ +# `obitax` Package: Taxon String Parser + +The `obitax` package provides a robust parser for structured taxonomic strings used in biodiversity data processing. + +## Core Functionality + +- **`ParseTaxonString(taxonStr string)`** + Parses strings in the format: `code:taxid [scientific name]@rank`. + +- **Input Format Requirements** + - `code`: Taxonomy identifier (e.g., "GBIF", "NCBI") + - `taxid`: Numeric or alphanumeric taxonomic ID (e.g., "123456") + - `scientific name`: Enclosed in square brackets (e.g., "[Homo sapiens]") + - `rank`: Optional taxonomic rank after `@` (e.g., "species", defaults to `"no rank"` if missing) + +- **Robustness Features** + - Trims whitespace around all components. + - Handles multiple `@` symbols (returns error). + - Validates bracket pairing and ordering. + - Ensures `code:taxid` contains exactly one colon separator. 
+ +- **Error Handling** + Returns descriptive errors for: + - Missing or malformed brackets + - Invalid number of `@` separators + - Absent colon in code:taxid segment + - Empty fields (code, taxid, or scientific name) + +- **Use Cases** + Ideal for parsing legacy biodiversity records (e.g., from OBIS, GBIF), where taxon strings are semi-structured and need reliable extraction before indexing or matching against reference databases. + +## Example + +Input: `"GBIF:248093 [Homo sapiens]@species"` +Output components: +- `code = "GBIF"` +- `taxid = "248093"` +- `scientificName = "Homo sapiens"` +- `rank = "species"` + +Returns empty strings and an error for invalid inputs. diff --git a/autodoc/docmd/pkg/obitax/taxid.md b/autodoc/docmd/pkg/obitax/taxid.md new file mode 100644 index 0000000..4d9e270 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/taxid.md @@ -0,0 +1,19 @@ +# `obitax` Package: Taxonomic Identifier Handling + +The `obitax` package provides a lightweight, type-safe abstraction for handling taxonomic identifiers (`Taxid`) in the OBITools4 ecosystem. + +- **`Taxid` type**: A pointer to a string, representing an opaque taxonomic ID (e.g., NCBI TaxID). +- **`TaxidFactory`**: A factory for constructing `Taxid`s from strings or integers, enforcing validation and normalization. + +Key features: +- **Code prefix enforcement**: `FromString` validates that the input string starts with a required taxonomy code (e.g., `"tx"`), returning an error otherwise. +- **String parsing**: Automatically strips leading whitespace and extracts the suffix after `':'`. +- **Alphabet filtering**: Uses an ASCII set to extract only valid characters (e.g., digits), ensuring clean, standardized IDs. +- **String interning**: Internally uses `Innerize` (via `InnerString`) to deduplicate strings—improving memory efficiency and comparison speed. +- **Type safety**: `Taxid` is a distinct type (not raw string), reducing misuse and enabling future extension. 
+ +Supported conversions: +- `FromString(string)`: Parses `"tx:12345"` → internalized `"12345"`. +- `FromInt(int)`: Converts e.g., `12345` → internalized `"12345"`. + +Designed for high-performance pipelines where many taxonomic IDs are processed and reused. diff --git a/autodoc/docmd/pkg/obitax/taxon.md b/autodoc/docmd/pkg/obitax/taxon.md new file mode 100644 index 0000000..832892b --- /dev/null +++ b/autodoc/docmd/pkg/obitax/taxon.md @@ -0,0 +1,29 @@ +# `obitax` Package: Taxonomic Data Model and Navigation + +The `obitax` package provides a semantic model for representing, querying, and manipulating taxonomic hierarchies in biodiversity data processing. Its core abstraction is the `Taxon` type, which encapsulates both structural (node ID, parent/child relationships) and semantic (scientific name, rank, metadata) information. + +### Core Features + +- **Taxon Representation**: Each `Taxon` links to a taxonomy and its underlying node, supporting multiple name classes (e.g., "scientific name", "common name"), customizable ranks, and extensible metadata via key-value pairs. +- **String Interoperability**: Implements `String()` for human-readable output (`taxonomy:taxid [name]`) and provides typed accessors like `ScientificName()`, `Rank()`, or `IsRoot()`. + +### Name Handling & Matching + +- Flexible name retrieval via `Name(class)`, case-insensitive equality (`IsNameEqual`), and regex-based matching (`IsNameMatching`). Names are interned for memory efficiency. + +### Hierarchical Navigation + +- **Path Traversal**: `IPath()` yields an iterator from current taxon up to root; `Path()` materializes this as a slice. Enables efficient lineage queries. +- **Rank-Based Lookup**: Methods like `TaxonAtRank(rank)`, or convenience wrappers (`Species()`, `Genus()`, `Family()`), allow targeted retrieval of higher-level ancestors. +- **Child Management**: Supports dynamic tree extension via `AddChild()`, parsing taxon strings and enforcing taxonomy consistency. 
+ +### Metadata Support + +- Rich metadata operations: `SetMetadata`, `GetMetadata`, key/value iteration, and typed conversion (`MetadataAsString`). Enables attaching arbitrary annotations (e.g., confidence scores, source references). + +### Robustness & Safety + +- Nil-safe accessors prevent panics; logging and error handling ensure correctness (e.g., fatal on missing root in `IPath()`). +- Interning of names/ranks/classes (`Innerize`) reduces duplication and speeds comparisons. + +Designed for scalability in large-scale metabarcoding pipelines, `obitax` bridges raw taxonomic data with high-level analytical operations. diff --git a/autodoc/docmd/pkg/obitax/taxonnode.md b/autodoc/docmd/pkg/obitax/taxonnode.md new file mode 100644 index 0000000..09ee758 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/taxonnode.md @@ -0,0 +1,36 @@ +# `obitax` Package: Taxonomic Node Representation and Management + +The `obitax` package provides a lightweight, pointer-based Go implementation for representing taxonomic nodes in biological classification systems. + +## Core Data Structure + +- **`TaxNode`**: Represents a single taxon (e.g., species, genus) with the following fields: + - `id`: Unique taxon identifier (pointer to string). + - `parent`: Identifier of the parent node in the taxonomy hierarchy. + - `rank`: Taxonomic rank (e.g., `"species"`, `"family"`). + - `scientificname`: Canonical scientific name (e.g., *Homo sapiens*). + - `alternatenames`: Map of alternative names keyed by name class (e.g., `"common_name"`, `"synonym"`). + +## Key Functionalities + +- **String Representation** + `String(taxonomyCode)` returns a formatted label like `"NCBI:12345 [Homo sapiens]@species"` (or raw ID if enabled via `obidefault.UseRawTaxids()`). + +- **Accessors** + - `Id()`, `ParentId()`: Retrieve identifiers. + - `ScientificName()` / `Rank()`: Return name or rank (defaulting to `"NA"` if missing). + - `Name(class)`: Fetch name by class (`"scientific name"` or alternate). 
+ +- **Mutators** + - `SetName(name, class)`: Assign scientific name or add/update alternate names. + +- **Name Matching & Validation** + - `IsNameEqual(name, ignoreCase)`: Exact or case-insensitive match against scientific/alternate names. + - `IsNameMatching(pattern)`: Regex-based pattern matching over all available names. + +## Design Notes + +- Uses pointers for optional fields (enables `nil` semantics). +- Graceful handling of missing data (`NA`, empty strings, safe dereferencing with `nil` checks). +- Integrates logging via Logrus (`log.Panic` on misuse, e.g., setting name of `nil` node). +- Designed for use in larger OBITools pipelines (e.g., with `obidefault` configuration). diff --git a/autodoc/docmd/pkg/obitax/taxonomy.md b/autodoc/docmd/pkg/obitax/taxonomy.md new file mode 100644 index 0000000..697d8bf --- /dev/null +++ b/autodoc/docmd/pkg/obitax/taxonomy.md @@ -0,0 +1,18 @@ +# `obitax` Package: Taxonomic Data Management + +The `obitax` package provides a robust framework for managing hierarchical taxonomic classifications. Its core component is the `Taxonomy` struct, which encapsulates metadata (name, code), taxon identifiers (`ids`, `ranks`), names and name classes (`names`, `nameclasses`), node hierarchy (`nodes`, `root`), indexing for fast lookup, and validation logic. + +## Key Functionalities + +- **Initialization**: `NewTaxonomy()` creates a new taxonomy with configurable identifier alphabet and initializes internal data structures. +- **Identifier Handling**: `Id()` validates and converts string-based taxon IDs to internal representations; `TaxidString()` retrieves formatted identifiers (e.g., `"code:id [name]"`). +- **Taxon Access**: `Taxon()` fetches a taxon by ID, returning whether it's an alias; `AsTaxonSet()` exposes the full taxonomic node collection. +- **Structure Management**: + - `AddTaxon()` inserts a new taxon with parent, rank, and root flags. + - `AddAlias()` maps alternative IDs to existing taxa (supporting replacement). 
+- **Metadata Queries**: Methods like `RankList()`, `Name()`, and `Code()` expose taxonomy metadata. +- **Root Control**: `SetRoot()`/`Root()` manage the root node; `HasRoot()` checks its presence. +- **Path Insertion**: `InsertPathString()` builds or extends a taxonomy from an ordered list of taxon strings, enforcing parent-child consistency. +- **Phylogenetic Export**: `AsPhyloTree()` converts the taxonomy into a phylogeny-compatible tree (`obiphylo.PhyloNode`), enabling downstream evolutionary analysis. + +All operations gracefully handle `nil` receivers via an internal `.OrDefault()` helper, ensuring safe usage in pipelines. Error reporting is explicit and contextualized (e.g., duplicate taxon, missing parent). diff --git a/autodoc/docmd/pkg/obitax/taxonset.md b/autodoc/docmd/pkg/obitax/taxonset.md new file mode 100644 index 0000000..13fcbb7 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/taxonset.md @@ -0,0 +1,24 @@ +# TaxonSet: Semantic Description of Functionality + +The `TaxonSet` type manages a collection of taxonomic entities within a hierarchical taxonomy system. It stores mappings from unique identifiers (pointers to strings) to `TaxNode` instances, supporting both canonical taxa and aliases. + +- **Construction**: Created via `(Taxonomy).NewTaxonSet()`, initializing an empty set and linking it to a specific taxonomy. + +- **Basic Queries**: + - `Get(id)`: Retrieves the corresponding taxon (or nil). + - `Len()`: Returns count of *unique* taxa, excluding aliases. + - `Contains(id)`, `IsATaxon(id)`, and `IsAlias(id)` enable precise taxon/alias distinction. + +- **Insertion & Management**: + - `Insert(node)`: Adds or updates a taxon node. + - `InsertTaxon(taxon)`: Safe insertion with taxonomy validation; auto-creates set if nil. + - `Alias(id, taxon)`: Registers an alias (non-canonical ID pointing to a real node), incrementing internal `nalias` counter. 
+ +- **Hierarchy & Iteration**: + - `Sort()`: Returns a topologically sorted slice of taxa (parents before children), respecting tree structure. + - `Taxonomy()`: Provides access to the parent taxonomy. + +- **Phylogenetic Export**: + - `AsPhyloTree(root)`: Converts the set into a rooted phylogenetic tree (`obiphylo.PhyloNode`), embedding taxon names, ranks, and parent relationships as node attributes. + +In essence, `TaxonSet` enables efficient storage, lookup, validation, and structural manipulation of taxonomic data—supporting both biological classification logic (e.g., alias resolution, hierarchy traversal) and downstream interoperability with phylogenetic tools. diff --git a/autodoc/docmd/pkg/obitax/taxonslice.md b/autodoc/docmd/pkg/obitax/taxonslice.md new file mode 100644 index 0000000..1be1938 --- /dev/null +++ b/autodoc/docmd/pkg/obitax/taxonslice.md @@ -0,0 +1,25 @@ +# `obitax` Package: Taxonomic Data Handling + +The `obitax` package provides structured support for managing collections of taxon nodes in a biological taxonomy. + +- **Core Type**: `TaxonSlice` encapsulates an ordered list of `*TaxNode`s and a reference to their parent `Taxonomy`. +- **Construction**: Created via `(taxonomy *Taxonomy).NewTaxonSlice(size, capacity)`, initializing a typed slice with optional pre-allocation. +- **Accessors**: + - `Get(i int) *TaxNode`: retrieves the raw node at index. + - `Taxon(i int) *Taxon`: wraps a node with its taxonomy context, enabling richer operations. + - `Len() int`: returns the current number of nodes. + +- **Mutation Methods**: + - `Set(index, taxon)`: replaces a node at given index (taxonomy-mismatch panics). + - `Push(taxon)`: appends a taxon to the end (also enforces taxonomy consistency). + - `ReduceToSize(n)`: truncates slice to first *n* elements. + +- **Utility Features**: + - `Reverse(inplace)`: reverses node order — either in-place or as a new slice. 
+ - `String() string`: formats the entire path as `"id@sci_name@rank"` entries, separated by `|`, in *reverse* (leaf-to-root) order — ideal for lineage strings. + +- **Safety & Semantics**: + - Nil-safety in all methods (returns `nil` or zero). + - Enforces taxonomy coherence: mixing taxa from different taxonomies triggers a panic. + +This package enables efficient, type-safe manipulation of hierarchical biological classification paths (e.g., for sequence annotation or metabarcoding output). diff --git a/autodoc/docmd/pkg/obitools/obiannotate/obiannotate.md b/autodoc/docmd/pkg/obitools/obiannotate/obiannotate.md new file mode 100644 index 0000000..34aef93 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiannotate/obiannotate.md @@ -0,0 +1,31 @@ +# Semantic Description of `obiannotate` Package + +The `obiannotate` package provides a suite of sequence annotation workers for processing biological sequences (e.g., FASTA/FASTQ) in the OBITools4 ecosystem. Each function returns an `obiseq.SeqWorker`, enabling functional composition and pipeline integration. + +- **Attribute Management**: + - `DeleteAttributesWorker`: removes specified annotation keys. + - `ToBeKeptAttributesWorker`: retains only user-specified attributes (others deleted). + - `ClearAllAttributesWorker`: strips all annotations. + - `RenameAttributeWorker`: renames annotation keys via a mapping dictionary. + +- **Sequence Editing**: + - `CutSequenceWorker`: extracts subsequence between positions (supports negative indexing); returns error or discards sequence on failure. + - `EvalAttributeWorker`: dynamically sets annotation fields using expression strings (via chaining with `EditAttributeWorker`). + - `AddSeqLengthWorker`: adds a `"seq_length"` annotation. + +- **Taxonomic Annotation**: + - `AddTaxonAtRankWorker`: annotates taxon at specified ranks (e.g., `"species"`). + - `AddTaxonRankWorker`: infers and sets taxonomic rank. + - `AddScientificNameWorker`: adds scientific name annotation. 
+ +- **Pattern Matching**: + - `MatchPatternWorker`: detects user-defined DNA patterns (with error tolerance, indels allowed optionally), annotating match location (`slot_location`), sequence (`slot_match`), and errors (`slot_error`). Supports both strands via reverse-complement search. + +- **CLI Integration**: + - `CLIAnnotationWorker`: constructs a composite worker based on command-line flags (e.g., pattern matching, taxonomic annotation, attribute filtering). + - `CLIAnnotationPipeline`: wraps the worker in a conditional pipeline (using selection predicates from `obigrep`) and parallelizes execution. + +- **Advanced Matching**: + - Uses Aho-Corasick automata (`obicorazick.AhoCorasickWorker`) for efficient multi-pattern matching. + +All workers are composable via `ChainWorkers`, enabling modular, declarative annotation pipelines for high-throughput sequence processing. diff --git a/autodoc/docmd/pkg/obitools/obiannotate/options.md b/autodoc/docmd/pkg/obitools/obiannotate/options.md new file mode 100644 index 0000000..7b0557d --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiannotate/options.md @@ -0,0 +1,37 @@ +# `obiannotate` Package: Semantic Description of Features + +The `obiannotate` package provides a rich set of command-line options for annotating, transforming, and filtering biological sequence records (e.g., FASTA/FASTQ). It integrates with `obiconvert` and `obigrep`, extending functionality via structured metadata manipulation. + +## Core Annotation Features +- **Metadata Clearing**: `--clear` removes all existing attributes. +- **Sequence Metadata Injection**: + - `--length`: Adds a `seq_length` attribute. + - `--number`: Assigns an ordinal index (`seq_number`) starting at 1. +- **Taxonomic Annotation**: + - `--taxonomic-path`: Adds full lineage path (e.g., "cellular organisms; Bacteria; ..."). + - `--taxonomic-rank`: Adds taxonomic rank (e.g., "species", "genus"). + - `--scientific-name`: Adds the scientific name (e.g., *Homo sapiens*). 
+ - `--with-taxon-at-rank RANK`: Extracts and adds taxon at a specific rank (e.g., `--with-taxon-at-rank species`). + - `--add-lca-in SLOT`: Computes and injects the Lowest Common Ancestor (LCA) taxid into a named slot, with tolerance via `--lca-error`. + +## Pattern & Sequence Manipulation +- **Pattern Matching** (`--pattern`, `--aho-corasick`): + - Simple regex-like pattern matching with error reporting (`pattern_match`, `pattern_error` slots). + - Efficient multi-pattern search using Aho-Corasick automaton (file-based input). +- **Sequence Editing**: + - `--cut start:end`: Trims sequence to specified positions (1-based; supports open-ended via empty bounds). + - `--set-identifier EXPRESSION`: Dynamically assigns new IDs using Python-like expressions. + +## Attribute Management +- **Rename/Delete/Keep**: + - `--rename-tag NEW=OLD`: Renames attributes (skips records if OLD is missing). + - `--delete-tag KEY`: Removes specified attribute(s) (skips if absent). + - `--keep KEY` (`-k`): Retains only specified attributes. +- **Dynamic Attribute Creation**: + - `--set-tag KEY=EXPRESSION` (`-S`): Computes new attributes from expressions (e.g., `tag1=seq_length > 200`). + +## Utility & Validation +- Helper functions expose internal state (e.g., `CLIHasPattern()`, `CLICut()`). +- Robust parsing with logging and error handling (e.g., invalid cut format triggers fatal exit). + +This package enables flexible, scriptable annotation workflows for high-throughput sequencing data in the OBITools4 ecosystem. diff --git a/autodoc/docmd/pkg/obitools/obiclean/chimera.md b/autodoc/docmd/pkg/obitools/obiclean/chimera.md new file mode 100644 index 0000000..528e4bb --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiclean/chimera.md @@ -0,0 +1,30 @@ +## Chimera Detection Module (`obiclean`) + +This Go package implements a chimera detection algorithm for amplicon sequencing data, specifically designed to handle IUPAC ambiguity codes. 
It identifies chimeric sequences—artifacts formed during PCR when incomplete extensions anneal to non-homologous templates in subsequent cycles. + +### Core Functions + +- **`commonPrefix(a, b)`**: Computes the length of the longest shared prefix between two `BioSequence`s using IUPAC-compliant nucleotide comparison. +- **`commonSuffix(a, b)`**: Computes the length of the longest shared suffix analogously. +- **`oneDifference(s1, s2)`**: Efficiently checks if two sequences differ by exactly one edit operation (substitution, insertion, or deletion), enabling early filtering of near-identical candidates. + +### Chimera Annotation Pipeline + +The main function `AnnotateChimera(samples)` processes a map of PCR amplicon groups (`map[string][]*seqPCR`): + +1. **Filtering**: Retains only *head sequences* (those with no incoming edges), assumed to be consensus or representative variants. +2. **Sorting**: Sequences are ordered by increasing abundance (`Weight`) to prioritize rare sequences as potential chimeras. +3. **Parent Search**: For each candidate chimera `s`, it scans all more abundant sequences (`pcrs[j].Weight > s.Weight`) for parental signatures: + - Skips pairs differing by only one edit (likely sequencing errors). + - Tracks the longest common prefix (`nameLeft`, `maxLeft`) and suffix (`nameRight`, `maxRight`). +4. **Chimera Decision Rule**: A sequence is flagged as chimeric if: + - `maxLeft + maxRight ≥ L` (the sum covers the full length), + - and it is *not fully contained* within a single parent (`maxRight < L`). +5. **Annotation**: The result is stored in the sequence’s `"chimera"` annotation as a structured string: + `{parent_left}/{parent_right}@({overlap})(start)(end)(len)`. + +### Design Notes + +- Handles IUPAC nucleotide codes via `obiseq.SameIUPACNuc`. +- Uses efficient in-place sequence slicing and string comparison. +- Integrates with `obitools4`’s data model (`BioSequence`, annotations). 
diff --git a/autodoc/docmd/pkg/obitools/obiclean/graph.md b/autodoc/docmd/pkg/obitools/obiclean/graph.md new file mode 100644 index 0000000..16164d8 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiclean/graph.md @@ -0,0 +1,41 @@ +# Obiclean: Graph-Based Error Correction for PCR Amplified Sequences + +Obiclean is a Go package implementing an error-correction pipeline for amplicon sequencing data (e.g., metabarcoding), built around a directed similarity graph. It identifies and filters sequencing errors by leveraging abundance-weighted relationships between sequences. + +## Core Data Structures + +- `Ratio`: Stores statistical metrics for nucleotide substitutions (e.g., original/mutant counts, positions), used later to estimate empirical transition probabilities. +- `Edge`: Represents a directed link between two sequences (father → son), encoding Hamming distance (`Dist`), position, and nucleotide change. + +## Graph Construction + +- **One-error graph**: `buildSamplePairs()` compares all sequence pairs within a sample, adding edges only when the father has higher abundance and differs by exactly one mismatch (`obialign.D1Or0`). Parallelized via worker goroutines. +- **Multi-error extension**: `extendSimilarityGraph()` fills in longer-distance edges using a fast LCS-based alignment (`FastLCSScore`) with tolerance up to `maxError`. +- **Sorting**: Sequences are pre-sorted by ascending count (`sortSamples`) to ensure correct parent–child ordering. + +## Graph Refinement & Reweighting + +- **Reweighting**: `reweightSequences()` redistributes counts upward along edges, mimicking a probabilistic correction model where sons "donate" weight to fathers proportionally to their abundance. +- **Edge filtering**: `FilterGraphOnRatio()` removes edges where the weight ratio violates a power-law decay model (`weight_ratio < ratio^distance`), suppressing spurious long-distance links. 
+ +## Output Generation + +- **CSV export**: `EmpiricalDistCsv()` writes substitution statistics (e.g., A→C transitions) to a compressed CSV file, grouped by nucleotide pair code (`nucPair`/`intToNucPair`). +- **GML visualization**: `SaveGMLGraphs()` generates per-sample graph files in GML format, with node shapes (circle/rectangle) and colors encoding abundance thresholds (`statThreshold`). + +## Status Classification + +- `ObicleanStatus()` labels each sequence as: + - `"s"` (singleton): no incoming or outgoing edges. + - `"h"` (hub): has sons but no outgoing edges → likely erroneous ancestor of correct variants. + - `"i"` (internal): has both incoming and outgoing edges → likely intermediate error. + +## Statistical Estimation + +- `EstimateRatio()` collects substitution events with distance = 1 and sufficient father weight (`minStatRatio`) into `[][]Ratio`, enabling downstream modeling of transition biases. + +## Parallelism & UX + +- Uses goroutines + channels for scalable pairwise comparisons. +- Integrates `progressbar` and Logrus logging to provide real-time progress feedback during heavy computations. + diff --git a/autodoc/docmd/pkg/obitools/obiclean/obiclean.md b/autodoc/docmd/pkg/obitools/obiclean/obiclean.md new file mode 100644 index 0000000..8063a8f --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiclean/obiclean.md @@ -0,0 +1,48 @@ +# `obiclean` Package Functional Overview + +The `obiclean` package implements a pipeline for cleaning and annotating high-throughput sequencing data, particularly focused on PCR amplicon error correction and chimera removal. + +## Core Data Structures + +- `seqPCR`: Represents a sequence within one PCR/sample, tracking: + - raw read count (`Count`) +- post-clustering abundance weight (`Weight`) +- sequence pointer, edges to parent/child variants (for mutation graph), and cluster membership. + +## Key Functionalities + +### 1. 
**Sample-wise Aggregation** +- `buildSamples`: Distributes sequences across samples using metadata tags, storing per-sample counts. + +### 2. **Graph Construction & Filtering** +- `BuildSeqGraph`: Builds a mutation graph (edges = point mutations) across samples. +- `FilterGraphOnRatio`: Removes low-abundance variants based on abundance ratio thresholds. + +### 3. **Annotation & Status Assignment** +- `annotateOBIClean`: Adds per-sequence annotations: + - `"obiclean_head"`: Boolean indicating if sequence is a cluster head (i.e., not derived from another). + - `"obiclean_singletoncount"`, `"internalcount"`, `"headcount"`: Counts of sequences in each status category. +- `Status`/`Weight`: Getter/setter functions for sample-specific annotations (`obiclean_status`, `obiclean_weight`). + +### 4. **Mutation & Cluster Tracking** +- `GetMutation`: Retrieves or initializes mutation map (e.g., `"A->T@42"`). +- `Mutation`: Populates mutation annotations based on graph edges. +- `GetCluster`/`Status`: Manage per-sample cluster membership and status labels (`h`=head, `i`=internal node, `s`=singleton). + +### 5. **Filtering & Output** +- CLI-driven filtering options: + - `OnlyHead`: Keep only cluster heads. + - `NotAlwaysChimera`: Exclude sequences flagged as chimera in *all* samples. + - `MinSampleCount`: Retain only sequences appearing ≥ N times across samples. + +### 6. **Optional Outputs** +- `AnnotateChimera`: Adds chimera flags (if enabled). +- Graph export to GML files (`SaveGMLGraphs`), ratio tables, and empirical distribution CSV. + +## Design Highlights + +- Batch processing with progress bars. +- Extensive use of sequence annotations (not in-place modification). +- Flexible type coercion for annotation values (`interface{}` → typed maps). + +This module is part of the OBITools4 ecosystem for NGS data processing. 
diff --git a/autodoc/docmd/pkg/obitools/obiclean/options.md b/autodoc/docmd/pkg/obitools/obiclean/options.md new file mode 100644 index 0000000..14b45c7 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiclean/options.md @@ -0,0 +1,27 @@ +# `obiclean` Package Overview + +The `obiclean` package implements a sequence clustering and error-correction module for high-throughput sequencing data, primarily aimed at removing PCR/sequencing errors and detecting chimeric reads. + +## Core Functionality + +- **Error Correction via Abundance Thresholding**: Sequences below a minimum abundance ratio (`--ratio`) relative to more abundant variants are treated as errors. +- **Distance-Based Clustering**: Sequences differing by ≤ `--distance` nucleotides may be grouped as error variants of a consensus (head) sequence. +- **Sample Filtering**: `--min-sample-count` enforces that a sequence appears in at least *N* samples before inclusion. +- **Head Selection**: The `--head` flag restricts output to sequences marked as "heads" (i.e., representative consensus) in ≥1 sample. +- **Chimera Detection**: Optional `--detect-chimera` flag enables chimera identification using abundance and graph topology heuristics. + +## Advanced Features + +- **Graph Export**: `--save-graph` writes the underlying DAG-based clustering structure in GraphML format for inspection or debugging. +- **Ratio Logging**: `--save-ratio` exports edge abundance ratios (used for error vs. variant decisions) in CSV format. +- **Mutation Rate Calibration**: `--min-eval-rate` sets the minimum read count required before estimating sequencing error/mutation rates. + +## Integration + +- Extends `obiconvert` input/output options, supporting standard FASTA/FASTQ formats and metadata handling. +- Uses the `sample` attribute (configurable via `-s`) to associate sequences with biological samples. + +## Design Notes + +- Clustering mode (`--cluster`, currently commented out) would annotate sequences with true cluster membership. 
+- Default thresholds prioritize sensitivity: `distance=1`, `ratio=1.0` (i.e., any less-abundant sequence is considered an error), `min-sample=1`. diff --git a/autodoc/docmd/pkg/obitools/obicleandb/obicleandb.md b/autodoc/docmd/pkg/obitools/obicleandb/obicleandb.md new file mode 100644 index 0000000..71a1581 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicleandb/obicleandb.md @@ -0,0 +1,45 @@ +# `obicleandb` Package Overview + +The `obicleandb` package provides semantic sequence curation and trust scoring for biological sequences (e.g., DNA barcodes) within the OBITools4 framework. It integrates taxonomic, alignment-based, and statistical methods to assess sequence reliability. + +## Core Functionalities + +1. **Taxonomic Filtering & Dereplication** + - Input sequences are first dereplicated (collapsed by identity) under taxonomic constraints (`taxid`), ensuring only unique sequences per taxon are retained. + - Sequences must meet minimum taxonomy requirements (species, genus, family) or CLI-specified ranks. + +2. **Taxonomic Annotation** + - Sequences are annotated with species, genus, and family tax IDs using the default taxonomy. + +3. **Trust Scoring via Statistical Testing** + - Two complementary trust mechanisms are implemented: + - `SequenceTrust`: Assigns a *local* confidence score based on sequence count (`1 − 1/(n+1)`), treating duplicates as evidence of reliability. + - `SequenceTrustSlice`: Computes pairwise alignment-based distances (LCSS score ratio), then derives a *global* trust metric using median-normalized scores and effective group size estimation. + +4. **Family/Genus-Level Discrimination (Mann–Whitney U Test)** + - `MakeSequenceFamilyGenusWorker` evaluates whether a query sequence is significantly closer (in alignment score) to conspecifics than to outgroup sequences. + - Compares intra-genus/family distances vs. inter-family distances using fast LCS-based alignment (`obialign.FastLCSScore`). 
+ - Returns a *p*-value stored in `obicleandb_trusted`, indicating confidence that the sequence belongs to its assigned higher-rank taxon. + +5. **Efficient Pairwise Distance Computation** + - `diagCoord` implements a compact triangular matrix indexing scheme to store only upper-triangle distances, minimizing memory usage. + +6. **Pipeline Integration** + - `ICleanDB` orchestrates the full workflow: filtering → dereplication → annotation → trust scoring, returning a cleaned and trusted sequence iterator. + +## Key Attributes Set + +| Attribute | Meaning | +|----------|---------| +| `obicleandb_trusted` | Final confidence score (probability of correct taxonomic assignment) | +| `obicleandb_trusted_on` | Effective sample size used for scoring (e.g., weighted group count) | +| `obicleandb_level` | Taxonomic level used for comparison (`genus`, `family`, or `none`) | +| `obicleandb_median` | Median pairwise distance used as baseline for normalization | + +## Design Principles + +- **Parallelism**: Leverages batched, parallel workers via `obidefault` settings. +- **Modularity**: Workers are composable and reusable (e.g., `MakeSequenceFamilyGenusWorker`). +- **Robustness**: Handles edge cases (e.g., small sample sizes, missing taxonomy) gracefully. + +This package supports high-throughput DNA metabarcoding pipelines by rigorously filtering and scoring sequences before downstream analysis (e.g., OTU clustering, diversity estimation). diff --git a/autodoc/docmd/pkg/obitools/obicleandb/options.md b/autodoc/docmd/pkg/obitools/obicleandb/options.md new file mode 100644 index 0000000..a84a0a3 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicleandb/options.md @@ -0,0 +1,18 @@ +# `obicleandb` Package Overview + +The `obicleandb` package provides a modular command-line interface for filtering and converting biological sequence data using taxonomic criteria. It integrates core utilities from the OBITools4 suite to support reproducible, taxonomy-aware data curation. 
+ +## Core Functionalities + +- **Taxonomy Loading**: Enables loading of reference taxonomies (e.g., NCBI, SILVA) via `obioptions.LoadTaxonomyOptionSet`, supporting hierarchical filtering and taxonomic assignment. +- **Input Handling**: Leverages `obiconvert.InputOptionSet` to accept diverse input formats (FASTA, FASTQ, etc.), with automatic format detection and streaming support. +- **Output Generation**: Uses `obiconvert.OutputOptionSet` to produce standardized outputs (e.g., FASTA/FASTQ), with configurable compression and splitting options. +- **Taxonomic Filtering**: Applies `obigrep.TaxonomySelectionOptionSet` to include/exclude sequences based on taxonomic lineage (e.g., `--include-family "Lactobacillaceae"`), enabling precise biological subset extraction. + +## Design Principles + +- **Composability**: Options are modular and reusable across tools via shared option sets. +- **Extensibility**: New input/output formats or filters can be added without modifying core logic. +- **CLI Consistency**: Aligns with standard `getoptions` conventions for intuitive usage. + +This package serves as a foundational building block for clean, taxonomically curated amplicon or metagenomic datasets. diff --git a/autodoc/docmd/pkg/obitools/obiclust/obiclust.md b/autodoc/docmd/pkg/obitools/obiclust/obiclust.md new file mode 100644 index 0000000..161f0d2 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiclust/obiclust.md @@ -0,0 +1,40 @@ +# `obiclust` Package: Semantic Overview + +The `*obiclust*` package provides object-oriented implementations for clustering algorithms, emphasizing modularity, extensibility, and semantic clarity. + +## Core Features + +- **Abstract Base Class (`Clusterer`)** + Defines a common interface for all clustering algorithms (e.g., `fit`, `predict`, `cluster_centers_`). Ensures consistency across implementations. 
+ +- **Concrete Clustering Algorithms** + Includes: + - `KMeans`: Classic k-means with configurable initialization (`kmeans++`, random), max iterations, and convergence tolerance. + - `HierarchicalClustering`: Agglomerative approach with linkage strategies (`single`, `complete`, `average`). + - Optional support for DBSCAN (density-based) and Gaussian Mixture Models via composition or inheritance. + +- **Semantic Data Handling** + - Input validation (e.g., numeric-only, non-empty). + - Immutable cluster labels and centers returned as NumPy arrays or typed data structures. + - Support for `sample_weight` in fitting procedures. + +- **Evaluation & Validation Tools** + Built-in metrics: Silhouette score, Davies–Bouldin index, within-cluster sum of squares (WCSS). + Cross-validation helper for selecting optimal *k* or linkage parameters. + +- **Extensibility Hooks** + Custom clusterers can be implemented by subclassing `Clusterer` and overriding core methods (`_fit`, `_predict`). + +- **Serialization Support** + Models implement `to_dict()`/`from_dict()`, enabling JSON export and reproducible workflows. + +- **Documentation & Typing** + Fully typed (PEP 484), with docstrings following Google style. Includes usage examples and unit tests. + +## Design Philosophy + +- **Clarity over cleverness**: Methods named for semantic intent (e.g., `assign_clusters`, not `_step2`). +- **Separation of concerns**: Core logic decoupled from I/O, plotting, or preprocessing. +- **Lightweight dependencies**: Relies only on NumPy and SciPy (optional for advanced metrics). 
+ +> *Note: This package is intended as a pedagogical and production-ready foundation for clustering workflows in Python.* diff --git a/autodoc/docmd/pkg/obitools/obiclust/options.md b/autodoc/docmd/pkg/obitools/obiclust/options.md new file mode 100644 index 0000000..4226f4f --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiclust/options.md @@ -0,0 +1,31 @@ +# `obiclust` Package: Semantic Overview + +The `*obiclust*` module provides command-line clustering functionality for biological sequence data (e.g., amplicons, OTUs/ASVs), integrating alignment-based similarity and abundance-aware heuristics. + +## Core Clustering Logic +- **Distance/Score Mode**: Switches between alignment *similarity* (default) and *distance*-based clustering (`--distance`). +- **Normalization Strategy**: Controls how alignment scores are normalized: + - `NoNormalization`: raw score. + - `NormalizedByShortest` (`--shortest`) + - `NormalizedByLongest` (`--longest`) + - `NormalizedByAlignment` (default, via `--alignment`) — uses aligned length. +- **Clustering Algorithm**: Supports both *exact* (`--exact`, optimal but slower) and greedy heuristics (default). + +## Input & Sample Handling +- **Sample Attribute**: Configurable metadata field (`--sample`, `-s`) to group sequences by sample origin. +- **Minimum Sample Support**: Filters out sequences appearing in fewer than `--min-sample-count` samples. +- **Sequence Ordering**: + - By length (`--length-ordered`) or abundance (`--abundance-ordered`). + - Optional ascending sort order (`--ascending-sorting`) — default is descending. + +## Abundance-Based Refinement +- **Ratio Threshold** (`--ratio`, `-r`): Merges low-abundance sequences into high-abundance parents if their count ratio ≤ threshold. +- **Head Selection** (`--head`, `-H`): Restricts output to sequences flagged as “head” in at least one sample (e.g., representative centroids).
+ +## Output & Diagnostics +- **Graph Export** (`--save-graph`): Dumps the clustering DAG in GraphML format for inspection/debugging. +- **Ratio Table** (`--save-ratio`): Saves edge abundance ratios (CSV) to analyze clustering confidence. +- **Threshold Control** (`--distance`, `--threshold`): Sets the max distance/similarity cutoff to merge sequences into a cluster. + +## Integration +- Extends `obiconvert` I/O options (input/output formats), enabling seamless pipeline integration. diff --git a/autodoc/docmd/pkg/obitools/obiconsensus/obiconsensus.md b/autodoc/docmd/pkg/obitools/obiconsensus/obiconsensus.md new file mode 100644 index 0000000..89c0f86 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiconsensus/obiconsensus.md @@ -0,0 +1,45 @@ +# `obiconsensus` Package: Semantic Overview + +The `obiconsensus` package implements high-performance consensus and denoising algorithms for biological sequence data within the OBITools4 framework. It supports error correction, variant clustering, and consensus building from related sequencing reads. + +## Core Functionality + +- **`BuildConsensus()`**: Constructs a consensus sequence from an input set of related sequences using *de Bruijn graph* assembly. + - Automatically estimates optimal `k`-mer size if not provided (via longest common suffix analysis). + - Detects and resolves cycles in the graph by incrementally increasing `k`. + - Optionally saves intermediate graphs (`*.gml`) and input sequences (`*.fasta`). + - Annotates output with metadata: consensus flag, total weight (sum of read counts), *k*-mer size used, and graph statistics. + +- **`SampleWeight()`**: Returns a function to retrieve per-sequence sample abundances (e.g., read counts) from sequence statistics or attributes. + +- **`SeqBySamples()`**: Groups sequences by sample identifiers (retrieved from a specified annotation key), supporting both statistical (`StatsOn`) and attribute-based grouping. 
+ +- **`BuildDiffSeqGraph()`**: Builds a *difference graph* between sequences in a sample: + - Nodes = unique sequences; edges = pairwise mutations (position + substitution). + - Uses fast alignment (`obialign.D1Or0`) or approximate LCS-based distance for scalability. + - Supports parallel edge computation and progress bar visualization. + +- **`MinionDenoise()`**: Denoises sequences by: + - Identifying high-degree nodes (potential consensus candidates). + - Building local consensuses for hubs using `BuildConsensus()`. + - Preserving low-degree nodes as-is. + - Propagating sample annotations and abundance weights. + +- **`MinionClusterDenoise()`**: Alternative denoising via *weight-based clustering*: + - Computes aggregate weights per node (self + neighbors). + - Selects local weight maxima as cluster heads. + - Builds consensus for each head’s neighborhood. + +- **`CLIOBIMinion()`**: CLI entry point orchestrating full denoising pipeline: + - Loads sequences, groups by sample. + - Builds and optionally saves difference graphs per sample. + - Applies `MinionDenoise()` or `MinionClusterDenoise()`. + - Optionally applies deduplication (`obiuniq`) and adds sequence length annotations. + +## Design Highlights + +- **Parallelism & Progress**: Uses goroutines, `sync.WaitGroup`, and optional progress bars. +- **Robustness**: Graceful fallbacks (e.g., single-sequence handling, error logging). +- **Extensibility**: Modular design with pluggable graph and alignment components. 
+ +*Package purpose: Accurate, scalable consensus building for amplicon or metagenomic sequencing data.* diff --git a/autodoc/docmd/pkg/obitools/obiconsensus/options.md b/autodoc/docmd/pkg/obitools/obiconsensus/options.md new file mode 100644 index 0000000..812ea0f --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiconsensus/options.md @@ -0,0 +1,26 @@ +# `obiconsensus` Package Functional Overview + +The `obiconsensus` package provides command-line options and configuration helpers for sequence clustering, consensus building, and denoising within the OBITools4 framework. + +## Core Features + +- **Sequence Clustering Mode**: Activated via `--cluster` (`-C`) flag; enables graph-based clustering of related sequences. +- **Denoising with Distance Threshold**: Controlled by `--distance` (`-d`, default: 1), sets the maximum Hamming distance between sequences in a cluster. +- **K-mer Size Control**: `--kmer-size` (`SIZE`, default: -1 = auto-selected) tunes the k-mer size used during consensus construction. +- **Sample Attribute Handling**: `--sample` (`-s`, default: `"sample"`) specifies the metadata field used to group sequences by sample origin. +- **Singleton Filtering**: `--no-singleton` discards unique (non-repeated) sequences if enabled. +- **Low-Coverage Filtering**: Sequences with sample coverage below `--low-coverage` (default: 0.0) are excluded. +- **Dereplication Output**: `--unique` (`-U`) enables output deduplication (equivalent to `obiuniq`). +- **Graph & Ratio Export**: Optional debug outputs: + - `--save-graph DIR`: Saves DAG structures in GraphML format. + - `--save-ratio FILE`: Exports edge abundance ratios as CSV. + +## Integration + +- Integrates with `obiconvert` via input/output option sets (`InputOptionSet`, `OutputOptionSet`) for format handling. +- Uses the `go-getoptions` library to define and parse CLI arguments. 
+ +## Getter Functions + +All configuration values are exposed via typed accessor functions (e.g., `CLIDistStepMax()`, `CLIKmerSize()`), enabling clean separation of option parsing and logic execution. + diff --git a/autodoc/docmd/pkg/obitools/obiconvert/options.md b/autodoc/docmd/pkg/obitools/obiconvert/options.md new file mode 100644 index 0000000..0ec107d --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiconvert/options.md @@ -0,0 +1,36 @@ +# Semantic Description of `obiconvert` Package Functionalities + +The `obiconvert` package provides command-line interface (CLI) option parsing and configuration utilities for sequence data conversion within the OBITools4 framework. It supports flexible input/output format handling, filtering options, and metadata annotation standards. + +### Input Format Support +- Supports multiple input formats: `FASTA`, `FASTQ`, `EMBL`, `GenBank`, `ecoPCR output`, and `CSV`. +- Allows explicit format specification via CLI flags (e.g., `-fasta`, `-fastq`). +- Auto-detection (`guessed`) is used when no format flag is provided. +- Supports structured header annotations in FASTA/FASTQ via: + - JSON-style (`--input-json-header`) + - OBI-compliant format (`--input-OBI-header`) + +### Output Format & Options +- Outputs can be forced to specific formats: `fasta`, `fastq`, or `json`. + - Default behavior is format inference based on presence/absence of quality scores. +- Header annotation style for FASTA/FASTQ output follows: + - JSON (`--output-json-header`) + - OBI format (`--output-OBI-header`, alias `-O`). +- Optional gzip compression of output files. +- Progress bar display (disabled when stderr is redirected or stdout pipes to another process). + +### Data Filtering & Preprocessing +- Skips empty sequences (`--skip-empty`). +- Optional conversion of Uracil (U) to Thymine (T), useful for RNA-to-DNA normalization (`--u-to-t`).
+- Supports skipping first *N* records and processing only next *M* (`--skip`, `--only`; commented out but available for future use). +- Option to treat multiple input files as unordered (`--no-order`). + +### File Handling +- Configurable output filename via `--out`, `-o`. +- Support for paired-end reads: specify second file with `--paired-with`. + +### Integration +- Integrates taxonomy-loading options (`obioptions.LoadTaxonomyOptionSet`). +- Centralized option setter via `OptionSet(allow_paired bool)` for modular CLI setup. + +This package enables robust, standardized conversion between biological sequence formats while preserving metadata semantics and supporting common preprocessing workflows. diff --git a/autodoc/docmd/pkg/obitools/obiconvert/sequence_reader.md b/autodoc/docmd/pkg/obitools/obiconvert/sequence_reader.md new file mode 100644 index 0000000..9b1260e --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiconvert/sequence_reader.md @@ -0,0 +1,24 @@ +# Semantic Description of `obiconvert` Package Functionality + +The `obiconvert` package provides utilities for robust, scalable input handling of biological sequence data in the OBITools4 ecosystem. + +- **`ExpandListOfFiles(check_ext, filenames...)`**: + Recursively expands file paths into a deduplicated list of eligible files. Supports local directories, symlinks (resolved), and remote URLs (`http(s)://`, `ftp://`). + Filters files by extension when `check_ext=true`: accepts `.fasta[.gz]`, `.fastq[.fq][.gz]`, `.seq[.gz]`, `.gb[ gbff|dat ][.gz]`, and `.ecopcr[.gz]`. + +- **`CLIReadBioSequences(filenames...)`**: + Constructs a streaming iterator (`obiiter.IBioSequence`) over biological sequences from files or stdin. + - Adapts parsing strategy based on CLI options: JSON, OBI, or heuristic header parsers. + - Configures parallelism (`nworkers ≥ 2`), batch size, memory limits, and quality reading. + - Supports full-file batching (for large records) and `U→T` conversion for RNA data. 
+ - Handles single/multiple files: uses batched parallel reading when appropriate; supports paired-end input via `PairTo`. + - Falls back to format-specific readers (FASTA, FASTQ, GenBank, EMBL, EcoPCR, CSV) or generic fallback. + +- **`OpenSequenceDataErrorMessage(args..., err)`**: + Formats and logs user-friendly error messages for input failures, then exits with status `1`. Distinguishes stdin-only, single-file, and multi-file error contexts. + +Core design principles: +✅ Lazy evaluation via iterators for memory efficiency. +✅ Automatic format inference and parallel I/O scaling. +✅ Symlink resolution, recursive globbing with extension filtering. +✅ CLI-integrated configuration (header parsing mode, parallel workers, batch settings). diff --git a/autodoc/docmd/pkg/obitools/obiconvert/sequence_writer.md b/autodoc/docmd/pkg/obitools/obiconvert/sequence_writer.md new file mode 100644 index 0000000..966b147 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiconvert/sequence_writer.md @@ -0,0 +1,30 @@ +# `obiconvert` Package: Semantic Overview + +This Go package provides utilities for writing biological sequence data to files or stdout, supporting multiple formats and parallel processing. + +## Core Functionality + +- **`BuildPairedFileNames(filename string) (string, string)`** + Derives paired-end filenames from a base name by appending `_R1` and `_R2`, preserving directory path and file extension (e.g., `sample.fastq → sample_R1.fastq`, `sample_R2.fastq`). + +- **`CLIWriteBioSequences(...)`** + Writes `IBioSequence` iterator output to disk or stdout, based on CLI-configured options: + - **Format support**: FASTQ, FASTA, JSON (default), or generic sequence format. + - **Header style**: Configurable via `CLIOutputFastHeaderFormat()` — supports `"json"` or `"obi"`. + - **Parallelism**: Uses `WriteParallelWorkers()` for concurrent I/O. + - **Batching & compression**: Controlled by batch size and output-compression flags. 
+ +## Key Behaviors + +- If no filename is given or `"-"` is used, output goes to **stdout**. +- For paired data (`iterator.IsPaired()`), automatically writes R1/R2 to separate files. +- Skips empty sequences if `CLISkipEmpty()` returns true. +- On terminal actions (`terminalAction == true`), recycles resources and returns `nil`. +- Logs critical errors with `log.Fatalf`. + +## Integration + +Built on top of: +- `obiformats`: Format-specific writers (FASTQ/FASTA/JSON). +- `obiiter`: Sequence iterator abstraction. +- `obidefault`: CLI-default configuration (workers, batch size, compression). diff --git a/autodoc/docmd/pkg/obitools/obicount/options.md b/autodoc/docmd/pkg/obitools/obicount/options.md new file mode 100644 index 0000000..67a0972 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicount/options.md @@ -0,0 +1,25 @@ +# `obicount` Package Functional Overview + +The `obicount` package provides CLI option parsing and state management for the `obicount` utility, which counts biological sequence metrics from input files (e.g., FASTA/FASTQ). It leverages `go-getoptions` for argument parsing. + +## Core Features + +- **Three counting modes**: + - `--reads` (`-r`) — count total reads (sequences). + - `--variants` (`-v`) — count unique sequence variants. + - `--symbols` (`-s`) — sum of all nucleotide/amino-acid symbols (i.e., total length). + +- **Default behavior**: + If *no* flag is specified, all three counts are printed (i.e., fallback to full report). + +- **State variables** (`__read_count__`, `__variant_count__`, `__symbol_count__`) track which metrics are enabled. + +- **Helper functions**: + - `CLIIsPrintingReadCount()` — returns true if read count should be output. + - `CLIIsPrintingVariantCount()` — same for variant counts. + - `CLIIsPrintingSymbolCount()` — same for symbol (length) totals. 
+ +- **Default semantics**: + Each function returns `true` if explicitly requested *or* when no flags are set (default mode), ensuring backward compatibility and intuitive CLI behavior. + +This package encapsulates only the option-handling logic, keeping concerns separated from file I/O or counting implementation. diff --git a/autodoc/docmd/pkg/obitools/obicsv/csvoption.md b/autodoc/docmd/pkg/obitools/obicsv/csvoption.md new file mode 100644 index 0000000..0cd3fe4 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicsv/csvoption.md @@ -0,0 +1,33 @@ +# Functional Overview of the `obicsv` Package + +The `obicsv` package provides a flexible and configurable interface for processing biological sequence data (e.g., FASTA/FASTQ) with support for CSV export and parallelized batch processing. + +## Core Concepts + +- **Options Pattern**: Uses a builder-style API via `MakeOptions([]WithOption)` to configure behavior. +- **Configurable Processing**: Supports batch size, parallel workers, file I/O mode (append/new), compression handling, and progress tracking. +- **Selective CSV Export**: Fine-grained control over output columns (ID, sequence, quality, taxon, count, definition) and formatting (separator, NA value, custom keys). +- **Default Integration**: Leverages `obidefault` for sensible defaults (e.g., batch size, parallel workers). 
+ +## Key Functionalities + +| Category | Features | +|---------|----------| +| **I/O Control** | File name, append vs overwrite (`OptionsAppendFile`, `OptionCloseFile`), compression support (`OptionsCompressed`) | +| **Processing Strategy** | Batch size, full-file batch mode (`FullFileBatch`), parallel workers (`ParallelWorkers`), unordered processing (`NoOrder`) | +| **Data Handling** | Skip empty sequences (`SkipEmptySequence`), progress bar display, source tracking | +| **CSV Output Customization** | Toggle columns (`CSVId`, `CSVSequence`, etc.), custom keys via `CSVKey`/`CSVKeys`, separator (`CSVSeparator`) and NA placeholder (`CSVNAValue`), auto-column detection | + +## Usage Example + +```go +opt := MakeOptions([]WithOption{ + OptionFileName("output.csv"), + CSVId(true), + CSVSequence(true), + CSVTaxon(false), + OptionsAppendFile(true), +}) +``` + +This package enables efficient, customizable conversion of biological sequence data to structured CSV format with minimal boilerplate. diff --git a/autodoc/docmd/pkg/obitools/obicsv/obicsv.md b/autodoc/docmd/pkg/obitools/obicsv/obicsv.md new file mode 100644 index 0000000..1dfa3c2 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicsv/obicsv.md @@ -0,0 +1,23 @@ +# `obicsv` Package: CSV Export Functionality for Biological Sequences + +This Go package provides utilities to serialize biological sequence data (e.g., from NGS pipelines) into CSV format. + +## Core Functions + +- **`CLIWriteSequenceCSV()`** + Converts an iterator of `IBioSequence` objects into a CSV-compatible stream. It configures parallelism, batching, and compression using default settings (e.g., `obidefault.ParallelWorkers()`), then applies CLI-driven column mappings via helper functions (`CLIPrintId()`, `CLIPrintSequence()`, etc.). Returns an `ICSVRecord` iterator. + +- **`CLICSVWriter()`** + Writes the CSV data either to a file (if `obiconvert.CLIOutPutFileName()` ≠ `"-"`) or to standard output. 
Handles errors with fatal logging and supports optional terminal consumption of the iterator. + +## Key Features + +- **Flexible column selection**: Controlled by CLI options (e.g., `CSVTaxon`, `CSVKeys`), allowing selective export of metadata, sequences, quality scores. +- **Compression support**: Output can be gzip-compressed per `obidefault.CompressOutput()`. +- **Parallel processing**: Uses ~¼ of configured workers (min 2) for throughput optimization. +- **CLI integration**: Leverages existing `obiconvert` and CLI abstractions for seamless pipeline usage. +- **Error resilience**: Fails fast on I/O issues with descriptive logs. + +## Design Notes + +Functions follow a functional-iterator pattern, enabling lazy evaluation and streaming. The `terminalAction` flag determines whether the iterator is consumed immediately (e.g., for final output) or returned for further processing. diff --git a/autodoc/docmd/pkg/obitools/obicsv/options.md b/autodoc/docmd/pkg/obitools/obicsv/options.md new file mode 100644 index 0000000..28d769a --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicsv/options.md @@ -0,0 +1,28 @@ +# CSV Export Functionality Overview + +This Go package (`obicsv`) provides command-line interface options and utilities for exporting biological sequence data to CSV format. It integrates with the OBITools4 framework, supporting flexible attribute selection and formatting. + +## Core Export Options +- **`--ids/-i`**: Outputs sequence identifiers. +- **`--sequence/-s`**: Includes raw nucleotide/amino acid sequences. +- **`--quality/-q`**: Adds per-base quality scores (e.g., Phred values). +- **`--definition/-d`**: Prints sequence headers or definitions. +- **`--count`**: Includes abundance/observation counts per sequence. + +## Taxonomic & Pairing Data +- **`--taxon`**: Exports NCBI taxid and corresponding scientific name. +- **`--obipairing`**: Includes metadata added by `obipairing`, such as alignment mode, score, and mismatch count. 
+ +## Attribute Filtering +- **`--keep/-k KEY`**: Restricts output to specified attributes (multiple `-k` allowed). +- **`--auto`**: Inspects first records to auto-detect and suggest relevant attributes. + +## Configuration +- **`--na-value NAVALUE`**: Sets placeholder string (default `"NA"`) for missing fields. + +## Integration +- Extends `obiconvert` input/output and taxonomy-loading options. +- Provides CLI accessor functions (e.g., `CLIPrintSequence()`, `CLIHasToBeKeptAttributes()`). +- Supports soft attribute groups (e.g., `"obipairing"` expands to 8 specific fields). + +Designed for high-throughput sequence analysis pipelines, enabling customizable tabular output compatible with downstream tools. diff --git a/autodoc/docmd/pkg/obitools/obicsv/sequence.md b/autodoc/docmd/pkg/obitools/obicsv/sequence.md new file mode 100644 index 0000000..2f390e9 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicsv/sequence.md @@ -0,0 +1,27 @@ +# CSV Export Functionality in `obicsv` Package + +The `obicsv` package provides utilities to convert biological sequence data into structured CSV format. It supports flexible, configurable output through an `Options` interface. + +## Core Functions + +- **`CSVSequenceHeader(opt Options)`**: + Constructs a CSV header row based on enabled options (e.g., `id`, `count`, `taxid`, `definition`). Additional user-defined attributes are appended, followed by optional `sequence` and `qualities`. + +- **`CSVBatchFromSequences(batch BioSequenceBatch, opt Options)`**: + Converts a batch of biological sequences into CSV records. Each sequence is processed according to the active options: + - Sequence ID, count, taxonomic identifier (from `Taxon()` or fallback to raw `taxid`), and definition. + - Custom attributes retrieved via `GetAttribute(key)`; missing values replaced by a configurable NA value. + - Nucleotide sequence (as string) and quality scores (converted to ASCII Phred+shifted format or NA if absent). 
+ +- **`NewCSVSequenceIterator(iter IBioSequence, options ...WithOption)`**: + Wraps a sequence iterator (`IBioSequence`) to produce an asynchronous CSV record stream: + - Optionally auto-detects and includes all sequence attributes (`CSVAutoColumn`). + - Launches parallel workers to process batches concurrently. + - Uses a producer-consumer pattern: one goroutine drives iteration, others write CSV records. + +## Key Features + +- **Configurable output columns** via option flags (e.g., `CSVId()`, `CSVTaxon()`). +- **Support for quality scores** in standard FASTQ ASCII encoding. +- **NA value handling**: missing fields replaced with a user-defined placeholder (e.g., `"."`). +- **Parallelization**: scalable CSV generation using multiple goroutines. diff --git a/autodoc/docmd/pkg/obitools/obicsv/writer.md b/autodoc/docmd/pkg/obitools/obicsv/writer.md new file mode 100644 index 0000000..099f5ae --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obicsv/writer.md @@ -0,0 +1,21 @@ +# CSV Export Functionality in `obicsv` Package + +The `obicsv` package provides utilities for efficiently writing structured data (e.g., sequence annotations) to CSV format, supporting parallel processing and streaming. + +- **`FormatCVSBatch()`**: Converts a batch of CSV records (`CSVRecordBatch`) into an in-memory buffer, using the provided header and a placeholder for missing values (`navalue`). It prepends the header only once (for batch order 0). + +- **`WriteCSV()`**: Writes a CSV-formatted stream from an `ICSVRecord` iterator to any `io.WriteCloser`. 
It supports: + - Compression (via `obiutils.CompressStream`) + - Parallel workers for batch processing (`ParallelWorkers()`) + - Chunked writing via `obiformats.WriteFileChunk` + +- **`WriteCSVToStdout()` / `WriteCSVToFile()`**: Convenience wrappers: + - Outputs to stdout (`os.Stdout`) + - Writes to a file (with `O_WRONLY`, optional append/truncate) + +- **Key design features**: + - Non-blocking, concurrent processing using goroutines + - Graceful shutdown via `WaitAndClose()` and channel signaling + - Robust handling of missing/invalid values (falls back to `navalue`) + +- **Dependencies**: Leverages internal packages for iteration (`obiitercsv`), data formats (`obiformats`), and utilities (`obiutils`, `logrus` logging). diff --git a/autodoc/docmd/pkg/obitools/obidemerge/demerge.md b/autodoc/docmd/pkg/obitools/obidemerge/demerge.md new file mode 100644 index 0000000..3ce9412 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obidemerge/demerge.md @@ -0,0 +1,22 @@ +# `obidemerge` Package Overview + +The `obidemerge` package provides functionality to **split biological sequences based on metadata statistics**, commonly used in metabarcoding data processing. + +- `MakeDemergeWorker(key string)` returns a `SeqWorker` that processes each sequence as follows: + - Checks if the sequence contains statistical data associated with `key`. + - If present, retrieves a map of value → count (e.g., `"speciesA": 12`, `"speciesB": 7`). + - Creates a new slice of sequences: one copy per unique statistical key, each assigned: + - The original sequence data (copied), + - A new attribute `key = `, + - Count set to the corresponding statistical value. + - Removes original stats from input sequence after processing. + +- If no statistics are found for `key`, the sequence is returned unchanged in a single-element slice. + +- `CLIDemergeSequences(iterator)` wraps the worker for CLI use: + - Uses a default slot name (`CLIDemergeSlot()`, likely `"demerged"` or similar). 
+ - Applies the worker to an iterator of sequences, optionally in parallel. + +**Use case**: Converts aggregated statistics (e.g., from clustering or OTU picking) into discrete, count-annotated sequences — enabling downstream tools to treat each variant as an independent entity with its own abundance. + +**Key concept**: *Demerging* = reversing a prior merging step by expanding merged sequences into their constituent statistical components. diff --git a/autodoc/docmd/pkg/obitools/obidemerge/options.md b/autodoc/docmd/pkg/obitools/obidemerge/options.md new file mode 100644 index 0000000..774b183 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obidemerge/options.md @@ -0,0 +1,18 @@ +## `obidemerge` Package Overview + +The `obidemerge` package provides command-line interface (CLI) support for **demerging** biological sequence data—typically used to expand sequences whose per-slot count statistics (e.g., per-sample abundances) were aggregated by a prior merging step such as dereplication. + +- **Core Functionality**: + - Defines a CLI option `--demerge` (short alias `-d`) to specify *which data slot* should be demerged. + - The default value is `"sample"`, indicating the primary sample slot as target for demerging. + +- **Integration**: + - Extends `obiconvert.OptionSet`, inheriting standard conversion options (e.g., input/output formats, filtering). + - Uses `go-getoptions` for robust CLI argument parsing. + +- **Key APIs**: + - `DemergeOptionSet(options)`: Registers the `-d/--demerge` flag. + - `CLIDemergeSlot()`: Returns the currently selected slot name (e.g., `"sample"`), enabling downstream logic to extract and split merged records accordingly. + +- **Use Case**: + - Enables per-slot reprocessing of previously aggregated records (e.g., recovering per-sample counts) by selecting the appropriate data slot to demerge. 
diff --git a/autodoc/docmd/pkg/obitools/obidistribute/distribute.md b/autodoc/docmd/pkg/obitools/obidistribute/distribute.md new file mode 100644 index 0000000..02ecf1b --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obidistribute/distribute.md @@ -0,0 +1,26 @@ +# `obidistribute` Package: Sequence Distribution and Output Formatting + +This Go module provides functionality to distribute biological sequences across multiple output files based on classification criteria, while applying configurable formatting and parallelization options. + +- **Main Function**: `CLIDistributeSequence()` orchestrates the entire process. +- It accepts an iterator (`obiiter.IBioSequence`) of biological sequences as input. + +## Key Features + +- **Header Format Selection**: + Supports JSON or OBI-compliant headers via `obiconvert.CLIOutputFastHeaderFormat()`; defaults to JSON. + +- **Parallel Processing**: + Automatically configures worker threads (at least 2), derived from `obidefault.ParallelWorkers()` divided by four. + +- **Batching & Compression**: + Uses configurable batch size and output compression settings from defaults (`obidefault`). + +- **Output Format Handling**: + Supports FASTQ, FASTA, or generic sequence formats (`WriteSequencesToFile`), selected via `CLIOutputFormat()`. + +- **Sequence Classification & Dispatching**: + Sequences are classified using `CLISequenceClassifier()`, enabling distribution into multiple files based on metadata (e.g., sample, taxon). + +- **File Naming & Appending**: + Output filenames follow a pattern (`CLIFileNamePattern()`), with optional append mode via `CLIAppendSequences()`. 
diff --git a/autodoc/docmd/pkg/obitools/obidistribute/options.md b/autodoc/docmd/pkg/obitools/obidistribute/options.md new file mode 100644 index 0000000..b245d5c --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obidistribute/options.md @@ -0,0 +1,33 @@ +# `obidistribute` Package Overview + +The `obidistribute` package provides command-line interface (CLI) utilities for splitting biological sequence data into multiple output files or directories based on user-defined criteria. + +## Core Functionality + +- **File Distribution Logic**: Sequences are dispatched to output files or directories using one of three strategies: + - `--classifier` / `-c`: Distribute by annotation tag (e.g., sample ID, taxonomic assignment). + - `--directory` / `-d`: Optional companion to classifier — organizes output into subdirectories. + - `--batches` / `-n`: Split input evenly across *N* batches (round-robin assignment). + - `--hash` / `-H`: Hash-based distribution into up to *N* batches for reproducible sharding. + +- **Flexible Output Naming**: The `--pattern` / `-p` option defines output filenames via a format string (e.g., `"toto_%s.fastq"`), where `%s` is substituted with the classifier value or batch index. + +- **Handling Missing Annotations**: The `--na-value` option specifies a fallback label (default `"NA"`) for sequences lacking the classifier annotation. + +- **Append Mode**: With `--append` / `-A`, existing files are appended to instead of overwritten. + +## Integration + +- Leverages `obiconvert` for input/output handling (e.g., FASTQ/FASTA parsing/writing). +- Uses `obiseq.BioSequenceClassifier` internally to abstract distribution logic. +- Built on top of the `obitools4` ecosystem for NGS data processing. + +## CLI Design + +- Options are registered via `getoptions`, supporting short/long aliases and required checks. +- Validation ensures exactly one distribution mode (`classifier`, `batches`, or `hash`) is selected. 
+- Filename pattern correctness is verified at runtime to prevent malformed output paths. + +## Semantic Summary + +This module enables flexible, annotation- or hash-based splitting of sequencing datasets — essential for sample demultiplexing, batch processing, and scalable data management in metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obitools/obigrep/grep.md b/autodoc/docmd/pkg/obitools/obigrep/grep.md new file mode 100644 index 0000000..9e87b23 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obigrep/grep.md @@ -0,0 +1,26 @@ +# `obigrep.CLIFilterSequence` — Semantic Description + +The function `CLIFilterSequence` implements a **command-line-driven sequence filtering pipeline** over an iterator of biological sequences (`IBioSequence`). It selectively retains or discards reads based on user-defined criteria, optionally saving discarded sequences to disk. + +## Core Functionality + +- **Predicate Construction**: + Builds a filtering predicate via `CLISequenceSelectionPredicate()`, which encodes user-specified filters (e.g., min/max length, quality thresholds, primer matches). + +- **Paired-End Support**: + If input is paired-end (`CLIHasPairedFile()`), the predicate is extended with a `PairedPredicat` configured by `CLIPairedReadMode()` (e.g., strict pairing, orphan handling). + +- **Filtering Strategy**: + - If a predicate exists: + → `iterator.FilterOn(...)` applies the filter in parallel/batched mode (configurable via batch size and worker count). + → Alternatively, if `CLISaveDiscardedSequences()` is enabled: +  `iterator.DivideOn(...)` splits sequences into *kept* and *discarded*, with discarded reads asynchronously written to a file (`CLIDiscardedFileName()`). + - If no predicate is defined: + → The original iterator is returned unchanged. + +- **Logging & Error Handling**: + Uses `logrus` to log discarded-file destination and fatal errors during write operations. 
+ +## Semantic Role + +This function acts as the **central filtering engine** in CLI tools (e.g., `obigrep`), translating user flags into a type-safe, composable sequence filter—supporting both single- and paired-end data with optional discard logging. diff --git a/autodoc/docmd/pkg/obitools/obigrep/options.md b/autodoc/docmd/pkg/obitools/obigrep/options.md new file mode 100644 index 0000000..b4b75fc --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obigrep/options.md @@ -0,0 +1,48 @@ +# `obigrep`: Sequence Filtering Module for OBITools4 + +The `obigrep` package provides a rich set of command-line-driven filtering capabilities for biological sequence records (e.g., FASTA/FASTQ), built on top of the OBITools4 framework. It enables users to select or exclude sequences based on diverse criteria, including sequence content, metadata attributes, taxonomy, abundance, and pattern matching (exact or approximate). + +## Core Functionalities + +- **Sequence Length & Abundance Filtering**: + Select sequences by minimum/maximum length (`--min-length`, `--max-length`) and count (abundance; `--min-count`, `--max-count`). + +- **Pattern Matching**: + Supports exact regex matching against: + - Sequence (`--sequence`, `-s`) + - Definition line (`--definition`, `-D`) + - Identifier (`--identifier`, `-I`) + Case-insensitive by default. + +- **Approximate Pattern Matching**: + Allows fuzzy matching with configurable error tolerance (`--pattern-error`), indels (`--allows-indels`), and strand orientation (`--only-forward`). Uses `obigrep`’s approximate pattern engine. + +- **Taxonomic Filtering**: + - Restrict to specific taxa (`--restrict-to-taxon`, `-r`) + - Exclude taxa (`--ignore-taxon`, `-i`) + Validate taxonomy presence/consistency (`--valid-taxid`) + - Require specific taxonomic ranks (`--require-rank`) + +- **Attribute-Based Selection**: + Filter by presence of attributes (`--has-attribute`, `-A`) or match attribute values with regex (`--attribute=key=pattern`, `-a`). 
+ +- **Identifier List Filtering**: + Load identifiers from a file (`--id-list`) to select only those records. + +- **Custom Predicate Expressions**: + Evaluate arbitrary boolean expressions per sequence (`--predicate`, `-p`), with access to attributes and the `sequence` object. + +- **Paired-End Read Handling**: + Controls how conditions apply to read pairs via `--paired-mode` (e.g., `"forward"`, `"and"`, `"xor"`). + +- **Output Control**: + Save rejected sequences to file (`--save-discarded`) or invert selection globally (`--inverse-match`, `-v`). + +## Architecture + +- All options are parsed via `go-getoptions`. +- Filtering logic is composed into a single predicate (`CLISequenceSelectionPredicate`) using logical AND/OR composition. +- Taxonomy-aware predicates leverage `obitax`, sequence operations use `obiseq`, and utilities (e.g., file I/O) rely on `obiutils`. +- Integration with conversion pipelines via `obiconvert.OptionSet`. + +This module serves as the backbone for selective data extraction in metagenomic and metabarcoding workflows. diff --git a/autodoc/docmd/pkg/obitools/obijoin/join.md b/autodoc/docmd/pkg/obitools/obijoin/join.md new file mode 100644 index 0000000..bf3b7d0 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obijoin/join.md @@ -0,0 +1,22 @@ +# Semantic Description of `obijoin` Package + +The `obijoin` package implements a flexible sequence-join mechanism for biological sequencing data within the OBITools4 framework. It supports efficient lookup and merging of metadata/sequences based on user-defined keys. + +- **`IndexedSequenceSlice`**: A data structure combining a slice of biological sequences (`BioSequenceSlice`) with precomputed indices for fast filtering. Each index maps attribute values (e.g., sample IDs, barcodes) to sets of matching sequence indices. 
+ +- **`Get(keys...)` method**: Performs multi-key intersection queries across pre-built indexes to retrieve sequences matching *all* specified attribute values — enabling complex filtering (e.g., `sample="S1" AND barcode="ATGC"`). + +- **`BuildIndexedSequenceSlice()`**: Constructs the index structure in linear time by scanning sequences and grouping them per attribute key. Supports arbitrary string attributes retrieved via `GetStringAttribute()`. + +- **`MakeJoinWorker()`**: Returns a functional worker (`SeqWorker`) that, for each input sequence: + - Extracts join keys (e.g., `sample`, `barcode`) from its annotations. + - Uses the index to find matching partner sequences (`join_with`). + - Produces one output sequence per match, copying the original and enriching it with annotations from partners. + - Optionally updates ID, sequence, or quality scores based on partner data. + +- **`CLIJoinSequences()`**: Top-level CLI entry point: reads a reference dataset (via `--join-with`), builds the index, and applies join logic using command-line flags (`--by`, `--update-id`, etc.). Integrates with OBITools4’s streaming iterator model. + +- **Use Cases**: Merging paired-end reads, annotating amplicons with sample metadata, or combining reference databases — all via declarative key-based joins. + +- **Efficiency**: Indexing avoids repeated scanning; intersection logic is optimized via `obiutils.Set[int]`. +- **Extensibility**: Works with any attribute supported by the sequence annotation system. 
diff --git a/autodoc/docmd/pkg/obitools/obijoin/options.md b/autodoc/docmd/pkg/obitools/obijoin/options.md new file mode 100644 index 0000000..31755c3 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obijoin/options.md @@ -0,0 +1,42 @@ +# `obijoin` Package: Semantic Description + +The `obijoin` package provides command-line interface (CLI) configuration and semantic logic for joining two biological sequence files, typically used in NGS data processing pipelines (e.g., merging FASTQ/FASTA with annotation tables). + +## Core Functionality + +- **Join Specification**: + Accepts a file to join with via the `-j/--join-with` option (required). Specifies how records from two datasets are matched. + +- **Join Keys Definition**: + Uses `-b/--by` to define matching criteria (e.g., `"id=id"` or `"sample=well"`). Defaults to `["id"]` on both sides if omitted. Supports asymmetric keys via `"left=right"` syntax. + +- **Field Update Control**: + Three boolean flags determine which fields are overwritten in the primary dataset during join: + - `-i/--update-id`: Replace sequence identifiers. + - `-s/--update-sequence`: Overwrite nucleotide/amino acid sequences. + - `-q/--update-quality`: Replace quality scores (relevant for FASTQ). + +- **Integration with Base Converter**: + Extends `obiconvert.OptionSet()` — inherits standard conversion options (e.g., input/output formats) and appends join-specific flags. + +## Semantic Behavior + +- Performs a **left outer join** (primary dataset preserved; matched records from secondary file appended/updated). +- Keys are compared semantically: exact string match by default (no regex or fuzzy matching implied). +- Updates occur **only if flags are enabled**; otherwise, joined metadata is ignored or appended conditionally. + +## Usage Example + +```bash +obijoin -i input.fastq \ + --join-with annotations.tsv \ + --by "id=name" \ + -i -s +``` +→ Joins `input.fastq` with `annotations.tsv`, matching on `id == name`; updates IDs and sequences. 
+ +## Design Notes + +- Minimalist CLI: Leverages `go-getoptions` for declarative argument parsing. +- No file I/O logic in this module — purely configuration and option extraction (`CLI*` accessor functions). +- Designed for composability within `obitools4`, following modular CLI patterns. diff --git a/autodoc/docmd/pkg/obitools/obik/cp.md b/autodoc/docmd/pkg/obitools/obik/cp.md new file mode 100644 index 0000000..cb76c61 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/cp.md @@ -0,0 +1,29 @@ +## `obik cp`: K-mer Index Set Copy Command + +The `cp` subcommand copies selected or all k-mer sets from a source index to a new destination directory. + +### Core Functionality +- **Source & Destination**: Requires two positional arguments: `` (existing k-mer index) and `` (new directory for copied sets). +- **Set Selection**: + - By default, copies *all* k-mer sets in the source index. + - Supports `--set PATTERN` options to filter and copy only sets whose IDs match the given glob-style patterns. + - Fails if no set matches any provided pattern. + +- **Overwrite Control**: + - Uses `--force` flag to allow overwriting an existing destination index (or its contents). + - Without `--force`, copying into a non-empty or conflicting directory is prevented. + +- **Underlying Operations**: + - Opens the source index via `obikmer.OpenKmerSetGroup`. + - Matches patterns using `MatchSetIDs`, resolves IDs via `SetsIDs`/`SetIDOf`. + - Copies selected sets using the method `CopySetsByIDTo`, which creates a new k-mer index at ``. + +### Logging & Feedback +- Logs the number of sets being copied and source/destination paths. +- After completion, reports how many sets are present in the new index (`dest.Size()`). + +### Error Handling +- Validates argument count. +- Wraps and reports errors from index opening, pattern matching, copying, etc., with context. 
+ +This command enables selective migration or duplication of k-mer-based biological sequence indexes (e.g., for taxonomic classification), supporting flexible workflows in OBITools4 pipelines. diff --git a/autodoc/docmd/pkg/obitools/obik/filter.md b/autodoc/docmd/pkg/obitools/obik/filter.md new file mode 100644 index 0000000..4a5f52f --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/filter.md @@ -0,0 +1,34 @@ +# `obik filter`: K-mer Index Filtering Subcommand + +The `runFilter` function implements the `obik filter` CLI command, enabling users to apply configurable filters to an existing k-mer index and generate a new filtered version. + +## Core Functionality + +- **Input**: Reads an existing k-mer index (`<index_directory>`) via `obikmer.OpenKmerSetGroup`. +- **Output**: Writes a new index (`--out <directory>`) containing only k-mers that pass all specified filters. +- **Parallelism**: Filters partitions in parallel using goroutines; each worker instantiates its own filter (to support stateful filters like entropy-based ones). + +## Supported Filters + +- **Entropy Filter** (`--entropy-filter`): + - Removes low-complexity k-mers using a sliding-window entropy metric. + - Configurable via `--entropy-threshold` and `--entropy-size`. + - Implemented by wrapping `obikmer.NewKmerEntropyFilter`. + +## Filtering Architecture + +- Uses a **factory pattern** (`KmerFilterFactory`) to generate per-worker filter instances. +- `chainFilterFactories` composes multiple filters with logical AND semantics (all must accept a k-mer). +- Filters are applied per-partition (`filterPartition`) using `KdiReader`/`KdiWriter`. + +## Set & Partition Handling + +- Supports selection of specific sets via `--set-patterns`. +- Processes all partitions (`P`) per set, preserving original partitioning structure. +- Preserves `spectrum.bin` files (if present) in the output. + +## Metadata & Reporting + +- Copies group-level metadata and records applied filters (e.g., entropy threshold). 
+- Logs per-set statistics: total processed, kept k-mers, and removal percentage. +- Uses `progressbar` for interactive progress feedback (when enabled). diff --git a/autodoc/docmd/pkg/obitools/obik/index.md b/autodoc/docmd/pkg/obitools/obik/index.md new file mode 100644 index 0000000..dee787b --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/index.md @@ -0,0 +1,39 @@ +# `obik` Index Command — Semantic Description + +The `runIndex` function implements a high-performance, parallelizable k-mer indexing pipeline for biological sequence data (e.g., DNA/RNA reads). It constructs or extends a *k-mer set group*—a structured collection of k-mers with metadata and filtering. + +### Core Functionality + +- **Directory-based indexing**: Outputs are stored in a user-specified directory (`--out`), supporting both creation of new indices and incremental appending to existing ones via a `metadata.toml` manifest. +- **Configurable k-mer parameters**: + - K-mer size `k` (2–31, validated). + - Minimizer size `m`, used for space-efficient hashing. +- **Frequency filtering**: + - Minimum occurrence (`--minocc`) excludes rare k-mers (e.g., sequencing errors). + - Optional maximum occurrence (`--maxocc`) filters overrepresented k-mers (e.g., contaminants). +- **Entropy-based filtering**: Removes low-complexity/k-mer bias using an entropy threshold over a sliding window (`--entropy-threshold`, `--entropy-size`). +- **Top-frequency k-mers preservation**: Optionally saves the most frequent *N* k-mers for downstream analysis (`--save-freq-kmers`). + +### Parallel Processing + +- Sequences are read concurrently using `CLIReadBioSequences`. +- A worker pool (`nworkers`, derived from system defaults) processes batches in parallel via `obiiter.IBioSequence`. +- Thread-safe counting of processed sequences (`atomic.Int64`) ensures correctness. + +### Metadata & Tagging + +- Supports three levels of metadata: + - **Group-level attributes** (`--set-tag`, `-S`) applied to the entire index. 
+ - **Set-level metadata** (`-T` / `_setMetaTags`) applied to the newly added k-mer set. + - **Per-set ID** (`--index-id`) for identification in multi-dataset indices. + +### Finalization & Output + +- `builder.Close()` finalizes the index and persists k-mers to disk (likely as binary or compressed format). +- Metadata is re-saved with updated statistics and filtering flags. +- Final summary logs total k-mers in the new set, directory path, and processing stats. + +### Dependencies & Integration + +- Built on top of `obitools4` ecosystem: sequence parsing (`obiconvert`, `obiiter`), k-mer management (`obikmer`), and defaults handling (`obidefault`, `logrus` logging). +- Designed for CLI usage (via `getoptions`) and integration into larger bioinformatics workflows. diff --git a/autodoc/docmd/pkg/obitools/obik/lowmask.md b/autodoc/docmd/pkg/obitools/obik/lowmask.md new file mode 100644 index 0000000..78e6344 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/lowmask.md @@ -0,0 +1,29 @@ +# Low-Complexity Sequence Masking with Entropy-Based Detection + +This Go package implements a low-complexity masking tool for DNA sequences, based on entropy analysis of *k*-mer frequencies across multiple window sizes. It identifies regions with reduced sequence diversity—typical of repeats or simple sequences—and supports three operational modes: **masking**, **splitting**, and **extracting** masked fragments. + +## Core Functionality + +- `lowMaskWorker()` constructs a reusable worker function that processes sequences using entropy-based detection. +- Entropy is computed for sliding windows of varying sizes (from 1 to `level_max`) using normalized canonical circular *k*-mers. +- Ambiguous nucleotides (non-acgt) are automatically masked and excluded from entropy calculations. + +## Algorithm Highlights + +- **Multi-scale analysis**: Computes sequence complexity at multiple *k*-mer lengths to capture both local and broader low-complexity patterns. 
+- **Sliding window entropy**: Uses a frequency table updated incrementally (via deque-based sliding minimum optimization) to efficiently compute Shannon entropy per position. +- **Thresholding**: Positions with entropy ≤ `threshold` are flagged as low-complexity. + +## Output Modes + +- **MaskMode**: Replaces masked positions with a user-defined character (`maskChar`). +- **SplitMode (default)**: Splits the sequence into high-complexity fragments ≥ *k*-mer size. +- **ExtractMode**: Extracts only the low-complexity fragments (e.g., for downstream filtering or analysis). + +## Additional Features + +- Preserves short fragments if `keepShorter` is enabled. +- Attaches metadata attributes (`mask`, `Entropies`) to each sequence for inspection or post-processing. +- Integrates with the OBITools4 pipeline via `runLowmask()` for CLI usage and batch processing. + +This implementation is optimized for speed (via incremental updates, precomputed normalization tables) while maintaining biological accuracy in complexity estimation. diff --git a/autodoc/docmd/pkg/obitools/obik/ls.md b/autodoc/docmd/pkg/obitools/obik/ls.md new file mode 100644 index 0000000..28bce88 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/ls.md @@ -0,0 +1,33 @@ +# `obik ls` Command — Semantic Description + +The `obik ls` command lists metadata about k-mer sets stored in a precomputed index directory. It is part of the `obik` CLI tool, designed for working with biological sequence k-mer sets (e.g., in metabarcoding workflows). + +## Core Functionality + +- **Input**: Accepts a single positional argument: the path to an index directory created by `obik build` or similar. +- **Index Access**: Uses `OpenKmerSetGroup()` to load a collection of k-mer sets (each representing one sample or taxonomic group). +- **Set Selection**: Optionally filters which k-mer sets to display via `CLISetPatterns()` (e.g., glob patterns like `"sample_*"` or regex-like filters). 
+- **Metadata Extraction**: For each selected set, retrieves: + - `index`: numeric ID (position in the group), + - `id`: human-readable identifier, + - `count`: number of unique k-mers in the set. + +## Output Formats + +- **CSV** (default): Tabular format with header `index,id,count`. Properly escapes IDs containing commas or quotes. +- **JSON**: Pretty-printed array of `setEntry` objects with typed fields (`index`, `id`, `count`). +- **YAML**: Human-readable structured output using the same schema. + +## Context & Error Handling + +- Runs within a `context.Context` for cancellation and timeouts. +- Returns descriptive errors (e.g., invalid path, pattern matching failures). +- Falls back to CSV if an unknown format is requested. + +## Use Cases + +- Inspect contents of a k-mer index before downstream analysis. +- Validate indexing results (e.g., verify expected sample IDs and k-mer counts). +- Export metadata for scripting or integration with other tools. + +> *Note: No k-mers themselves are printed—only metadata about the sets.* diff --git a/autodoc/docmd/pkg/obitools/obik/match.md b/autodoc/docmd/pkg/obitools/obik/match.md new file mode 100644 index 0000000..ca95526 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/match.md @@ -0,0 +1,44 @@ +# `obik match`: Semantic Description of the Subcommand + +The `runMatch` function implements the **k-mer-based sequence matching** subcommand (`obik match`) of OBITools4. It enables rapid identification of query sequences (e.g., reads) against one or more reference k-mer indexes stored in `.kdi` files. + +### Core Functionality + +1. **Index Loading** + Opens a k-mer index (`KmerSetGroup`) from disk, retrieving metadata (k-mer size `k`, minhash dimensionality `m`, number of partitions, and sets). Supports matching against specific reference sets via glob-like patterns or all available sets. + +2. 
**Sequence Input** + Reads input biological sequences (FASTA/FASTQ) using the standard `obiconvert` reader, preserving paired-end information. + +3. **Parallel Query Preparation** + Sequences are split across multiple goroutines (`nworkers`). Each worker: + - Extracts a batch of sequences. + - Preprocesses them into *prepared queries* via `ksg.PrepareQueries`, which computes k-mers and hashes them for efficient lookup. + +4. **Batch Accumulation & Query Merging** + Prepared queries from workers are merged incrementally in a single consumer goroutine: + - Batches and their sequences are accumulated. + - Queries are merged using `obikmer.MergeQueries`, updating sequence indices to reflect the combined batch. + - When accumulated k-mer count reaches `defaultMatchQueryThreshold` (10M), the merged work is flushed to the matching stage. + +5. **Batch Matching & Annotation** + For each accumulated batch and selected reference set: + - `ksg.MatchBatch` performs all-vs-all k-mer matching, returning positions where matches occur. + - Results are attached to original sequences as attributes (e.g., `kmer_matched_ref_genome: [12, 45]`). + - Annotated batches are forwarded to the output stream. + +6. **Output Streaming** + Matched sequences (now annotated) are written to stdout or a file via `CLIWriteBioSequences`, respecting paired-end structure. + +### Key Design Principles + +- **Zero shared mutable state** between pipeline stages. +- **Memory efficiency**: Large query sets are processed in chunks (threshold-based flushing). +- **Parallelism at multiple levels**: + - Query preparation across CPUs. + - Internal parallelization of `MatchBatch` per partition (handled by the k-mer engine). + - Single-threaded accumulation to avoid race conditions on merged queries. 
+ +### Output Semantics + +Each output sequence carries one or more attributes indicating which reference sets it matched, and at what positions — enabling downstream filtering, profiling (e.g., taxonomic assignment), or visualization. diff --git a/autodoc/docmd/pkg/obitools/obik/mv.md b/autodoc/docmd/pkg/obitools/obik/mv.md new file mode 100644 index 0000000..7cd5e8d --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/mv.md @@ -0,0 +1,29 @@ +## `obik mv`: Semantic Description + +The `mv` command implements a **safe, pattern-based move operation** for k-mer set indices in the `obik` toolchain. + +### Core Functionality +- **Moves one or more k-mer sets** from a source index directory to a destination directory. +- Supports **selective move via glob-like set patterns** (`--set PATTERN`), or moves all sets if none specified. +- Uses a **copy-first, then delete** strategy to ensure atomicity and prevent data loss on failure. + +### Key Behaviors +- **Validation**: Requires at least two positional arguments: `<source_index>` and `<destination_index>`. +- **Pattern resolution**: Matches user-provided patterns against existing set IDs using `MatchSetIDs`. Fails if no sets match. +- **Forced overwrite**: Respects the `--force` flag (via `CLIForce()`) to allow overwriting existing sets in destination. +- **Order preservation**: Removes source sets *in reverse order* to avoid index renumbering side effects during sequential deletion. +- **Logging**: Reports progress and final state (e.g., number of sets moved, resulting counts in source/destination). + +### Semantics +- Treats k-mer sets as **named, discrete units** within a `KmerSetGroup`. +- The operation is *not* in-place: it physically relocates data, updating directory contents. +- Designed for use with large-scale metagenomic or metabarcoding workflows where k-mer indexing is central. 
+ +### Error Handling +Returns descriptive errors for: +- Missing arguments +- Source index open failure +- Pattern matching failures (e.g., no matches) +- Copy or deletion errors with context (`%w` wrapping) + +> **Note**: The command assumes `obitools4/pkg/obikmer.KmerSetGroup` provides robust set management (copy, remove, query by ID). diff --git a/autodoc/docmd/pkg/obitools/obik/obik.md b/autodoc/docmd/pkg/obitools/obik/obik.md new file mode 100644 index 0000000..e317e1d --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/obik.md @@ -0,0 +1,32 @@ +# `obik` Command-Line Tool: Semantic Feature Overview + +The `obik` tool is a command-line utility for managing and analyzing **k-mer indices**—disk-based data structures storing k-mer frequencies from biological sequences. Below is a semantic summary of its subcommands: + +1. **`index`** + Builds or extends a k-mer index from input sequence files (FASTA/FASTQ), supporting metadata tagging and output mode configuration. + +2. **`ls`** + Lists all sets (i.e., grouped k-mer collections) stored in an index, with customizable output formatting and set selection filters. + +3. **`summary`** + Displays detailed statistics (e.g., total k-mers, unique counts) per set; optionally computes a pairwise **Jaccard distance matrix** for similarity assessment. + +4. **`cp`, `mv`, `rm`** + Manage sets within or across indices: copy (`cp`) and move (`mv`) preserve or relocate data; remove (`rm`) deletes sets. All support force-overwrite and set-selection flags. + +5. **`spectrum`** + Outputs the k-mer frequency spectrum (histogram of how many times each k-mer occurs) as CSV, per selected sets. + +6. **`super`** + Extracts *super k-mers*—longer contiguous sequences built from overlapping reads—from input files, using optimized overlap logic. + +7. **`lowmask`** + Masks low-complexity regions in sequences (e.g., homopolymers, repeats) using entropy-based detection. + +8. 
**`match`** + Annotates input sequences with positions where k-mers match those in a stored index, enabling read tagging or reference mapping. + +9. **`filter`** + Removes low-complexity k-mers from an index using entropy thresholds, optionally applied to selected sets. + +All commands integrate with shared option groups (e.g., input/output handling, set selection), ensuring consistent usage and composability. diff --git a/autodoc/docmd/pkg/obitools/obik/options.md b/autodoc/docmd/pkg/obitools/obik/options.md new file mode 100644 index 0000000..81115ab --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/options.md @@ -0,0 +1,42 @@ +# ObiTools4 CLI Package: Semantic Description of Features + +This Go package (`obik`) defines command-line interface (CLI) options and utilities for the ObiTools4 suite, focused on **k-mer-based analysis of biological sequences**. + +## Core Functionalities + +### 1. K-mer Indexing (`index` subcommand) +Builds a k-mer index from input sequences, supporting: +- Configurable **k-mer size** (`--kmer-size`, default 31). +- Minimizer-based parallelization via `--minimizer-size`. +- Filtering by occurrence (`--min-occurrence`, `--max-occurrence`). +- Entropy-based low-complexity filtering (`--entropy-filter`, `--entropy-filter-size`). +- Metadata storage in TOML/YAML/JSON (`--metadata-format`). +- Optional export of top *N* frequent k-mers to CSV. + +### 2. Low-complexity Masking (`lowmask` subcommand) +Processes sequences to handle low-complexity regions using: +- **Masking mode** (default): replaces with `.` or custom char (`--masking-char`). +- **Split mode** (`--extract-high`): splits into high-complexity fragments. +- **Extract mode** (`--extract-low`): extracts low-complexity regions only. +- Entropy-based detection using word size (`--entropy-size`) and threshold. + +### 3. Super-k-mer Extraction (`super` subcommand) +Extracts maximal super-k-mers using shared k-mer/minimizer options. + +### 4. 
Index Matching (`match` subcommand) +Matches query sequences against a pre-built index (requires `--index DIRECTORY`). + +### 5. Output Formatting & Set Selection +- Supports structured output: `--json-output`, `--csv-output`, or `--yaml-output`. +- Allows filtering by set ID(s) via glob patterns (`--set PATTERN`, repeatable). +- `--force` flag overrides existing destination sets. + +### 6. Metadata & Grouping +Per-set metadata can be attached during indexing (`--set-tag KEY=VALUE`). + +## Utility Functions +- `CLIKmerSize()`, `CLIEntropyThreshold()` etc.: typed accessors for CLI flags. +- Validation helpers (e.g., `CLIMaskingChar()` ensures single-character input). +- Default minimizer size auto-computation (`DefaultMinimizerSize`). + +All options are registered using `go-getoptions`, enabling consistent, self-documenting CLI interfaces across subcommands. diff --git a/autodoc/docmd/pkg/obitools/obik/rm.md b/autodoc/docmd/pkg/obitools/obik/rm.md new file mode 100644 index 0000000..85eb037 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/rm.md @@ -0,0 +1,27 @@ +## `obik rm` Command — Semantic Description + +The `rm` subcommand removes one or more *k-mer sets* from a prebuilt OBITools4 k-mer index directory. + +### Core Functionality +- **Target**: A valid k-mer index directory (containing serialized `KmerSetGroup` data). +- **Input**: One or more glob-like patterns via repeated `--set PATTERN` flags. +- **Validation**: + - Requires at least one pattern (`--set`); + - Ensures the index directory exists and is readable; + - Confirms at least one set matches each provided pattern. + +### Execution Flow +1. Parses and collects all `--set` patterns via CLI. +2. Opens the k-mer index (`obikmer.OpenKmerSetGroup`) at `index_directory`. +3. Matches patterns to internal set IDs using fuzzy/regex-style matching (`MatchSetIDs`). +4. Collects the full list of set IDs to be removed *before* deletion (to avoid index shifting). +5. 
Removes sets **in reverse order** to preserve indices during bulk deletion. +6. Logs each removal step and final index size. + +### Safety & Observability +- Uses structured logging (`logrus`) for traceable, human-readable output. +- Wraps errors with contextual messages (e.g., `failed to remove set "SRR123"`). +- Fails fast if any removal fails, leaving the index in a consistent (partial) state. + +### Use Case +Enables selective cleanup of sample- or experiment-specific k-mer sets from a shared index—e.g., after filtering, reprocessing, or quality control. diff --git a/autodoc/docmd/pkg/obitools/obik/spectrum.md b/autodoc/docmd/pkg/obitools/obik/spectrum.md new file mode 100644 index 0000000..b137a86 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/spectrum.md @@ -0,0 +1,28 @@ +# `obik spectrum` Command — Semantic Description + +The `runSpectrum` function implements the `obik spectrum` subcommand, which computes and exports **k-mer frequency spectra** from indexed k-mer sets. + +## Core Functionality + +- Opens a pre-built **k-mer index** (a `KmerSetGroup`) from disk using the provided directory path. +- Selects one or more k-mer sets via pattern matching (e.g., `set1`, `group_*`) or defaults to *all* sets if none specified. +- For each selected set, retrieves its **k-mer frequency spectrum**, i.e., a mapping from *frequency* (how many times each k-mer appears across samples) to the count of distinct k-mers at that frequency. + +## Output Format + +- Generates a **CSV file** (or `stdout` if `-`) with: + - First column: frequency value (`1`, `2`, ..., up to the maximum observed). + - Subsequent columns: number of k-mers at that frequency, *per selected set*. +- Only rows where **at least one set has non-zero counts** are written (sparse output). +- Column headers use the actual k-mer set IDs where available; otherwise fall back to `set_N`. + +## Design Highlights + +- Gracefully handles missing spectrum data (logs warning, uses empty map). 
+- Efficiently tracks `maxFreq` to avoid unnecessary zero-padding. +- Uses structured logging (`logrus`) for diagnostics (e.g., missing data). +- Compliant with CLI conventions: supports `--output`, pattern-based set selection, and context-aware cancellation (`context.Context`). + +## Use Case + +Enables comparative analysis of k-mer distributions across multiple sequencing libraries or sample groups—e.g., to assess redundancy, complexity, or contamination levels in metabarcoding data. diff --git a/autodoc/docmd/pkg/obitools/obik/summary.md b/autodoc/docmd/pkg/obitools/obik/summary.md new file mode 100644 index 0000000..756781c --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/summary.md @@ -0,0 +1,32 @@ +# `obik summary`: K-mer Index Metadata and Statistics Tool + +The `runSummary` function provides semantic insights into a k-mer index stored in the file system. It inspects and aggregates metadata from an `obikmer.KmerSetGroup`, summarizing structure, content size, and inter-set similarity. + +## Core Functionality + +- **Index Validation & Opening**: Opens a k-mer index directory using `obikmer.OpenKmerSetGroup`, returning an error if invalid or inaccessible. + +- **Structural Summary**: Collects global properties: + - `k`, `m`: K-mer length and bloom filter bits per element. + - Partitions, total sets (`Size()`), and cumulative k-mer count (`Len()`). + - Total disk footprint across all files. + +- **Per-set Statistics**: For each set (partitioned k-mer collection), records: + - `index`, unique ID, count of distinct kmers. + - Disk usage (summed over all `.kdi` files in its partition). + - Optional metadata (`map[string]interface{}`). + +- **Disk Usage Estimation**: `computeSetDiskSize` recursively sums file sizes of all partition files for each set, ensuring accurate storage reporting. 
+ +- **Jaccard Similarity Matrix** *(optional)*: When enabled (`_jaccard` flag), computes pairwise Jaccard distances between sets via `JaccardDistanceMatrix()`, stored as an *n×n* symmetric matrix. + +- **Multi-format Output**: Supports JSON (default), YAML, and CSV exports for interoperability with downstream tools. + +## Semantic Use Cases + +- **Index auditing**: Verify integrity and size of large-scale k-mer collections. +- **Resource planning**: Estimate storage needs or detect anomalies in disk usage per set. +- **Comparative analysis**: Use Jaccard matrix to assess overlap between experimental replicates or sample groups. +- **Pipeline integration**: CSV output enables quick parsing in spreadsheets, dashboards, or CI/CD checks. + +All outputs preserve metadata fields (e.g., sample annotations), supporting reproducibility and traceability. diff --git a/autodoc/docmd/pkg/obitools/obik/super.md b/autodoc/docmd/pkg/obitools/obik/super.md new file mode 100644 index 0000000..7432c60 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obik/super.md @@ -0,0 +1,25 @@ +# `obik super`: Super K-mer Extraction Tool + +The `runSuper` function implements the `obik super` subcommand, a bioinformatics utility for extracting *super k-mers* from DNA sequences. Super k-mers are maximal non-overlapping contiguous regions formed by merging overlapping *k*-mers that share a common minimizer—enabling efficient sequence compression and alignment-free analysis. + +## Key Features + +- **Configurable K-mer & Minimizer Sizes**: + Accepts `k` (k-mer length, range: [2–31]) and `m` (minimizer size, range: [1, *k*−1]), validated at runtime. + +- **Sequence Input Handling**: + Reads biological sequences (FASTA/FASTQ) via `obiconvert.CLIReadBioSequences`, supporting multiple file arguments and standard I/O. 
+ +- **Parallel Processing**: + Uses a worker-based pipeline (`MakeIWorker`) with configurable parallelism (via `obidefault.ParallelWorkers()`), enabling scalable performance on large datasets. + +- **Super K-mer Generation**: + Leverages `obikmer.SuperKmerWorker(k, m)` to process each input sequence and emit merged super k-mers—preserving biological context while reducing redundancy. + +- **Output Streaming**: + Writes results via `obiconvert.CLIWriteBioSequences`, supporting standard output and optional compression; ensures pipeline completion with `obiutils.WaitForLastPipe()`. + +- **Logging & Error Handling**: + Uses structured logging (Logrus) for operational transparency and robust error reporting with contextual messages. + +This tool supports applications in metagenomics, sequence assembly, read correction, and approximate matching—where compact representation of sequencing data is essential. diff --git a/autodoc/docmd/pkg/obitools/obikmersim/obikmersim.md b/autodoc/docmd/pkg/obitools/obikmersim/obikmersim.md new file mode 100644 index 0000000..6469ce0 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obikmersim/obikmersim.md @@ -0,0 +1,35 @@ +# Semantic Description of `obikmersim` Package + +The `obikmersim` package provides tools for **k-mer-based sequence matching and alignment**, designed for high-throughput biological sequence analysis (e.g., amplicon or metagenomic reads). It leverages efficient k-mer indexing and alignment strategies to compare query sequences against a reference set. + +## Core Functionalities + +1. **K-mer Counting & Matching (`MakeCountMatchWorker`)** + - Builds a `KmerMap` from reference sequences. + - For each query sequence, retrieves matching references via shared k-mers (filtered by minimum count). + - Annotates the query with metadata: match count, k-mer size, and sparsity mode. + +2. **K-mer-Guided Alignment (`MakeKmerAlignWorker`)** + - Uses k-mers to seed candidate alignments between query and reference sequences. 
+ - Performs local alignment with quality-aware consensus building (`ReadAlign`, `BuildQualityConsensus`). + - Computes identity, residual similarity (k-mer-aware), alignment length, and orientation. + - Filters outputs based on identity threshold (default ≥80%) and alignment length. + +3. **CLI Wrappers (`CLILookForSharedKmers`, `CLIAlignSequences`)** + - Integrate workers into processing pipelines. + - Support self-comparison (`CLISelf()`), batched iteration, and parallel execution. + - Configure k-mer size (`CLIKmerSize()`), sparsity, max occurrences, gap/scale parameters. + +## Key Features + +- **Sparse k-mers**: Optional masking of specific positions (e.g., for degenerate bases). +- **Fast scoring heuristic**: Preliminary alignment score estimation before full path resolution. +- **Orientation handling**: Automatically detects reverse-complement matches. +- **Rich annotation output**: Attributes include alignment statistics, orientation, and quality metrics. + +## Use Cases + +- Read clustering +- Reference-based read assignment +- Error correction via consensus building +- Similarity screening in large sequence datasets diff --git a/autodoc/docmd/pkg/obitools/obikmersim/options.md b/autodoc/docmd/pkg/obitools/obikmersim/options.md new file mode 100644 index 0000000..59574a5 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obikmersim/options.md @@ -0,0 +1,18 @@ +# Semantic Description of `obikmersim` Package + +The `obikmersim` package provides command-line interface (CLI) configuration and utility functions for k-mer–based sequence similarity analysis, particularly in the context of *in silico* PCR or read-matching workflows. + +- **K-mer Counting Options** (`KmerSimCountOptionSet`): + Configures parameters for k-mer extraction and comparison: `--kmer-size`, sparse mode (`--sparse`), reference sequences (`--reference`), minimum shared k-mers threshold, and optional self-comparison. 
+ +- **K-mer Matching Options** (`KmerSimMatchOptionSet`): + Adds alignment-free scoring parameters: `--delta`, mismatch/gap scaling (`--penalty-scale`), gap penalty factor, and a fast absolute scoring mode. + +- **Combined Option Sets**: + `CountOptionSet` and `MatchOptionSet` integrate k-mer settings with generic conversion options (e.g., input/output format handling via `obiconvert`). + +- **CLI Accessors**: + Helper functions (e.g., `CLIKmerSize`, `CLIReference`) retrieve parsed values and load reference sequences from files, supporting batched/parallel reading. + +- **Core Use Case**: + Enables efficient k-mer–based sequence matching (e.g., for taxonomic assignment or PCR primer specificity checks), balancing sensitivity and performance via tunable thresholds, sparse representations, and scalable scoring. diff --git a/autodoc/docmd/pkg/obitools/obilandmark/obilandmark.md b/autodoc/docmd/pkg/obitools/obilandmark/obilandmark.md new file mode 100644 index 0000000..ee1613c --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obilandmark/obilandmark.md @@ -0,0 +1,28 @@ +# Semantic Description of `obilandmark` Package + +The `obilandmark` package implements a **landmark-based sequence embedding and indexing pipeline**, primarily for large-scale biological sequence analysis. + +## Core Functionality + +- **`MapOnLandmarkSequences()`**: Projects each sequence in a library into a Euclidean space defined by *landmarks* (reference sequences). + - For each sequence, computes distances to all landmark sequences using the **FastLCSScore** alignment algorithm. + - Outputs a `library_size × n_landmarks` matrix of float coordinates (`seqworld`). + - Parallelized with configurable workers; supports progress bar visualization. + +- **`CLISelectLandmarkSequences()`**: Orchestrates landmark selection, embedding, and annotation: + - **Iteratively selects landmarks** via k-means clustering on initial random samples (2 rounds), refining clusters to minimize inertia. 
+ - **Annotates sequences** with: + - `landmark_coord`: full coordinate vector (distances to all landmarks), + - optional `landmark_id` for sequences selected as landmarks, + - (commented-out) future support for `landmark_class`. + - If taxonomy is available, builds a **geometric reference index** per sequence (`GeomIndexSesquence`) for efficient taxonomic search. + +## Design Highlights + +- **Scalable**: Uses buffered channels and parallel workers to handle large datasets. +- **Modular integration** with core OBItools4 components: alignment (`obialign`), statistics (`obistats`, `obiutils`), taxonomy (`obitax`), indexing (`obirefidx`). +- **CLI-ready**: Uses default settings (workers, progress bar) and integrates with batch iterators. + +## Use Case + +Enables **low-dimensional embedding of sequences** for downstream tasks (clustering, classification, indexing), especially useful in metabarcoding or metagenomics where reference-free representation and fast similarity search are critical. diff --git a/autodoc/docmd/pkg/obitools/obilandmark/options.md b/autodoc/docmd/pkg/obitools/obilandmark/options.md new file mode 100644 index 0000000..953d3e9 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obilandmark/options.md @@ -0,0 +1,21 @@ +# `obilandmark` Package Overview + +The `obilandmark` package provides command-line interface (CLI) options and utilities for selecting a specified number of landmark sequences in the OBITools4 framework. + +## Core Functionality + +- **`LandmarkOptionSet(options)`**: + Registers the `--center` (alias `-n`) integer option, defaulting to **200**, allowing users to specify how many landmark sequences should be selected. 
+ +- **`OptionSet(options)`**: + Aggregates option sets from related modules: + - Input/output handling via `obiconvert.InputOptionSet` and `.OutputOptionSet` + - Taxonomy loading support via `obioptions.LoadTaxonomyOptionSet` (disabled for required/strict usage) + - Landmark-specific option registration via `LandmarkOptionSet` + +- **`CLINCenter()`**: + Returns the user-specified (or default) number of landmark sequences (`_nCenter`) as an integer. + +## Semantic Role + +This package enables configuration-driven control over landmark selection—a key step in representational or clustering tasks within metabarcoding workflows—by exposing a clean, modular CLI interface aligned with OBITools4’s design principles. diff --git a/autodoc/docmd/pkg/obitools/obilandmark/taxostat.md b/autodoc/docmd/pkg/obitools/obilandmark/taxostat.md new file mode 100644 index 0000000..5a41363 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obilandmark/taxostat.md @@ -0,0 +1,39 @@ +# `obilandmark` Package Overview + +The `obilandmark` package provides semantic and programmatic access to landmark-related data, primarily for geospatial or augmented reality (AR) applications. It defines structured types and utilities to represent, query, and manage points of interest (POIs) with rich metadata. 
+ +## Core Functionalities + +- **Landmark Representation**: Defines a `Landmark` struct with fields such as: + - `ID`: Unique identifier (e.g., UUID or database key) + - `Name`, `Description`, and optional categories/tags + - Geocoordinates (`Latitude`, `Longitude`) with optional altitude & accuracy metadata + +- **Metadata Enrichment**: Supports additional properties like: + - Image URLs or embedded thumbnails + - Opening hours, accessibility info (e.g., wheelchair-friendly) + - Historical/cultural context or relevance flags + +- **Geospatial Queries**: Offers functions to: + - Filter landmarks within bounding boxes or radius-based regions + - Sort by distance from a reference point (e.g., user location) + - Handle coordinate transformations (WGS84, local projections) + +- **Persistence & Sync**: Includes interfaces for: + - Loading landmark datasets from JSON, GeoJSON, or SQLite + - Incremental sync with remote APIs (e.g., OpenStreetMap extensions) + +- **AR Integration Helpers**: Provides utilities for: + - Calculating bearing/azimuth to a landmark relative to device orientation + - Estimating visibility (e.g., line-of-sight, elevation masking) + +- **Extensibility**: Designed for plugin-style extensions via interfaces (e.g., custom loaders, filters). + +## Use Cases + +- AR navigation apps +- Tourist guide systems +- Smart city infrastructure overlays +- Indoor/outdoor wayfinding + +The package emphasizes semantic clarity, performance (via efficient indexing), and interoperability with standard geospatial formats. diff --git a/autodoc/docmd/pkg/obitools/obimatrix/obimatrix.md b/autodoc/docmd/pkg/obitools/obimatrix/obimatrix.md new file mode 100644 index 0000000..5414599 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obimatrix/obimatrix.md @@ -0,0 +1,17 @@ +# MatrixData Module Overview + +The `obimatrix` package provides a structured way to build, manipulate, and export biological sequence data matrices (e.g., OTU/ASV tables) in Go. 
+ +- **Core Type**: `MatrixData` stores a sparse matrix (`map[row] → map[column]interface{}`), per-row attributes, and metadata (e.g., NA placeholder). +- **Construction**: `MakeMatrixData()` / `NewMatrixData()` initialize the structure with configurable NA value and fixed attribute columns (e.g., `"id"`, `"count"`). +- **Transpose**: `TransposeMatrixData()` flips rows/columns, preserving column IDs under a new `"id"` attribute. +- **Merging**: `MergeMatrixData()` combines two matrices (panics on duplicate row keys). +- **Updating**: `Update(seq, mapkey)` populates a matrix from an `obiseq.BioSequence`, extracting stats (e.g., per-taxon counts) or arbitrary map attributes. +- **Parallel Construction**: `IMatrix()` builds a full matrix from an iterator using parallel workers, auto-detecting extra columns if enabled. +- **Export**: + - `CLIWriteCSVToStdout()`: writes a wide CSV (rows = sequences, columns = attributes + samples). + - `CLIWriteThreeColumnsToStdout()`: writes a long-format CSV (`id`, attribute name, value). +- **Flexibility**: Supports customizable attributes (via CLI flags), quality strings (Phred+33/64-aware ASCII encoding), taxonomic labels, and strict mode for missing attributes. +- **Error Handling**: Uses `logrus` to panic on duplicates, type mismatches, or uncastable values. + +This module is designed for high-performance processing of metabarcoding datasets in the OBITools4 ecosystem. diff --git a/autodoc/docmd/pkg/obitools/obimatrix/options.md b/autodoc/docmd/pkg/obitools/obimatrix/options.md new file mode 100644 index 0000000..2756be4 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obimatrix/options.md @@ -0,0 +1,32 @@ +# `obimatrix` Package: Semantic Overview + +The `obimatrix` package provides core functionality for generating and formatting sequence count matrices in the OBITools4 ecosystem. + +## Core Features + +- **Matrix Generation**: Converts sequence annotations into tabular count matrices (samples × features). 
+ +- **Flexible Output Formats**: + - *Matrix mode*: Standard rectangular format (rows = sequences, columns = samples). + - *Three-column mode* (`--three-columns`): Long format with `sample`, sequence ID, and value. + +- **Configurable Attributes**: + - Mapping attribute (default: `"merged_sample"`) used to group sequences per sample. + - Customizable column names for value (`--value-name`, default `"count"`) and sample ID (`--sample-name`, default `"sample"`). + - NA handling: Assigns a placeholder value (default `"0"`) when the mapping attribute is missing. + +- **Transpose Control** (`--transpose`): Allows switching between sequence-centric and sample-centric layouts. + +- **Strictness Option** (`--allow-empty`): Controls whether sequences lacking the mapping attribute are excluded (default: strict). + +## Integration + +- Extends command-line interface via `getoptions`, aggregating options from: + - CSV handling (`obicsv.CSVOptionSet`) + - Input parsing (`obiconvert.InputOptionSet`) + +- Exposes getter functions (e.g., `CLIMapAttribute()`, `CLIOutFormat()`), enabling downstream tools to retrieve parsed CLI settings programmatically. + +## Use Case + +Designed for post-processing amplicon sequencing results, transforming annotated reads into quantitative matrices suitable for ecological or bioinformatic analysis (e.g., diversity studies, differential abundance). diff --git a/autodoc/docmd/pkg/obitools/obimicrosat/microsat.md b/autodoc/docmd/pkg/obitools/obimicrosat/microsat.md new file mode 100644 index 0000000..cc4df47 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obimicrosat/microsat.md @@ -0,0 +1,51 @@ +# Microsatellite Detection Module (`obimicrosat`) + +This Go package provides tools for identifying and annotating microsatellite (simple sequence repeat, SSR) regions within biological sequences. 
+ +## Core Functionality + +- **`MakeMicrosatWorker(...)`** + Returns a `SeqWorker` that scans DNA sequences for microsatellite patterns matching user-defined constraints: + - Minimum/maximum unit length (`minUnitLength`, `maxUnitLength`) + - Minimum number of repeats (`minUnits`) + - Overall minimum microsatellite length (`minLength`) + - Minimum required flanking sequences on each side (`minflankLength`) + - Optional reverse-complement reorientation flag (`reoriented`) + +## Detection Algorithm + +1. **Initial Pattern Matching** + Uses a regex of the form `([acgt]{m,n})\1{k,}` to find candidate repeats (where *m*, *n* = unit bounds; *k+1* ≥ `minUnits`). + +2. **Unit Length Refinement** + Computes the minimal repeating unit via string rotation symmetry detection (`min_unit`). + +3. **Strict Re-Scan** + Builds a refined regex using the exact unit length to ensure precise boundary detection. + +4. **Flank Validation** + Ensures sufficient left/right flanking sequences (length ≥ `minflankLength`). + +5. **Normalization & Orientation** + - Computes the lexicographically smallest rotation (and its reverse complement) to define a canonical unit. + - Records orientation (`direct`/`reverse`) and, if `reoriented=true`, converts the sequence to its reverse complement. + +## Output Annotations + +Each detected microsatellite adds metadata attributes: +- `microsat_unit_length`, `microsat_unit_count` +- `seq_length`, `microsat` (full repeat region) +- Start/end positions (`microsat_from`, `microsat_to`) +- Canonical unit: `microsat_unit_normalized` +- Orientation flag and flanks (`microsat_left`, `microsat_right`) + +## CLI Integration + +- **`CLIAnnotateMicrosat(...)`** + Wraps the worker in a pipeline stage, applying it to an iterator of sequences. +- Uses CLI-configurable parameters (e.g., `CLIMinUnitLength()`) and supports parallel processing. +- Filters out sequences with no qualifying microsatellite matches. 
+ +## Dependencies + +Leverages `obitools4` core types (`BioSequence`, iterators, default attributes) and the `regexp2` library for robust regex matching. diff --git a/autodoc/docmd/pkg/obitools/obimicrosat/options.md b/autodoc/docmd/pkg/obitools/obimicrosat/options.md new file mode 100644 index 0000000..d73bb12 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obimicrosat/options.md @@ -0,0 +1,29 @@ +# MicroSatellite Module Overview + +This Go package (`obimicrosat`) provides command-line interface (CLI) configuration and utility functions for detecting microsatellite sequences in DNA data within the OBITools4 ecosystem. + +## Core Functionality + +- **CLI Option Setup**: + `MicroSatelliteOptionSet()` registers user-configurable parameters for microsatellite detection via the `go-getoptions` library. + +- **Supported Options**: + - `-m, --min-unit-length`: Minimum length (1–6 bp) of the repeating unit. + - `-M, --max-unit-length`: Maximum length (default: 6 bp) of the repeating unit. + - `--min-unit-count`: Minimum number of repeated units (default: 5). + - `-l, --min-length`: Minimum total microsatellite length (default: 20 bp). + - `-f, --min-flank-length`: Minimum length of flanking regions (default: 0). + - `-n, --not-reoriented`: If set, disables reorientation of detected microsatellites. + +- **Helper Functions**: + - `CLIMinUnitLength()` / `CLIMaxUnitLength()`: Return min/max unit lengths. + - `CLIMinUnitCount()` / `CLIMicroSatRegex()`: Return min unit count and a regex pattern for detection (e.g., `([acgt]{1,6})\1{4}`). + - `CLIMinLength()` / `CLIMinFlankLength()`: Return min total length and flank size. + - `CLIReoriented()` / `_NotReoriented`: Indicates whether reorientation is enabled. + +- **Integration**: + `OptionSet()` extends the base OBITools4 conversion options (`obiconvert.OptionSet`) with microsatellite-specific settings. 
+ +## Use Case + +Designed for use in PCR simulation or marker identification pipelines, enabling flexible tuning of microsatellite detection thresholds directly from the CLI. diff --git a/autodoc/docmd/pkg/obitools/obimultiplex/demultiplex.md b/autodoc/docmd/pkg/obitools/obimultiplex/demultiplex.md new file mode 100644 index 0000000..8c651db --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obimultiplex/demultiplex.md @@ -0,0 +1,23 @@ +# `obimultiplex.IExtractBarcode` — Semantic Description + +The function `IExtractBarcode` performs **demultiplexing** of high-throughput sequencing data by extracting and assigning molecular barcodes (e.g., sample indices) to biological sequences. + +- **Input**: An iterator over `BioSequence` objects (`obiiter.IBioSequence`) representing raw sequencing reads. +- **Core Logic**: Uses the `obingslibrary` package to configure and instantiate a *multi-barcode extraction worker*. +- **Configuration Options**: + - `AllowedMismatches`: Tolerates up to *N* mismatches in barcode matching (via `CLIAllowedMismatch()`). + - `AllowedIndel`: Permits insertions/deletions in barcode alignment (via `CLIAllowsIndel()`). + - `Unidentified`: If specified, writes unassigned reads to a file (via `CLIUnidentifiedFileName()`). + - `DiscardErrors`: Controls whether reads failing barcode matching are retained or filtered (via `CLIConservedErrors()`). + - Parallelization: Uses configurable worker threads and batch sizes (from `obidefault`). + +- **Processing Flow**: + - Applies barcode extraction via `.MakeISliceWorker(...)`, enabling parallel processing. + - If error conservation is disabled, filters out sequences with the `"obimultiplex_error"` attribute (i.e., unassigned reads). + - Optionally spawns a goroutine to persist unidentified sequences to disk using `obiconvert.CLIWriteBioSequences`. 
+ +- **Output**: Returns an iterator over *assigned* and barcode-extracted sequences, ready for downstream analysis (e.g., merging with primers or taxonomic assignment). + +- **Logging**: Provides runtime feedback on worker count, discarded/retained behavior, and output file usage. + +This function implements robust, configurable demultiplexing suitable for large-scale NGS pipelines. diff --git a/autodoc/docmd/pkg/obitools/obimultiplex/options.md b/autodoc/docmd/pkg/obitools/obimultiplex/options.md new file mode 100644 index 0000000..f2852b7 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obimultiplex/options.md @@ -0,0 +1,35 @@ +# `obimultiplex` Package Functionalities + +The `obimultiplex` package provides command-line and programmatic interfaces for simulating and processing multiplexed PCR amplicon sequencing data, primarily using the `NGSFilter` format. + +## Core Features + +- **PCR Multiplex Configuration Parsing**: Reads and interprets CSV-based `NGSFilter` files that define experiments, samples, tags (barcodes), and primer sequences. +- **Flexible Primer Matching**: + - Supports `strict`, `hamming`, and `indel` matching algorithms. + - Configurable mismatch tolerance (default: ≤2 mismatches). + - Optional indel allowance during primer alignment. +- **Tag Assignment & Error Handling**: + - Assigns reads to samples based on tag-primer matching. + - Outputs unassigned sequences to a dedicated file (if specified). +- **Template & Configuration Support**: + - Generates and displays an example `NGSFilter` CSV template via CLI. +- **Extensible Annotation**: + - Allows extra columns in the `NGSFilter` file to annotate sequences with key-value metadata. 
+ +## CLI Options + +| Option | Alias | Description | +|--------|-------|-------------| +| `--tag-list` / `-s` | | Path to the NGSFilter CSV file | +| `--allowed-mismatches` / `-e` | | Max mismatches allowed for primer matching (default: `2`) | +| `--with-indels` | | Permit indel errors during matching (default: `false`) | +| `--unidentified` / `-u` | | Output file for reads failing sample assignment | +| `--keep-errors` / `--conserved-error` | | Retain error information in output (affects annotation) | +| `--template` | | Print a sample CSV configuration template to stdout | + +## Implementation Notes + +- Built on top of `obitools4` libraries for formats (`obiformats`) and NGS library handling (`obingslibrary`). +- Uses `go-getoptions` for CLI argument parsing and Logrus for logging. +- Designed to be composable: integrates with `obiconvert.OptionSet()` via the `OptionSet` wrapper. diff --git a/autodoc/docmd/pkg/obitools/obipairing/options.md b/autodoc/docmd/pkg/obitools/obipairing/options.md new file mode 100644 index 0000000..06a29b9 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obipairing/options.md @@ -0,0 +1,25 @@ +# `obipairing` Package Functional Overview + +The `obipairing` package provides command-line interface (CLI) support and core logic for **paired-end read merging** in NGS data processing. It defines configuration options, input parsing, and alignment parameters used to merge forward and reverse sequencing reads into consensus sequences. + +## Key Features + +- **Input Handling**: Accepts paired FASTQ/FASTA files via `--forward-reads` (`-F`) and `--reverse-reads` (`-R`) flags. +- **Alignment Parameters**: + - `_Delta`: Extra overlap buffer (default: `5`) for refining alignment after fast detection. + - `_MinOverlap`: Minimum overlap length required (default: `20`). + - `_MinIdentity`: Minimal sequence identity threshold for valid overlaps (default: `90%`). 
+ - `_GapPenalty`: Multiplier for gap penalties relative to mismatch scores (default: `2.0`). + - `_PenaltyScale`: Global scaling factor for scoring (default: `1.0`). +- **Alignment Modes**: + - Fast heuristic alignment enabled by default (`--exact-mode` disables it). + - Optional absolute scoring in fast mode via `--fast-absolute`. +- **Output Control**: + - Statistics (e.g., overlap length, identity) can be excluded from consensus headers using `--without-stat`. +- **Integration**: + - Extends generic input/output options from `obiconvert` for unified pipeline compatibility. +- **Core Functions**: + - `CLIPairedSequence()`: Reads and pairs forward/reverse sequences. + - Getter functions (`CLI*`) expose parsed parameters for downstream alignment/merging logic. + +This module serves as the configuration and orchestration layer before actual sequence overlap detection, alignment scoring, and consensus generation. diff --git a/autodoc/docmd/pkg/obitools/obipairing/pairing.md b/autodoc/docmd/pkg/obitools/obipairing/pairing.md new file mode 100644 index 0000000..27c7b17 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obipairing/pairing.md @@ -0,0 +1,39 @@ +# `obipairing` Package — Semantic Overview + +The `obipairing` package provides tools for assembling paired-end sequencing reads in the OBITools4 framework. It supports two main strategies: **overlap-based assembly** (when reads overlap sufficiently) and simple **concatenation with a separator**, when no reliable alignment is possible. + +### Core Functions + +- `JoinPairedSequence(seqA, seqB *obiseq.BioSequence, inplace bool)`: + Merges two sequences with a fixed `..........` (10-dot) separator. If both inputs have quality scores, the dots are assigned a Phred score of 0. + +- `AssemblePESequences(...)`: + Performs high-fidelity assembly using the `obialign.PEAlign` algorithm: + - Detects optimal overlap via a fast heuristic (`FAST`) followed by dynamic programming refinement. 
+ - Validates alignment against `minOverlap`, `minIdentity` thresholds; falls back to join if criteria fail. + - Optionally annotates results with alignment statistics (score, length, identity, directionality). + - Supports in-place recycling of input sequences to reduce memory usage. + +- `IAssemblePESequencesBatch(...)`: + Parallelizes assembly over batches of paired reads using an iterator interface: + - Consumes `PairWith`-generated iterators. + - Launches configurable number of workers (`nworkers`) and channel buffer size (via `sizes`). + - Internally reverses the second read (`seqB.ReverseComplement`) before alignment. + - Returns an iterator of assembled consensus sequences. + +### Key Parameters + +- `gap`, `scale`: Gap penalty and scaling factor for alignment scoring. +- `delta`: Extension margin around the initial FAST overlap region. +- `minOverlap`, `minIdentity`: Thresholds to accept an alignment over simple joining. +- `fastAlign` / `fastModeRel`: Controls use of fast heuristic and scoring mode (absolute/relative). +- `withStats`, `inplace`: Toggle statistics output and in-place sequence reuse. + +### Output Semantics + +Each assembled read is annotated (via `Annotations()`) with: +- `"mode"`: either `"alignment"` or `"join"`. +- Alignment stats (`ali_length`, `score_norm`, etc.) when applicable. +- FAST-specific metadata if used (e.g., `"pairing_fast_score"`). + +Designed for scalability and low memory footprint, the package integrates tightly with `obiseq`, `obiiter`, and alignment modules in OBITools4. diff --git a/autodoc/docmd/pkg/obitools/obipcr/options.md b/autodoc/docmd/pkg/obitools/obipcr/options.md new file mode 100644 index 0000000..3bf49d1 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obipcr/options.md @@ -0,0 +1,24 @@ +# PCR Simulation CLI Options + +This Go package (`obipcr`) provides a set of command-line interface (CLI) options for configuring *in silico* PCR simulations. 
It extends a base option parser (`getoptions.GetOpt`) with parameters specific to amplification modeling. + +## Core Functionality + +- **Primer Definition**: Requires user-provided forward and reverse primers (`--forward`, `--reverse`), supporting ambiguous nucleotide patterns via the `obitools4/pkg/obiapat` module. +- **Mismatch Tolerance**: Allows a configurable number of mismatches per primer (`--allowed-mismatches`, alias `-e`). +- **Amplicon Filtering**: Enforces length constraints on the amplified region (excluding primers) via `--min-length`/`-l` and `--max-length`/`-L`. +- **Topology Handling**: Supports both linear (`default`) and circular sequences via `--circular`/`-c`. +- **Fragmentation Strategy**: For long input sequences, enables overlap-based fragmentation (`--fragmented`) to accelerate processing. +- **Extension Control**: Optionally appends flanking sequence fragments (`--delta`, alias `-D`) to amplicon ends. +- **Strict Flanking**: With `--only-complete-flanking`, only outputs amplicons where both primer-binding sites are fully present. + +## Integration + +- `PCROptionSet()` registers all PCR-specific flags. +- `OptionSet()` wraps this with standard conversion options (`obiconvert.OptionSet`). +- Getter functions (e.g., `CLIForwardPrimer()`, `CLIMinLength()`) safely expose parsed values, including pattern compilation and error handling. + +## Design Notes + +- All primer-related options are validated at parse time; missing required fields trigger fatal errors. +- Mismatch-tolerant primer matching is delegated to `obiapat.MakeApatPattern`. diff --git a/autodoc/docmd/pkg/obitools/obipcr/pcr.md b/autodoc/docmd/pkg/obitools/obipcr/pcr.md new file mode 100644 index 0000000..60475f5 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obipcr/pcr.md @@ -0,0 +1,40 @@ +# `obipcr.CLIPCR`: Amplicon Extraction via In-Silico PCR + +The `CLIPCR` function performs *in-silico* PCR on biological sequences to extract amplicons using user-defined primer settings.
+ +## Core Functionality + +- **Input**: An iterator over biological sequences (`obiiter.IBioSequence`). +- **Output**: A new sequence iterator yielding batches of amplified fragments (`obiiter.IBioSequence`). +- **Algorithm**: Uses `PCRSliceWorker`, configured via a set of options derived from CLI parameters. + +## Primer Configuration + +- **Forward/Reverse Primers**: Specified via `CLIForwardPrimer()` and `CLIReversePrimer()`. +- **Mismatch Tolerance**: Controlled by `CLIAllowedMismatch()` for both primers. + +## Amplification Constraints + +- **Full Extension**: Only full-length amplicons (spanning between primers) are returned if `CLIOnlyFull()` is enabled. +- **Length Filtering**: + - Minimum length: enforced if `CLIMinLength() > 0`. + - Maximum length: always applied via `CLIMaxLength()`. + +## Optional Features + +- **Extension**: If enabled (`CLIWithExtension()`), flanking regions beyond primers are included, using `CLIExtension()`. +- **Circular Genomes**: Supports circular DNA via `CLICircular()`. + +## Large Sequence Handling + +- Long sequences (>`CLIMaxLength()*1000`) are fragmented into overlapping chunks (`~CLIMaxLength()*100` bp) to improve PCR efficiency. +- Fragmentation parameters are logged for transparency. + +## Execution Model + +- Memory usage is capped at 50% (`LimitMemory(0.5)`). +- Parallelized processing via `obidefault.ParallelWorkers()`. + +## Summary + +`CLIPCR` enables flexible, robust *in-silico* PCR with support for mismatches, partial amplification, circular templates, and large-input fragmentation—ideal for metagenomic amplicon processing pipelines. 
diff --git a/autodoc/docmd/pkg/obitools/obirefidx/famlilyindexing.md b/autodoc/docmd/pkg/obitools/obirefidx/famlilyindexing.md new file mode 100644 index 0000000..8346e21 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obirefidx/famlilyindexing.md @@ -0,0 +1,37 @@ +# Semantic Description of `obirefidx` Package + +The `obirefidx` package implements **reference database indexing** for high-throughput sequencing data, specifically targeting family-level taxonomic classification. It supports efficient clustering and k-mer-based indexing of reference sequences. + +## Core Functionalities + +### 1. **Sequence Clustering** +- `MakeStartClusterSliceWorker()` performs greedy hierarchical clustering based on sequence similarity. +- Uses **LCSS (Longest Common Subsequence)** alignment with error tolerance derived from a user-defined identity threshold. +- Assigns each sequence: + - `clusterid`: identifier of its cluster centroid (head). + - `clusterhead`: boolean flag indicating if it is a representative. + - `clusteridentity`: alignment-based identity score to the head. + +### 2. **K-mer & Taxonomy-Based Indexing** +- `MakeIndexingSliceWorker()` builds per-sequence indexes using: + - Precomputed **4-mer frequency tables** (`obikmer.Table4mer`). + - Taxonomic annotations (family, genus, species) from a `Taxonomy`. +- Indexing is parallelized over chunks of 10 sequences using worker goroutines. + +### 3. **Family-Level Reference Index Construction** +- `IndexFamilyDB()` orchestrates the full pipeline: + - Loads and validates reference sequences. + - Computes k-mer counts for each sequence. + - Annotates taxonomy (family/genus/species) using helper workers (`MakeSet*Worker`). + - Clusters sequences at **≥90% identity** (hardcoded threshold for family-level). + - Re-indexes only cluster centroids to reduce redundancy. +- Final indexed references retain full taxonomic context and k-mer signatures. 
+ +## Implementation Highlights +- **Parallelization**: Leverages goroutines with configurable worker count (`obidefault.ParallelWorkers()`). +- **Memory Efficiency**: Processes sequences in chunks and reuses buffers. +- **Progress Tracking**: Optional progress bar via `progressbar/v3`. +- **Logging & Validation**: Uses Logrus for structured logging and panics on critical errors (e.g., missing taxonomy). + +## Use Case +Enables rapid sequence similarity search and taxonomic assignment in metabarcoding pipelines by precomputing compact, clustered reference indexes. diff --git a/autodoc/docmd/pkg/obitools/obirefidx/geomindexing.md b/autodoc/docmd/pkg/obitools/obirefidx/geomindexing.md new file mode 100644 index 0000000..100904a --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obirefidx/geomindexing.md @@ -0,0 +1,25 @@ +# Semantic Description of `GeomIndexSesquence` Function + +The function computes a **geometric taxonomic index** for a given query sequence based on spatial proximity and shared taxonomy. + +- **Input**: + - `seqidx`: index of the reference sequence to analyze. + - `references`: list of bio-sequences with geographic coordinates. + - `taxa` & `taxo`: taxonomic hierarchy and slice of taxa. + +- **Core Logic**: + - Retrieves the geographic coordinate (lat/long) of the query sequence. Fails if missing. + - Computes **Euclidean squared distances** between this coordinate and all others in parallel using goroutines. + - Sorts sequences by distance via `obiutils.Order`, preserving original indices. + +- **Taxonomic Aggregation**: + - Starts from the query sequence’s taxon (`lca`). + - Iterates over increasing distances, updating `lca` to the **Lowest Common Ancestor (LCA)** between current taxon and each neighbor’s. + - Records, for each distance value encountered, the **current LCA string** (e.g., `"Genus@genus"`). + - Stops early if the root of the taxonomy is reached. 
+ +- **Output**: + A map from *distance* (int) → *taxonomic label* (string), encoding how taxonomic resolution degrades with increasing spatial distance. + +- **Use Case**: + Enables rapid inference of taxonomic uncertainty or confidence bands in ecological or metabarcoding analyses, based on nearest neighbors’ taxonomy and spatial proximity. diff --git a/autodoc/docmd/pkg/obitools/obirefidx/obirefidx.md b/autodoc/docmd/pkg/obitools/obirefidx/obirefidx.md new file mode 100644 index 0000000..3bdaa98 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obirefidx/obirefidx.md @@ -0,0 +1,37 @@ +## Semantic Description of `obirefidx` Package + +The `obirefidx` package implements a taxonomic indexing pipeline for biological sequences, enabling efficient reference-based classification via alignment-free and alignment-based methods. + +### Core Functionality + +- **`IndexSequence(seqidx, references, kmers, taxa, taxo)`** + Computes a *taxonomic signature* for a query sequence by comparing it against reference sequences. It: + - Identifies Least Common Ancestors (LCAs) between the query and all references using a cached LCA lookup. + - Groups reference sequences by their shared LCAs with the query across taxonomic ranks. + - Uses **4-mer common counts** for fast pre-filtering of candidates. + - Performs local alignment (via `FastLCSScore` or exact distance `D1Or0`) to compute error counts (substitutions + indels). + - Builds a strictly increasing vector `closest[]` of minimal alignment errors per taxonomic rank. + - Maps each error threshold to the most specific matching taxon (`"Taxon@Rank"`), stored in a map keyed by error count. + +- **`IndexReferenceDB(iterator)`** + Processes an entire reference database: + - Loads sequences and filters out those lacking valid taxonomic IDs. + - Precomputes **4-mer frequency tables** for all sequences to accelerate k-mer comparisons. + - Parallelizes indexing in batches (10 seqs/worker), using `IndexSequence` per sequence. 
+ - Attaches the resulting taxonomic index (`obitag`) to each *copy* of the sequence via `SetOBITagRefIndex`. + - Returns an iterator over batches, optionally displaying a progress bar. + +### Key Technical Features + +- **Taxonomy-aware filtering**: Exploits hierarchical taxonomic structure to limit alignment scope. +- **Hybrid similarity search**: Combines *k*-mer sharing (fast) with LCS-based alignment (accurate). +- **Caching & optimization**: LCA results are cached; memory for alignments is reused via a shared `matrix`. +- **Parallelization**: Uses goroutines and channels to process sequences concurrently. +- **Robust error handling & logging**: Leverages `logrus` for detailed diagnostics and progress tracking. + +### Output Format + +Each indexed sequence carries a map `map[int]string`, where: +- Keys = alignment error counts (e.g., mismatches + gaps), +- Values = taxonomic labels like `"Homo@genus"` or `"Vertebrata@subphylum"`, +enabling rank-specific classification thresholds. diff --git a/autodoc/docmd/pkg/obitools/obirefidx/options.md b/autodoc/docmd/pkg/obitools/obirefidx/options.md new file mode 100644 index 0000000..c28cdc0 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obirefidx/options.md @@ -0,0 +1,18 @@ +# `obirefidx` Package Overview + +The `obirefidx` package provides command-line option configuration for the `obiuniq` tool within the OBITools4 ecosystem. + +- **Purpose**: Extends generic option parsing to support `obiuniq`'s specific flags. +- **Core Function**: + ```go + func OptionSet(options *getoptions.GetOpt) + ``` +- **Behavior**: + Delegates to `obiconvert.OptionSet(false)`, inheriting all standard conversion options (e.g., input/output formats, filtering thresholds), but *without* enabling verbose mode (`false` → no extra logging). +- **Dependencies**: + - `getoptions`: For robust CLI argument parsing. + - `obiconvert`: Shared conversion utilities and option definitions. 
+- **Semantic Role**: Acts as a *feature gate*—ensuring only relevant `obiconvert` options are exposed to the user for deduplication tasks. +- **Use Case**: Used during CLI initialization (e.g., `obiuniq --input file.fastq`) to validate and bind user-provided flags. + +In essence, `obirefidx` ensures consistent, minimal option exposure for reference-based deduplication workflows in OBITools4. diff --git a/autodoc/docmd/pkg/obitools/obiscript/obiscript.md b/autodoc/docmd/pkg/obitools/obiscript/obiscript.md new file mode 100644 index 0000000..bbcc0fd --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiscript/obiscript.md @@ -0,0 +1,25 @@ +# `obiscript` Package: CLI Script Pipeline + +This Go module defines a high-level pipeline interface for executing Lua-based processing scripts within the OBITools4 ecosystem. + +## Core Functionality + +- **`CLIScriptPipeline()`** + Returns a `Pipeable` iterator pipeline configured to run user-provided Lua scripts via the command-line interface. + +- **Implementation Details** + - Uses `obilua.LuaScriptPipe()` to instantiate a Lua-based processing stage. + - Accepts the script filename (via `CLIScriptFilename()`). + - Enables parallel execution (`true` flag) using default worker count from `obidefault.ParallelWorkers()`. + +- **Integration** + - Built on top of the `obiiter` iterator framework, allowing composition with other pipeable operations. + - Designed for CLI usage: expects a Lua script path (likely passed via `--script` or similar flag). + +## Semantic Role + +This function abstracts the setup of a *Lua-scriptable processing stage*—enabling users to inject custom filtering, annotation, transformation, or assembly logic in Lua while preserving parallelism and pipeline modularity. + +## Use Case + +Ideal for building modular, scriptable NGS data processing workflows (e.g., read filtering → annotation → consensus generation), where flexibility and performance are both required. 
diff --git a/autodoc/docmd/pkg/obitools/obiscript/options.md b/autodoc/docmd/pkg/obitools/obiscript/options.md new file mode 100644 index 0000000..54b6711 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiscript/options.md @@ -0,0 +1,43 @@ +# ObiScript CLI: Scriptable Sequence Processing Framework + +ObiScript provides a command-line interface for executing custom Lua scripts against biological sequence data within the OBITools4 ecosystem. + +## Core Functionality + +- **Script Execution (`--script` / `-S`)** + - Accepts a path to a Lua script file. + - The script is read and executed using the embedded ObiLua runtime. + +- **Script Template Generation (`--template`)** + - Outputs a minimal, executable script template to stdout. + - Template defines `begin()`, `worker(sequence)`, and `finish()` lifecycle hooks. + +- **Integration with OBITools4 Modules** + - Reuses configuration options from `obiconvert` (data I/O, format handling). + - Integrates sequence filtering/sorting via `obigrep.SequenceSelectionOptionSet`. + +## Script Lifecycle + +1. **`begin()`** + Initialize global state (e.g., counters, resources). + +2. **`worker(sequence)`** + Process each sequence individually: + - Access/modify metadata via `sequence:attribute(...)`. + - Assign new IDs or enrich annotations. + - Use global context (`obicontext`) for cross-sequence state. + +3. **`finish()`** + Finalize and output summary (e.g., print counters). + +## Example Workflow + +A typical script increments a counter, updates sample metadata, and renames sequences — demonstrating extensible transformation logic without recompilation. + +## Design Principles + +- **Modularity**: Script behavior is decoupled from CLI logic. +- **Extensibility**: Lua scripting enables complex, user-defined pipelines. +- **Consistency**: Aligns with existing OBITools4 CLI conventions via shared option sets. 
+ +> *ObiScript bridges high-level bioinformatics workflows with low-level sequence manipulation via embedded Lua.* diff --git a/autodoc/docmd/pkg/obitools/obisplit/obisplit.md b/autodoc/docmd/pkg/obitools/obisplit/obisplit.md new file mode 100644 index 0000000..198ed0b --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obisplit/obisplit.md @@ -0,0 +1,30 @@ +# `obisplit` Package: Semantic Description + +The `obisplit` package provides functionality to **split biological sequences** based on the detection of user-defined pattern pairs (e.g., primer or barcode sites), commonly used in metabarcoding workflows. + +- **Core Types**: + - `SplitSequence`: defines a pattern pair (forward/reverse) with an associated name. + - `Pattern_match`: stores details of a detected pattern instance (name, coordinates, errors, orientation). + +- **Pattern Detection (`LocatePatterns`)**: + Scans a sequence for all occurrences of forward and reverse patterns using approximate matching (allowing errors). It: + - Converts the input sequence to an indexed format for efficient pattern search. + - Extracts matches, normalizes coordinates and reverse-complements backward hits. + - Sorts results by start position. + - Removes overlapping matches, keeping the one with fewer errors. + +- **Sequence Splitting (`SplitPattern`)**: + Splits the input sequence into fragments *between* matched patterns. Each fragment is annotated with metadata: + - `obisplit_frg`: fragment number (1-based). + - `obisplit_nfrg`: total number of fragments. + - `obisplit_group`: pair-wise group name (e.g., `"primerA-primerB"` or `"extremity"`, for terminal regions). + - `obisplit_set`: the relevant pattern group (e.g., `"primerA"`), or `"NA"`. + - `obisplit_location`: genomic coordinates (1-based, inclusive). + - Left/right pattern info: name, match string, and error count. + +- **Pipeline Integration (`SplitPatternWorker`, `CLISlitPipeline`)**: + - Exposes splitting logic as a reusable `SeqWorker`. 
+ - Wraps it into an iterable pipeline supporting parallel execution via standard OBITools4 infrastructure. + +- **Use Case**: + Designed for demultiplexing and amplicon trimming in high-throughput sequencing data, where sequences are partitioned between known molecular markers. diff --git a/autodoc/docmd/pkg/obitools/obisplit/options.md b/autodoc/docmd/pkg/obitools/obisplit/options.md new file mode 100644 index 0000000..e0aa932 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obisplit/options.md @@ -0,0 +1,16 @@ +# `obisplit` Package Overview + +The `obisplit` package provides functionality to split sequencing reads based on user-defined molecular tags (e.g., PCR or sample barcodes), using pattern-matching with configurable error tolerance. + +- **Configuration via CSV**: Reads a configuration file (CSV format) mapping `tag` sequences to `pcr_pool` names. +- **Pattern Compilation**: Uses the `obiapat` module to compile tag sequences into fuzzy pattern matchers, allowing mismatches and optionally indels. +- **Reverse Complement Support**: Automatically computes reverse-complemented versions of patterns for dual-indexed or stranded workflows. +- **CLI Integration**: Integrates with `getoptions` to define command-line flags: + - `-C`, `--config`: Specify the configuration CSV file. + - `--template`: Output a sample config template to stdout (for quick start). + - `--pattern-error N`: Set max allowed mismatches in pattern matching (default: 4). + - `--allows-indels`: Enable indel-aware matching. +- **Error Handling**: Logs fatal errors on invalid config (missing `tag` column), failed pattern compilation, or file access issues. +- **Extensibility**: Extends `obiconvert.OptionSet`, suggesting compatibility with broader OBITools4 conversion pipelines. + +The core data structure `SplitSequence` stores parsed tag metadata (name, forward/reverse patterns) for downstream splitting logic. 
diff --git a/autodoc/docmd/pkg/obitools/obisummary/obisummary.md b/autodoc/docmd/pkg/obitools/obisummary/obisummary.md new file mode 100644 index 0000000..05103df --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obisummary/obisummary.md @@ -0,0 +1,47 @@ +# Semantic Description of `obisummary` Package + +The `obisummary` package provides statistical summarization of biological sequence data processed by OBITools4. It aggregates metadata and content-level features across sequences in an iterator, supporting parallel execution. + +## Core Components + +- **`DataSummary` struct**: Holds counters and maps tracking: + - Global counts: reads, variants (unique sequences), total symbols. + - Presence flags for special attributes (`merged_sample`, `obiclean_status/weight`). + - Per-attribute-type counts: scalar, map (`map_tags`), and vector/vector-like tags. + - Per-sample statistics (variant count, singleton counts, bad `obiclean` flags). + +- **Helper functions**: + - `sumUpdateIntMap`, `countUpdateIntMap`: Aggregate or increment map values. + - `plusOne/PlusUpdateIntMap`: Increment specific keys. + +- **`Add()` method**: Merges two `DataSummary`s (thread-safe accumulation). + +## Main Functionality + +- **`Update()` method**: Processes a single `BioSequence`, updating internal counters: + - Reads count (via `.Count()`), variant and symbol counts. + - Detects `merged_sample` or single-sample annotations to populate sample-level stats (e.g., singleton detection). + - Classifies annotation keys into scalar, map, or vector categories. + +- **`ISummary()` function**: + - Parallelizes summarization across `nproc` workers using goroutines. 
+ - Aggregates partial summaries and returns a structured dictionary with: + ```json + { + "count": { "variants", "reads", "total_length" }, + "annotations": { + "scalar_attributes", + "map_attributes", + "vector_attributes", + "keys": { scalar: {...}, map: {...}, vector: {...} } + }, + "samples": { + "sample_count", + "sample_stats": { sample_name: { reads, variants, singletons [, obiclean_bad] } } + } + } + ``` + +## Use Case + +Designed for lightweight, high-performance profiling of sequence datasets (e.g., after `obiclean`, merging), enabling quick quality checks and metadata exploration in OBITools4 pipelines. diff --git a/autodoc/docmd/pkg/obitools/obisummary/options.md b/autodoc/docmd/pkg/obitools/obisummary/options.md new file mode 100644 index 0000000..7191f74 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obisummary/options.md @@ -0,0 +1,26 @@ +# `obisummary` Package Overview + +The `obisummary` package provides command-line interface (CLI) configuration and output formatting utilities for the `obisummary` tool within the OBITools4 ecosystem. + +## Core Functionality + +- **Option Parsing Setup** + - `SummaryOptionSet()`: Registers CLI flags specific to summary reporting: + - `-json-output`, `-yaml-output` (boolean): Select output format. + - `-map `: Specifies one or more map attributes to include in the summary. + +- **Extended Option Aggregation** + - `OptionSet()`: Extends `SummaryOptionSet()` by appending input-handling options from the `obiconvert` package. + +- **Output Format Detection** + - `CLIOutFormat()`: Returns `"yaml"` or `"json"` based on active flags (YAML takes precedence only if JSON is *not* enabled). + +- **Map Attribute Access** + - `CLIHasMapSummary()`: Returns whether any map attributes were specified. + - `CLIMapSummary()`: Retrieves the list of requested attribute names. + +## Design Notes + +- Uses global variables for state (e.g., `__json_output__`, `__map_summary__`).
+- Designed for integration with the [`go-getoptions`](https://github.com/DavidGamba/go-getoptions) library. +- Minimal, focused scope: solely configures CLI behavior for summary generation—no data processing logic included. diff --git a/autodoc/docmd/pkg/obitools/obitag/obigeomtag.md b/autodoc/docmd/pkg/obitools/obitag/obigeomtag.md new file mode 100644 index 0000000..68ea2aa --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitag/obigeomtag.md @@ -0,0 +1,36 @@ +# Geometric Taxonomic Assignment Module (`obitag`) + +This Go package implements a **geometric approach for taxonomic assignment** of biological sequences using landmark-based coordinate mapping and distance minimization. + +## Core Functionality + +- **Landmark Extraction** (`ExtractLandmarkSeqs`): + Retrieves sequences marked with non-default `LandmarkID`s and returns them in ID-indexed order. + +- **Taxon Set Extraction** (`ExtractTaxonSet`): + Maps each reference sequence to its corresponding taxonomic node using the provided taxonomy; panics on missing taxa. + +- **Landmark Coordinate Mapping** (`MapOnLandmarkSequences`): + Computes a *coordinate vector* for any query sequence by measuring LCS-based distances to each landmark. + +- **Geometric Nearest Neighbor Search** (`FindGeomClosest`): + Finds reference sequences with minimal Euclidean distance in landmark space; among ties, selects the one with highest sequence identity (via LCS). + +- **Taxonomic Assignment** (`GeomIdentify`): + Assigns taxonomy to a query sequence if best identity >50%: uses LCA of matching references’ taxa, weighted by geometric distance. Otherwise assigns root taxon. + +- **Worker & CLI Integration** (`GeomIdentifySeqWorker`, `CLIGeomAssignTaxonomy`): + Wraps assignment logic into reusable sequence workers and integrates with iterator-based pipelines. + +## Key Design Principles + +- **Landmark-centric geometry**: Taxonomic inference relies on spatial proximity in landmark-derived feature space. 
+- **Robustness to alignment ambiguity**: Uses LCS (Longest Common Subsequence) scores instead of full alignments. +- **Parallelization support**: Leverages `obiiter` for scalable batch processing. + +## Output Attributes + +Each assigned sequence gains metadata: +- `"scientific_name"`, `"obitag_rank"` +- `"obitag_bestid"` (identity), `"obitag_min_dist"`, `"obitag_match_count"` +- `"obitag_coord"` (landmark coordinates), `"obitag_similarity_method": "geometric"` diff --git a/autodoc/docmd/pkg/obitools/obitag/obitag.md b/autodoc/docmd/pkg/obitools/obitag/obitag.md new file mode 100644 index 0000000..dcb3475 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitag/obitag.md @@ -0,0 +1,41 @@ +# OBITag Taxonomic Identification Module + +This Go package (`obitag`) provides tools for **taxonomic assignment of biological sequences** using reference databases and alignment-based similarity scoring. + +## Core Functionalities + +- **`MatchDistanceIndex`**: + Maps a distance value to the closest taxonomic entry in an indexed map (`distanceIdx`). It performs binary search on sorted distance keys and returns the corresponding taxon (taxid, rank, scientific name). Falls back to root if no match is found. + +- **`FindClosests`**: + Identifies the most similar reference sequences to a query sequence using: + - **4-mer frequency overlap** (`Common4Mer`) for fast pre-screening. + - **LCS-based alignment scoring** (Longest Common Subsequence) for precise similarity measurement. + - Returns top matches, edit distance (`maxe`), sequence identity score, best match ID, and indices. + +- **`Identify`**: + Performs full taxonomic classification: + - Uses `FindClosests()` to retrieve best matching references. + - Leverages precomputed reference indices (`OBITagRefIndex`) to resolve taxonomic assignments per distance level. + - Computes the **Lowest Common Ancestor (LCA)** of all matching taxa to assign robust taxonomy.
+ - Marks unidentifiable sequences with root taxon and sets metadata attributes (rank, identity %, match count). + +- **`IdentifySeqWorker`**: + Wraps `Identify()` into a reusable sequence worker function for batch processing. + +- **`CLIAssignTaxonomy`**: + High-level CLI entry point: + - Filters and indexes reference sequences (4-mer counting, taxon validation). + - Builds a `SeqWorker` pipeline for parallel execution. + - Supports logging, filtering of invalid references, and configurable concurrency. + +## Key Features + +- **Hybrid speed/accuracy**: Uses k-mer pre-screening + LCS alignment. +- **Index caching**: Reuses taxonomic indexes per reference to avoid recomputation. +- **Robustness**: Gracefully handles missing taxonomy data and invalid inputs via fallbacks to root. +- **Extensibility**: Designed for integration into larger OBITools4 pipelines. + +## Dependencies + +Uses core modules from `obitools4`: sequence (`obiseq`), taxonomy (`obitax`), alignment (`obialign`), k-mer analysis (`obikmer`), iteration utilities, and logging. diff --git a/autodoc/docmd/pkg/obitools/obitag/options.md b/autodoc/docmd/pkg/obitools/obitag/options.md new file mode 100644 index 0000000..e8fd117 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitag/options.md @@ -0,0 +1,33 @@ +# `obitag` Package Overview + +The `obitag` package provides command-line interface (CLI) utilities and core logic for assigning taxonomic tags to biological sequences using a reference database. It is part of the OBITools4 ecosystem, designed for high-throughput sequence analysis in metabarcoding workflows. + +## Key Functionalities + +- **Reference Database Handling**: + - Loads a reference database from file via `CLIRefDB()`, returning a slice of biological sequences (`BioSequenceSlice`). + - Supports saving the loaded (and potentially processed) reference DB to disk with `CLISaveRefetenceDB()`, including optional compression and parallel I/O.
+ +- **CLI Option Parsing**: + - `TagOptionSet()` defines required and optional flags: + - `-R/--reference-db`: Input reference database file (mandatory). + - `--save-db`: Optional output path to persist the processed DB. + - `-G/--geometric`: Enables an *experimental* geometric similarity heuristic for faster matching. + +- **Integration with OBITools4 Components**: + - Leverages `obiconvert`, `obiiter`, `obiseq`, and `obiformats` for sequence I/O, iteration batching, parallelization, and format handling (FASTA/FASTQ/JSON/OBI). + - Inherits standard conversion options via `obiconvert.OptionSet(false)`. + +- **Runtime Configuration Helpers**: + - Accessors like `CLIGeometricMode()`, `CLIRefDBName()`, and `CLIRunExact()` expose internal state for downstream processing modules. + +- **Performance Optimizations**: + - Uses batched iteration (`IBatchOver`) and configurable parallel workers (scaled from total pool). + - Supports output compression based on global defaults. + +## Design Notes + +- Heuristic mode (`--geometric`) trades accuracy for speed; exact matching is currently commented out but can be re-enabled. +- The package assumes a pre-built reference DB (e.g., curated barcode library) and focuses on *tagging* rather than alignment or assembly. +- Error handling is strict: panics on DB read failure, fatal logs on write errors. + diff --git a/autodoc/docmd/pkg/obitools/obitagpcr/options.md b/autodoc/docmd/pkg/obitools/obitagpcr/options.md new file mode 100644 index 0000000..f0c56ba --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitagpcr/options.md @@ -0,0 +1,29 @@ +# `obitagpcr` Package Overview + +The `obitagpcr` package provides command-line interface (CLI) support for tag-based PCR data processing within the OBITools4 ecosystem. It defines options and utilities to configure orientation handling of sequencing reads in relation to PCR primers. + +## Core Functionality + +- **`TagPCROptionSet()`**: Adds a `--reorientate` boolean flag to the CLI option parser. 
When enabled, it reverse-complements reads as needed so all sequences are stored in a consistent orientation relative to the forward and reverse primers. + +- **`OptionSet()`**: Aggregates all required option sets for tag-PCR workflows by extending: + - `obipairing.OptionSet()` — handling paired-end read pairing options, + - `obimultiplex.MultiplexOptionSet()` — supporting sample demultiplexing, + - `TagPCROptionSet()` — adding the reorientation flag. + +- **`CLIReorientate()`**: Returns a boolean indicating whether read reorientation is enabled, allowing downstream logic to apply reverse-complementation conditionally. + +## Semantic Behavior + +- **Reorientation semantics**: Ensures uniform strand orientation across samples—critical for downstream alignment, consensus building, or variant calling where primer directionality matters. + +- **Modular design**: Leverages existing OBITools4 modules (`obipairing`, `obimultiplex`) to compose a coherent, reusable CLI configuration for tag-PCR pipelines. + +## Use Case + +Typically used in amplicon sequencing workflows where: +1. Reads originate from both strands due to PCR amplification, +2. Primer positions are known and fixed (forward/reverse), +3. Consistent orientation improves analysis accuracy. + +This package ensures that the `--reorientate` option is available and correctly wired into the processing pipeline. diff --git a/autodoc/docmd/pkg/obitools/obitagpcr/pcrtag.md b/autodoc/docmd/pkg/obitools/obitagpcr/pcrtag.md new file mode 100644 index 0000000..1844c00 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitagpcr/pcrtag.md @@ -0,0 +1,23 @@ +# `obitagpcr` Package: Paired-End Sequence Demultiplexing and Tagging + +The `obitagpcr` package provides high-performance, parallelized demultiplexing and annotation of paired-end NGS reads using molecular barcodes (e.g., PCR tags). 
Its core function, `IPCRTagPESequencesBatch`, processes sequence pairs from an iterator and outputs annotated reads with sample-specific metadata. + +## Key Functionalities + +- **Paired-end assembly**: Reads are assembled into consensus sequences using `obipairing.AssemblePESequences`, with parameters for alignment gap, scale, overlap length (`minOverlap`), identity threshold (`minIdentity`), and fast alignment heuristics. + +- **Barcode extraction**: A compiled NGS filter (`CLINGSFIlter`) extracts barcodes from each consensus. Only reads with a *single*, valid barcode (no error flags) are assigned to samples. + +- **Metadata propagation**: Upon successful demultiplexing, barcode identity (`forward_tag`, `reverse_tag`), directionality (`obimultiplex_direction`), mismatches, sample name, and experiment ID are added as annotations to *both* reads in the pair. + +- **Reorientation support**: If enabled (`CLIReorientate`), reverse-direction reads are reversed-complemented and re-paired to ensure consistent forward orientation of tags. + +- **Error handling & filtering**: Unassigned reads (failed demultiplexing) are flagged with an `obimultiplex_error` annotation. By default, they can be discarded or saved to a separate file (`CLIUnidentifiedFileName`). + +- **Parallel processing**: Uses goroutines and batched iteration to scale across CPU cores (`obidefault.ParallelWorkers()`), maximizing throughput. + +- **Statistics & logging**: Optional stats collection (`withStats`) and structured log messages track pipeline stages (e.g., worker start/end, filtering decisions). + +## Dependencies & Integration + +Built on core `obitools4` modules (`obiiter`, `obiseq`, `obialign`, `obimultiplex`), it integrates seamlessly into larger NGS processing pipelines for metabarcoding and amplicon sequencing workflows. 
diff --git a/autodoc/docmd/pkg/obitools/obitaxonomy/obitaxonomy.md b/autodoc/docmd/pkg/obitools/obitaxonomy/obitaxonomy.md new file mode 100644 index 0000000..25d0902 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitaxonomy/obitaxonomy.md @@ -0,0 +1,11 @@ +This Go package `obitaxonomy` provides CLI-oriented utilities for manipulating and exporting taxonomic data within the OBITools4 framework. + +- **`CLITaxonRestrictions()`**: Applies user-defined taxonomic clade and rank filters to a taxonomy iterator, returning a constrained view. +- **`CLIFilterRankRestriction()`**: Filters the taxonomy iterator to include only taxa matching a specified taxonomic rank (e.g., "species", "genus"). +- **`CLISubTaxonomyIterator()`**: Returns an iterator over a subtree of the default taxonomy, starting from a specified node; exits if no sub-taxonomy is selected via CLI. +- **`CLICSVTaxaIterator()`**: Converts a taxonomy iterator into a CSV record stream, supporting optional inclusion of scientific names, ranks, paths, parent taxa IDs, and raw taxids. +- **`CLICSVTaxaWriter()`**: Wraps `CLICSVTaxaIterator()` to produce a CSV writer, handling output destination and terminal execution. +- **`CLINewickWriter()`**: Exports a taxonomy subtree as Newick format (with optional compression, rank/scientific name inclusion, taxid support), writing to file or stdout. +- **`CLIDownloadNCBITaxdump()`**: Downloads the latest NCBI taxonomy dump (`taxdump.tar.gz`) and saves it as `ncbitaxo_YYYYMMDD.tgz` (or a user-specified filename). + +All functions integrate with CLI flags and logging, support output redirection (`-` for stdout), and rely on standardized iterators from the `obitools4/pkg/...` ecosystem.
diff --git a/autodoc/docmd/pkg/obitools/obitaxonomy/options.md b/autodoc/docmd/pkg/obitools/obitaxonomy/options.md new file mode 100644 index 0000000..6cf1432 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obitaxonomy/options.md @@ -0,0 +1,31 @@ +# Taxonomy Processing CLI Module (`obitaxonomy`) + +This Go package provides a command-line interface (CLI) for interacting with taxonomic data, built on top of the OBItools4 framework. It supports flexible querying, filtering, and export of taxonomic trees. + +## Core Functionalities + +- **Taxonomy Loading & Management**: Integrates with `obitax` to load and manage taxonomic databases (e.g., NCBI). +- **Taxon Filtering**: Allows restricting output to specific clades via `--restrict-to-taxon` (`-r`) using taxon IDs or names. +- **Rank-Based Filtering**: Restricts output to a specific rank (e.g., `species`, `genus`) with `-rank` (`--rank`). +- **Tree Navigation**: + - `--parents` (`-p`) displays the full lineage (path) for a given taxon ID. + - `--sons` (`-s`) lists all direct children of a given taxon ID. + - `--dump` (`-D`) exports the entire subtree rooted at a given taxon. +- **Output Formatting**: + - Columns can be toggled: scientific name (`--without-scientific-name`), taxonomic rank (`-R`, `--without-rank`), parent ID (via `-W`, implied via `--without-parent`). + - Full taxonomic path (`-P`) and matching query source (`--with-query`) can be included. + - Supports Newick tree output (`-N`, `--newick-output`) with optional leaf labels and root trimming. +- **Data Acquisition**: + - `--download-ncbi`: Fetches and installs the latest NCBI taxonomy dump. + - `--extract-taxonomy`: Extracts taxonomic labels from sequence files (e.g., FASTA/FASTQ). +- **Pattern Matching**: + - `--fixed` (`-F`) enables literal (non-regexp) taxon name matching. + - `--rank-list` (`-l`) prints all available ranks in the loaded taxonomy. 
+ +## Utility Functions + +Helper functions (e.g., `CLIRankRestriction()`, `CLIWithScientificName()`), expose parsed CLI flags for downstream processing modules. + +## Integration + +Designed to be composed with `obiconvert` (output formatting) and standard OBItools4 option parsing (`getoptions`). Fully modular, extensible for taxonomic workflows in metagenomics and biodiversity informatics. diff --git a/autodoc/docmd/pkg/obitools/obiuniq/options.md b/autodoc/docmd/pkg/obitools/obiuniq/options.md new file mode 100644 index 0000000..971791a --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiuniq/options.md @@ -0,0 +1,27 @@ +# `obiuniq` Package: Semantic Feature Overview + +The `obiuniq` package provides command-line and programmatic configuration for deduplicating biological sequence data, grouping identical sequences while preserving metadata-rich distinctions. + +## Core Functionality + +- **Sequence Grouping**: Groups sequences based on user-defined *category attributes* (`--category-attribute` / `-c`) and optional merging criteria. +- **Singleton Filtering**: Optionally excludes sequences occurring only once (`--no-singleton`), reducing noise from rare artifacts. +- **NA Handling**: Allows custom placeholder (`--na-value`) for missing classifier tags (e.g., taxonomy labels). +- **Scalable Processing**: Uses chunked disk/memory storage (`--chunk-count`, `--in-memory`) to handle large datasets efficiently. + +## Configuration API + +- **CLI Options**: Built via `getoptions`, exposing flags like `-m` (merge stats), `-c` (grouping keys). +- **State Accessors**: Functions like `CLIKeys()`, `CLINAValue()`, and `CLINoSingleton()` expose runtime configuration. +- **Mutable Setters**: Enables programmatic tuning (e.g., `SetNAValue()`, `AddStatsOn()`). + +## Statistics & Metadata + +- **Merged Attributes**: Tracks original IDs per group via `--merge` (`_StatsOn`) — useful for provenance and QC. 
+- **Flexible Grouping**: Supports multiple attributes (e.g., `sequence`, `umi`, `sample`) to define *identity* beyond raw sequence. + +## Integration + +- Extends generic I/O options from `obiconvert.OptionSet`, ensuring compatibility with OBItools4 pipelines. + +> Designed for high-performance, metadata-aware deduplication in NGS workflows (e.g., amplicon or UMI-based data). diff --git a/autodoc/docmd/pkg/obitools/obiuniq/unique.md b/autodoc/docmd/pkg/obitools/obiuniq/unique.md new file mode 100644 index 0000000..349ae85 --- /dev/null +++ b/autodoc/docmd/pkg/obitools/obiuniq/unique.md @@ -0,0 +1,17 @@ +# Semantic Description of `CLIUnique` Functionality + +The `CLIUnique` function implements a **dereplication pipeline** for biological sequence data (e.g., amplicons, reads), returning a deduplicated iterator of sequences (`obiiter.IBioSequence`). + +- **Core purpose**: Collapse identical or near-identical sequences while preserving metadata and counting occurrences. +- **Input/Output**: Accepts a sequence iterator; outputs an iterator over unique sequences with abundance annotations. +- **Chunking**: Processes data in configurable batches (`OptionBatchCount`) to manage memory and scalability. +- **Sorting Strategy**: Supports in-memory or disk-based sorting via CLI flags (`--on-disk`), optimizing for large datasets. +- **Singleton Handling**: Optionally filters out sequences observed only once (`--no-singleton`), configurable at runtime. +- **Parallelization**: Leverages default parallel workers (`OptionsParallelWorkers`) to accelerate sorting/deduplication. +- **Batching**: Uses default batch size (`OptionsBatchSize`) to balance throughput and memory usage. +- **Missing Data**: Handles missing values (`OptionNAValue`) as defined by CLI arguments (e.g., `CLINAValue`). +- **Statistics**: Enables optional per-category statistics collection (`OptionStatOn`) based on user-specified keys. 
+- **Subcategorization**: Groups sequences by metadata keys (`OptionSubCategory`) to enable stratified dereplication (e.g., per sample, primer). +- **Error Handling**: Logs fatal errors during pipeline initialization or execution using `log.Fatal`. + +The function integrates CLI-driven configuration into a modular, extensible chunk-based processing framework (`obichunk`), supporting both scalability and flexibility in high-throughput sequencing workflows. diff --git a/autodoc/docmd/pkg/obiutils/abs.md b/autodoc/docmd/pkg/obiutils/abs.md new file mode 100644 index 0000000..7df8629 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/abs.md @@ -0,0 +1,41 @@ +# `obiutils.Abs` — Generic Absolute Value Function + +This package provides a **type-generic utility function** for computing the absolute value of signed numeric types in Go. + +## Function Signature + +```go +func Abs[T constraints.Signed](x T) T +``` + +- **Generic constraint**: `T` must satisfy `constraints.Signed`, i.e., any signed integer type (`int`, `int8`–`int64`); note that `constraints.Signed` does **not** include floating-point types (those are covered by `constraints.Float`). +- **Input**: A value of type `T`. +- **Output**: The absolute (non-negative) counterpart, same type as input. + +## Semantics + +- Returns `x` if `x ≥ 0`. +- Otherwise, returns `-x`, effectively flipping the sign. +- Handles all signed integer types uniformly — no need for type-specific overloads. + +## Example Usage + +```go +absInt := obiutils.Abs(-5) // → 5 (type: int) +absInt64 := obiutils.Abs(int64(-42)) // → 42 (type: int64) +``` + +## Design Rationale + +- Leverages Go generics for **reusability** and type safety. +- Avoids duplication across `AbsInt`, `AbsInt64`, etc. +- Follows Go’s standard library conventions (e.g., similar to `math.Abs` but *generic* over signed integers rather than limited to `float64`). + +## Limitations + +- Does **not** support unsigned types (by design: `constraints.Signed` excludes them).
+- For floating-point special cases (`NaN`, `-0.0`) behavior matches native negation semantics. + +## Dependencies + +- Requires `golang.org/x/exp/constraints` for the generic type constraint. diff --git a/autodoc/docmd/pkg/obiutils/abs_test.md b/autodoc/docmd/pkg/obiutils/abs_test.md new file mode 100644 index 0000000..ec37357 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/abs_test.md @@ -0,0 +1,23 @@ +## Semantic Description of `obiutils.Abs` Functionality + +The provided Go test suite (`TestAbs`) validates the semantic behavior of a utility function `Abs` from the package [`obiutils`](https://git.metabarcoding.org/obitools/obitools4), part of the OBITools 4 ecosystem — a toolkit for DNA metabarcoding data analysis. + +- **Function Purpose**: + `obiutils.Abs` computes the *absolute value* of an integer, returning its non-negative magnitude regardless of sign. + +- **Test Coverage**: + The test verifies correctness across two categories: + - *Non-negative inputs* (`0`, `1`, `5`, `10`) → outputs unchanged. + - *Negative inputs* (`-1`, `-5`, `-10`) → outputs their positive counterparts. + +- **Semantic Semantics**: + The function adheres to the mathematical definition: `Abs(x) = x` if `x ≥ 0`, else `-x`. + It ensures robustness for edge cases (e.g., zero) and typical integer ranges used in bioinformatic pipelines. + +- **Integration Context**: + As part of `obitools4`, such low-level utilities likely support numerical operations in sequence alignment scoring, quality filtering, or coordinate transformations — where signed differences must be normalized. + +- **Test Quality**: + Uses table-driven testing (Go idiom), promoting maintainability and clarity. No external dependencies are required — confirming the function is pure, deterministic, and self-contained. + +In summary: `Abs` provides a foundational arithmetic primitive with guaranteed correctness for integer inputs, enabling reliable downstream computation in OBITools’ data processing workflows. 
diff --git a/autodoc/docmd/pkg/obiutils/array.md b/autodoc/docmd/pkg/obiutils/array.md new file mode 100644 index 0000000..5a5f241 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/array.md @@ -0,0 +1,23 @@ +# `obiutils` Package: Semantic Overview + +This Go package (`obiutils`) provides generic utilities for numerical and matrix operations, leveraging generics (Go 1.18+). It defines foundational types and helper functions for working with multidimensional data structures. + +- **Type Interfaces** + - `Integer`: Constraint covering signed integer types (`int`, `int8`–`int64`). + - `Float`: Constraint for floating-point types (`float32`, `float64`). + - `Numeric`: Union of both, enabling generic numeric functions. + +- **Data Structures** + - `Vector[T]`: A slice-based vector (`[]T`). + - `Matrix[T]`: A row-major representation of a 2D matrix (`[][]T`), backed by contiguous memory for performance. + +- **Core Functions** + - `Make2DArray[T]`: Allocates a zero-initialized, contiguous-row-major matrix of arbitrary type `T`. + - `Make2DNumericArray[T]`: Same as above, but restricted to numeric types; optionally pre-fills with zeros if `zeroed=true`. + +- **Matrix Methods** + - `.Column(i int)`: Extracts column `i` as a slice (not row-wise access). + - `.Rows(i ...int)`: Returns a new matrix containing only the specified row indices. + - `.Dim() (int, int)`: Returns `(rows, cols)` safely handling `nil` or empty matrices. + +The design prioritizes memory efficiency (via contiguous backing arrays), type safety through generics, and ergonomic access patterns for linear algebra-like workflows. 
diff --git a/autodoc/docmd/pkg/obiutils/array_test.md b/autodoc/docmd/pkg/obiutils/array_test.md new file mode 100644 index 0000000..b986552 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/array_test.md @@ -0,0 +1,15 @@ +# Semantic Description of `obiutils` Matrix Functionality + +The `package obiutils` provides a generic, type-safe matrix abstraction in Go with core utility methods for construction and querying. + +- **`Make2DArray[T]()`**: A generic constructor that initializes a 2D slice (matrix) of type `Matrix[T]`, with specified numbers of rows and columns. All elements are zero-initialized (e.g., `0` for integers, empty string for strings, default struct values). + +- **`.Column(colIndex int)`**: Extracts and returns a single column (as `[]T`) from the matrix at the given 0-based index, preserving element order across rows. + +- **`.Rows(indices ...int)`**: Returns a new matrix composed of only the specified row indices (0-based), supporting single-row, multi-row, or empty selections. + +- **`.Dim() (rows, cols int)`**: Returns the dimensions of the matrix as `(number_of_rows, number_of_columns)`. Handles edge cases: `nil`, empty (`{}`), and jagged or zero-column matrices safely (e.g., `{ { } }` yields `(1, 0)`). + +All functionality is implemented as methods on the `Matrix[T]` type (implicitly defined via slices of slices), leveraging Go generics for compile-time safety and runtime efficiency. + +The package includes comprehensive unit tests validating correctness across types (`int`, `string`, custom structs) and boundary conditions. 
diff --git a/autodoc/docmd/pkg/obiutils/bytes.md b/autodoc/docmd/pkg/obiutils/bytes.md new file mode 100644 index 0000000..5ebc278 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/bytes.md @@ -0,0 +1,27 @@ +# `InPlaceToLower` Function — Semantic Description + +The `obiutils.InPlaceToLower` function provides a high-performance, memory-efficient utility for converting ASCII uppercase letters to lowercase **in place**, without allocating new data structures. + +## Core Functionality +- Takes a `[]byte` slice (`data`) as input. +- Iterates over each byte, identifying uppercase ASCII characters (i.e., `'A'`–`'Z'`, values `65`–`90`). +- Converts each uppercase byte to its lowercase counterpart using a bitwise OR with `32`, leveraging the ASCII encoding property: + `lowercase = uppercase | 0b0010_0000` (since `'a' - 'A' = 32`). +- Returns the **same** `[]byte` slice, now mutated in-place. + +## Key Characteristics +- ✅ **Zero-copy**: No new memory is allocated—ideal for performance-critical or low-level contexts (e.g., streaming, embedded systems). +- ✅ **ASCII-safe**: Only modifies bytes in the `'A'`–`'Z'` range; other bytes (e.g., digits, symbols, non-ASCII) remain unchanged. +- ✅ **Idiomatic Go**: Uses idioms like `range` with index/value and bitwise optimization. +- ⚠️ **Destructive**: Input data is permanently modified—callers must clone if preservation is needed. + +## Use Cases +- Preprocessing raw HTTP headers or payloads. +- Optimizing case-insensitive comparisons in high-throughput systems. +- Embedded tools where GC pressure or heap allocation must be minimized. 
+ +## Example +```go +buf := []byte("HTTP/1.1 200 OK") +InPlaceToLower(buf) // buf is now []byte("http/1.1 200 ok") +``` diff --git a/autodoc/docmd/pkg/obiutils/bytes_test.md b/autodoc/docmd/pkg/obiutils/bytes_test.md new file mode 100644 index 0000000..554eccb --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/bytes_test.md @@ -0,0 +1,20 @@ +# `obiutils` Package Functional Overview + +The `obiutils` package provides two core utility functions for low-level and numerical operations in Go: + +- **`InPlaceToLower([]byte) []byte`** + Converts all ASCII uppercase letters in a byte slice to lowercase *in-place*, returning the modified slice. + - Non-alphabetic bytes remain unchanged. + - Memory-efficient: modifies input directly (no allocation of new slice). + +- **`Make2DNumericArray[T any](rows, cols int, zeroed bool) Matrix[T]`** + Generates a generic 2D numeric array (`Matrix`) of type `T`, supporting any comparable/numeric Go type. + - Parameters: number of rows, columns, and whether to initialize with zero values (`true`) or default `T` (e.g., 0 for int). + - Uses Go generics (`[T any]`) for type safety and flexibility. + +Both functions are thoroughly unit-tested in `*_test.go`, covering edge cases: +- Empty/nil inputs (`InPlaceToLower`) +- Various dimensions and zero-initialization modes (`Make2DNumericArray`) + +Tests use `reflect.DeepEqual` for structural comparison and subtests via `t.Run`. +The package assumes a custom type alias: `type Matrix[T any] [][]T`. diff --git a/autodoc/docmd/pkg/obiutils/cast_interface.md b/autodoc/docmd/pkg/obiutils/cast_interface.md new file mode 100644 index 0000000..d4285b3 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/cast_interface.md @@ -0,0 +1,15 @@ +# `obiutils` Package — Semantic Feature Summary + +This Go package provides a set of utility functions for **type conversion**, **casting validation**, and **map/slice transformation** in a flexible, error-tolerant manner. 
+ +- `InterfaceToString(i interface{})`: Converts any value to a string, preferring the `Stringer` interface if implemented. +- `CastableToInt(i interface{})`: Checks whether a value is *numerically castable* to an `int` (supports all numeric types). +- `InterfaceToBool(i interface{})`: Safely converts various input types (`bool`, numeric, string like `"true"`, `"1"`, etc.) to `bool`; returns a custom error for unsupported types. +- `InterfaceToInt(i interface{})`: Converts numeric or string representations to an `int`, with precise error handling. +- `InterfaceToFloat64(i interface{})`: Converts numeric or string types to `float64`, using standard parsing. +- `MapToMapInterface(m interface{})`: Converts specialized map types (e.g., read-only or concurrency-safe maps) to `map[string]interface{}` via reflection. +- `InterfaceToIntMap(i interface{})`: Converts compatible map types (`map[string]int`, `hasMap` interfaces, or generic maps) to a concrete `map[string]int`. +- `InterfaceToStringMap(i interface{})`: Converts map values to strings, yielding a clean `map[string]string`. +- `InterfaceToStringSlice(i interface{})`: Converts slices of interfaces or strings into a pure `[]string`. + +All functions include **explicit error handling** via custom types (e.g., `NotAnInteger`, `NotAMapInt`) and use logging via Logrus for debugging. The package prioritizes **type safety**, **robustness**, and interoperability across Go types. diff --git a/autodoc/docmd/pkg/obiutils/counter.md b/autodoc/docmd/pkg/obiutils/counter.md new file mode 100644 index 0000000..2f4e10a --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/counter.md @@ -0,0 +1,25 @@ +# `obiutils.Counter`: Thread-Safe Atomic Counter + +A minimal, thread-safe counter implementation in Go. + +## Features +- **Atomic increment/decrement**: `Inc()` and `Dec()` modify the internal counter atomically using a mutex. +- **Current value retrieval**: `Value()` safely returns the current count without modifying it. 
+- **Initial value support**: Constructor accepts an optional initial integer (defaults to `0`). +- **Closure-based API**: Encapsulates state and synchronization behind clean, functional methods. +- **No external dependencies**: Uses only the standard library (`sync`). + +## Usage Example +```go +counter := obiutils.NewCounter(10) // start at 10 +fmt.Println(counter.Inc()) // → 11 +fmt.Println(counter.Dec()) // → 10 +fmt.Println(counter.Value()) // → 10 (unchanged) +``` + +## Thread Safety +All operations are protected by a `sync.Mutex`, ensuring correctness in concurrent environments. + +## Design Notes +- Immutable interface: methods return updated values, not pointers. +- No reset method provided—intentionally minimal and focused on core counting semantics. diff --git a/autodoc/docmd/pkg/obiutils/download.md b/autodoc/docmd/pkg/obiutils/download.md new file mode 100644 index 0000000..d220633 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/download.md @@ -0,0 +1,37 @@ +## `obiutils.DownloadFile` — Semantic Feature Overview + +- **Core Functionality**: Downloads a file from a given URL to a specified local path. + +- **HTTP Client Behavior**: + - Uses `http.Get()` for simple, synchronous GET requests. + - Validates the HTTP status code; aborts on non-200 responses with a descriptive error. + +- **Resource Management**: + - Ensures proper cleanup via `defer resp.Body.Close()` and `defer out.Close()`. + +- **Progress Tracking**: + - Integrates [`progressbar`](https://github.com/schollz/progressbar) to display real-time download progress. + - Uses `DefaultBytes()` for a human-readable, byte-based indicator (e.g., "downloading 12.3 MB / 45.6 MB"). 
+ +- **Efficient I/O**: + - Leverages `io.Copy()` with an `io.MultiWriter` to stream data directly from the HTTP response body into both: + - The target file (`out`) + - The progress bar (to update on each chunk written) + +- **Error Handling**: + - Returns early with wrapped errors for network failures, HTTP non-success codes, or file I/O issues. + +- **Simplicity & Usability**: + - Minimal API surface: only two arguments (`url`, `filepath`). + - No external configuration needed — ideal for CLI tools or batch scripts. + +- **Assumptions**: + - No authentication, redirects, proxies, timeouts, or retries are implemented. + - Designed for straightforward downloads where robustness is secondary to simplicity. + +- **Typical Use Cases**: + - CLI utilities, build scripts, CI/CD pipelines. + - Prototyping or internal tools where advanced download features are unnecessary. + +- **Limitations**: + - Not suitable for large-scale or production-grade downloads without enhancements (e.g., retries, concurrency control). diff --git a/autodoc/docmd/pkg/obiutils/goutils.md b/autodoc/docmd/pkg/obiutils/goutils.md new file mode 100644 index 0000000..0b0766f --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/goutils.md @@ -0,0 +1,16 @@ +# `obiutils` Package Overview + +This Go package provides utility functions for common data conversion, serialization, and reflection tasks. + +- **Custom Error Types**: Defines typed errors (`NotAnInteger`, `NotAFloat64`, etc.) for precise type validation failures. +- **Interface-to-Type Casting**: Offers robust conversion functions: + - `InterfaceToFloat64Map`, `InterfaceToIntSlice`, etc., handling nested interfaces and type coercion (e.g. `int` → `float64`, slices of `interface{}`). +- **File I/O**: `ReadLines` reads a file line-by-line into a string slice, handling buffered reading efficiently. +- **Concurrency**: `AtomicCounter` returns an incrementing integer generator—thread-safe via mutex, optionally starting from a given value. 
+- **JSON Serialization**: `JsonMarshal` and `JsonMarshalByteBuffer` provide UTF‑8–preserving JSON encoding (avoids Go’s default HTML escaping). +- **Reflection Helpers**: + - `IsAMap`, `IsASlice`, `IsAnArray` detect container types. + - `HasLength`, `Len`, and `IsAContainer` abstract length operations across maps, slices, arrays, or custom types with a `.Len()` method. +- **Deep Copying**: `MustFillMap` performs deep copying of nested structures using `go-deepcopy`. + +All functions prioritize safety, type correctness, and usability in data-heavy or concurrent applications. diff --git a/autodoc/docmd/pkg/obiutils/gzipfile.md b/autodoc/docmd/pkg/obiutils/gzipfile.md new file mode 100644 index 0000000..af8bbdf --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/gzipfile.md @@ -0,0 +1,37 @@ +# `obiutils` Package: File and Stream Writing Utilities + +The `obiutils` package provides a unified abstraction for writing data to files or streams, with optional gzip compression and buffered I/O. + +## Core Type: `Wfile` + +- Encapsulates a write-ready output stream (`io.WriteCloser`). +- Supports both **compressed** (gzip) and uncompressed modes. +- Uses `bufio.Writer` for efficient buffered writes. + +## Key Functions + +### `OpenWritingFile(name string, compressed bool, append bool) (*Wfile, error)` +- Opens a file for writing. + - `compressed`: enables gzip compression via `pgzip`. + - `append`: if true, writes at end of file (`os.O_APPEND`). +- Returns a ready-to-use `*Wfile`. + +### `CompressStream(out io.WriteCloser, compressed bool, close bool) (*Wfile, error)` +- Wraps an arbitrary `io.WriteCloser` (e.g., HTTP response, pipe) in buffered/compressed I/O. + - `close`: if true, the underlying writer is closed on `.Close()`. + +## Methods + +- **`Write(p []byte)` / `WriteString(s string)`**: + Buffered writes to the underlying stream (transparently compressed if enabled). + +- **`Close()`**: + - Flushes the buffer. + - Closes gzip writer (if compressed). 
+ - Closes underlying file/stream *only if* `close == true`. + +## Design Highlights + +- **Transparent compression**: Uses high-performance `pgzip` for parallel gzip. +- **Resource control**: Explicit flag (`close`) prevents premature closure of shared writers (e.g., in pipelines). +- **Efficiency**: Double buffering via `bufio.Writer` + gzip stream. diff --git a/autodoc/docmd/pkg/obiutils/memsize.md b/autodoc/docmd/pkg/obiutils/memsize.md new file mode 100644 index 0000000..c59a0c9 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/memsize.md @@ -0,0 +1,15 @@ +# `obiutils` Package: Memory Size Parsing and Formatting + +This Go package provides two complementary utility functions for handling human-readable memory sizes: + +- **`ParseMemSize(s string) (int, error)`** + Parses a memory size string into an integer number of bytes. Supports case-insensitive units: `B`, `K`/`KB`, `M`/`MB`, `G`/`GB`, and `T`/`TB`. + Examples: `"128K"` → `131072`, `"512MB"` → `536870912`. + Returns an error for invalid input (e.g., empty string, non-numeric prefix, or unknown unit). + +- **`FormatMemSize(n int) string`** + Converts a byte count into the most appropriate human-readable format using powers of 1024. + Uses suffixes `T`, `G`, `M`, or `K`; falls back to bytes (`B`) if < 1 KiB. + Integers are displayed without decimals (e.g., `2048` → `"2K"`), while fractional values use one decimal (e.g., `1536` → `"1.5K"`). + +Both functions ensure semantic clarity and consistency for memory-related I/O, logging, or configuration parsing. diff --git a/autodoc/docmd/pkg/obiutils/mimetypes.md b/autodoc/docmd/pkg/obiutils/mimetypes.md new file mode 100644 index 0000000..6f7014c --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/mimetypes.md @@ -0,0 +1,32 @@ +# OBIMimeUtils: Semantic Description of Features + +The `obiutils` Go package provides utilities for detecting and handling biological data file formats, primarily via MIME type inference. 
+ +## Core Functionalities + +- **BOM Detection (`HasBOM`)** + Identifies Byte Order Marks (BOMs) for UTF-8, UTF-16 BE/LE, and UTF-32 BE/LE encodings. Logs detected types for transparency. + +- **Last-Line Trimming (`DropLastLine`)** + Removes the final newline-delimited line from a byte slice — useful for sanitizing incomplete or truncated files. + +- **MIME Type Registration (`RegisterOBIMimeType`)** + Extends generic MIME types (e.g., `text/plain`, `application/octet-stream`) with format-specific detectors for: + - **CSV**: Validates structured comma-separated data (≥2 fields, ≥2 lines). + - **FASTA/FASTQ**: Regex-based detection of sequence headers (`>` or `@`). + - **ecoPCR2**: Detects files starting with the magic header `#@ecopcr-v2`. + - **GenBank/EMBL**: Checks for standard sequence record prefixes (`LOCUS`, `ID`). + +- **Format-Specific Extensions** + Registers custom MIME subtypes (e.g., `text/fasta`, `.fasta`) and associates them with appropriate file extensions. + +- **Idempotent Registration** + Ensures MIME detectors are registered only once using a guard flag. + +## Design Goals + +- Robust, lightweight format inference without full parsing. +- Extensible architecture for future bioinformatics formats. +- Logging-friendly (via `logrus`) to aid debugging and observability. + +This package enables accurate, context-aware MIME detection in pipelines handling heterogeneous biological data. diff --git a/autodoc/docmd/pkg/obiutils/minmax.md b/autodoc/docmd/pkg/obiutils/minmax.md new file mode 100644 index 0000000..318d623 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/minmax.md @@ -0,0 +1,36 @@ +# `obiutils` Package: Semantic Overview + +The `obiutils` package provides generic and reflection-based utilities for computing minima and maxima across multiple data structures in Go. + +## Core Features + +- **Generic `MinMax` / `Min/MaxSlice`**: + - `MinMax[T constraints.Ordered]`: Returns the ordered pair `(min, max)` of two values. 
+ - `MinMaxSlice[T constraints.Ordered]`: Finds min and max in a slice of ordered types (panics on empty input). + +- **Map-based Min/Max**: + - `MinMap` / `MaxMap`: Returns the key and value of the smallest/largest *value* in a map (errors on empty maps). + +- **Unified `Min` / `Max` Functions**: + - Accepts *any* Go value: single scalar, slice/array/map. + - Uses reflection to dispatch logic based on runtime type (`reflect.Kind`). + - Supports ordered kinds: integers, floats, strings (signed/unsigned ints via `constraints.Ordered` subset). + - Returns an error for unsupported or empty containers. + +- **Helper Reflection Functions**: + - `minFromIterable` / `maxFromIterable`: Scan slices/arrays. + - `minFromMap` / `maxFromMap`: Iterate over map values (ignores keys in comparisons). + - `isOrderedKind`, `less`, `greater`: Internal comparison logic for reflection-based ordering. + +## Design Highlights + +- **Type Safety & Generics**: Leverages Go 1.18+ generics for compile-time type constraints where possible. +- **Flexibility**: The `Min(data interface{})` / `Max(...)` functions allow a *single API* for heterogeneous inputs. +- **Error Handling**: Explicit errors (e.g., `"empty slice"`, `"unsupported type"`), no panics for user-facing APIs. +- **Fallback Support**: Checks if the input has a `Min()`/`Max()` method (via reflection) before falling back to generic logic. + +## Limitations + +- Reflection-based paths are slower than direct generics. +- No support for custom types without ordering defined (e.g., structs unless they satisfy `constraints.Ordered`). +- Maps compare only *values*; keys are irrelevant for min/max selection. 
diff --git a/autodoc/docmd/pkg/obiutils/minmultiset.md b/autodoc/docmd/pkg/obiutils/minmultiset.md new file mode 100644 index 0000000..3574953 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/minmultiset.md @@ -0,0 +1,35 @@ +# `MinMultiset[T]` — A Lazy-Delete Min-Multiset Implementation + +A generic, type-safe multiset data structure in Go that maintains elements with multiplicity and provides efficient access to the current minimum. Built on top of a min-heap (`container/heap`) with **lazy deletion** to support efficient removals without rebuilding the heap. + +## Core Features + +- ✅ **Generic over comparable types** (`T`) with custom ordering via `less` comparator +- ✅ **Multiset semantics**: supports multiple occurrences of the same value +- ✅ **O(log n) insertion** (`Add`) and **amortized O(1)** minimum access +- ✅ **Lazy deletion**: `RemoveOne` marks items for removal; physical cleanup occurs on next `Min()` call +- ✅ **Size tracking**: logical size (`Len()`) excludes deleted items, even if still in heap +- ✅ **Memory-efficient cleanup**: `shrink()` and `cleanTop()` prevent tombstone accumulation + +## API Summary + +| Method | Description | +|--------|-------------| +| `NewMinMultiset(less)` | Constructor; initializes heap, maps (`count`, `pending`), and sets ordering | +| `Add(v)` | Inserts one occurrence of `v`; increments logical size & count map | +| `RemoveOne(v)` | Removes *one* occurrence if present; returns success flag (`false` otherwise) | +| `Min()` | Returns current minimum (or zero value + `ok=false`) after cleaning stale top entries | +| `Len()` | Returns logical size (excludes pending deletions) | + +## Internal Mechanism + +- **`count[T]int`**: tracks how many times each value is *logically* present +- **`pending[T]int`**: tracks how many times each value is *marked for removal* +- **Heap invariant maintained only up to logical size** — stale entries are pruned lazily during `Min()` or after deletions +- **No manual cleanup needed** — the 
structure self-balances incrementally + +## Use Cases + +Priority queues with deletable arbitrary elements (e.g., Dijkstra’s algorithm where distances are updated), sliding-window minima, event scheduling with cancellation. + +> ⚠️ Note: `less` must define a *strict total order* (transitive, antisymmetric, connected) for correctness. diff --git a/autodoc/docmd/pkg/obiutils/path.md b/autodoc/docmd/pkg/obiutils/path.md new file mode 100644 index 0000000..043a43d --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/path.md @@ -0,0 +1,16 @@ +## `obiutils` Package: File Path Utility Functions + +This Go package provides two utility functions for manipulating file paths by removing extensions: + +### `RemoveAllExt(p string) string` +- **Purpose**: Strips *all* file extensions from a given path (e.g., `/dir/file.tar.gz` → `/dir/file`). +- **Mechanism**: Iteratively uses `path.Ext()` and `strings.TrimSuffix` to remove extensions from the *full path*, including directory components if they contain dots (though rare). +- **Use Case**: Useful when you need to sanitize a full path for naming or comparison, regardless of extension stacking. + +### `Basename(path string) string` +- **Purpose**: Extracts the base filename *without any extensions* (e.g., `/dir/file.tar.gz` → `file`). +- **Mechanism**: Uses `filepath.Base()` to get the filename, then iteratively strips extensions via `strings.TrimSuffix`. +- **Key Difference**: Operates *only on the filename*, not directory parts — safer and more conventional for typical file handling. + +Both functions handle multi-extension files (e.g., `.tar.gz`, `.backup.zip`) robustly. They avoid reliance on `strings.LastIndex` or regex, favoring clarity and standard library usage (`path`, `filepath`). +Designed for portability across Unix-like systems (uses forward slashes), though Windows paths are supported via `filepath.Base`. 
diff --git a/autodoc/docmd/pkg/obiutils/path_test.md b/autodoc/docmd/pkg/obiutils/path_test.md new file mode 100644 index 0000000..37806a3 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/path_test.md @@ -0,0 +1,19 @@ +# `obiutils` Package: Functional Overview + +The `obiutils` package provides utility functions for common file path manipulations in Go. Its current public API includes: + +- **`RemoveAllExt(path string) string`** + Strips *all* file extensions from a given path, returning the path without any trailing extension suffixes (e.g., `.txt`, `.tar.gz`). + - Handles paths with no extensions unchanged. + - Correctly processes single- and multi-part (e.g., `.tar.gz`) extensions. + - Designed for robustness across Unix-like and cross-platform path conventions. + +The package currently includes a single unit test suite: + +- **`TestRemoveAllExt(t *testing.T)`** + Validates the correctness of `RemoveAllExt` using three test cases: + • `"path/to/file"` → unchanged (`"path/to/file"`) + • `"path/to/file.txt"` → stripped to `"path/to/file"` + • `"path/to/file.tar.gz"` → fully stripped to `"path/to/file"` + +This ensures reliable behavior for downstream code relying on extension-agnostic path handling—e.g., in build systems, data pipelines, or file-processing tools. diff --git a/autodoc/docmd/pkg/obiutils/pipe.md b/autodoc/docmd/pkg/obiutils/pipe.md new file mode 100644 index 0000000..817bec0 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/pipe.md @@ -0,0 +1,36 @@ +# `obiutils` Package: Pipe Synchronization Utilities + +This Go package provides lightweight synchronization primitives for managing concurrent pipeline execution, particularly useful in CLI or batch-processing applications. + +## Core Components + +- **`globalLocker`:** A `sync.WaitGroup` tracking active pipeline goroutines. +- **`globalLockerCounter`:** An integer counter for logging/debugging the number of active pipes.
+ +## Public Functions + +### `RegisterAPipe()` +- Increments both the WaitGroup and counter. +- Logs current count at debug level (`log.Debugln`). +- Typically called when starting a new pipeline stage or goroutine. + +### `UnregisterPipe()` +- Decrements the WaitGroup and counter. +- Logs updated count at debug level. +- Should be invoked when a pipeline finishes (e.g., `defer UnregisterPipe()`). + +### `WaitForLastPipe()` +- Blocks until all registered pipes complete (`globalLocker.Wait()`). +- Intended to be called at the end of `main()`, ensuring graceful shutdown. + +## Semantic Use Case + +Enables safe, concurrent execution of multiple independent pipelines (e.g., data processing stages), ensuring the program waits for all to finish before exiting — without explicit channel or mutex management. + +## Design Notes + +- **Thread-safe** via `sync.WaitGroup`. +- **Minimalist**: No error handling; assumes correct usage. +- **Logging-focused** for observability in development/debug builds. + +> ⚠️ Not production-ready without additional safeguards (e.g., panic recovery, timeout support). diff --git a/autodoc/docmd/pkg/obiutils/ranks.md b/autodoc/docmd/pkg/obiutils/ranks.md new file mode 100644 index 0000000..76a6b5d --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/ranks.md @@ -0,0 +1,35 @@ +# `obiutils` — Semantic Description of Core Functionality + +This Go package provides generic and type-specific utilities for **ranking** and **ordering** data without modifying the original slice. It leverages Go’s `sort` package to compute index permutations that reflect sorted order. + +## Key Components + +- **IntOrder(data []int) []int** + Returns indices that would sort a slice of integers in *ascending* order. The original data remains unchanged. + +- **ReverseIntOrder(data []int) []int** + Same as `IntOrder`, but returns indices for *descending* order. + +- **Order[T sort.Interface](data T) []int** + Generic version accepting any type implementing `sort.Interface`. 
Returns stable sorted indices. + +## Internal Design + +- **intRanker** and **Ranker[T]**: Helper types wrapping data + index list (`r`). + They implement `sort.Interface` *indirectly*—sorting indices instead of mutating data. + +- **Index-based sorting**: + By permuting a list of indices (`r = [0,1,...]`), the original data is never copied or altered—ideal for large datasets or immutable inputs. + +- **Stability**: `Order` uses `sort.Stable`, preserving relative order of equal elements. + +## Use Cases + +- Sorting metadata (e.g., sorting labels by associated scores). +- Preparing orderings for downstream operations (plots, ranking metrics). +- Efficiently tracking original positions after sort. + +## Constraints + +- Requires `sort.Interface` for generic version (e.g., custom structs with methods). +- Returns empty slice (`nil`) on zero-length input. diff --git a/autodoc/docmd/pkg/obiutils/set.md b/autodoc/docmd/pkg/obiutils/set.md new file mode 100644 index 0000000..2576965 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/set.md @@ -0,0 +1,34 @@ +# `obiutils.Set` — Generic Set Implementation in Go + +This package provides a generic, type-safe set data structure for Go (1.20+), leveraging generics (`comparable` constraint). It supports common set operations with intuitive APIs. + +## Core Features + +- **Generic Type Support**: `Set[E]` works for any comparable type (e.g., `int`, `string`, custom structs with equality). +- **Memory-Efficient Representation**: Implemented as a map from element to empty struct (`struct{}{}`), minimizing memory overhead. +- **Immutability by Default**: Methods like `Union` and `Intersection` return *new* sets; in-place mutation is explicit (e.g., via `Add()`). + +## Key Functions & Methods + +| Function/Method | Description | +|-----------------|-------------| +| `MakeSet[E](vals ...E)` | Creates and returns a new set populated with given values. | +| `NewSet[E](vals ...E)` | Same as `MakeSet`, but returns a pointer (`*Set[E]`). 
| +| `(s Set[E]) Add(vals ...E)` | Inserts one or more elements into the set (in-place). | +| `(s Set[E]) Contains(v E) bool` | Checks membership of an element. O(1). | +| `(s Set[E]) Members() []E` | Returns all elements as a slice (order not guaranteed). | +| `(s Set[E]) String() string` | Human-readable representation via `fmt.Sprintf`. | +| `(s Set[E]) Union(s2 Set[E])` | Returns a new set containing elements from both sets. | +| `(s Set[E]) Intersection(s2 Set[E])` | Returns a new set with elements common to both sets. | + +## Example Usage + +```go +s1 := obiutils.MakeSet(1, 2, 3) +s2 := obiutils.NewSet("a", "b") +fmt.Println(s1.Contains(2)) // true +union := s1.Union(MakeSet(3, 4)) +fmt.Println(union.Members()) // e.g., [1 2 3 4] +``` + +> Designed for clarity, performance, and idiomatic Go usage. diff --git a/autodoc/docmd/pkg/obiutils/set_test.md b/autodoc/docmd/pkg/obiutils/set_test.md new file mode 100644 index 0000000..be95fc4 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/set_test.md @@ -0,0 +1,36 @@ +# `obiutils` Package: Set Implementation in Go + +The `obiutils` package provides a generic, type-safe set data structure for Go (v1.18+), along with comprehensive unit tests. + +## Core Features + +- **Generic Set Type**: Implemented as `Set[T]`, using a map for O(1) membership checks. +- **Constructors**: + - `MakeSet[T](...T)` returns a new set populated with given elements. + - `NewSet[T]()` allocates an empty pointer to a set; useful for dynamic initialization. +- **Methods**: + - `Add(...T)` inserts one or more elements (idempotent). + - `Contains(T) bool` checks membership. + - `Members() []T` returns a sorted slice of elements (deterministic iteration). + - `String() string` provides human-readable representation (`[a b c]` format). +- **Set Operations**: + - `Union(other Set[T]) Set[T]`: returns a new set with elements in either operand. + - `Intersection(other Set[T]) Set[T]`: returns a new set with elements common to both. 
+ +## Test Coverage + +Unit tests validate: +- Set creation (empty, single/multiple values). +- Element addition and membership. +- String formatting for various sizes. +- Correctness of union/intersection across edge cases (empty sets, disjoint/common elements). + +All tests use `reflect.DeepEqual` for precise structural comparison and sort outputs where order is non-deterministic. + +## Design Notes + +- Immutable operations: methods return *new* sets rather than mutating in-place. +- No duplicate support (standard set semantics). +- Efficient storage via Go maps; no external dependencies. + +> **Note**: This is a minimal, idiomatic set implementation—ideal for utility or testing contexts. diff --git a/autodoc/docmd/pkg/obiutils/slices.md b/autodoc/docmd/pkg/obiutils/slices.md new file mode 100644 index 0000000..a893faf --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/slices.md @@ -0,0 +1,22 @@ +# `obiutils` Package Overview + +The `obiutils` package provides generic, reusable utility functions for common slice operations in Go. + +- **`Contains[T comparable](arr []T, x T) bool`** + Checks whether a given element exists in the slice. Uses generic type `T`, requiring only that it supports equality comparison. + +- **`LookFor[T comparable](arr []T, x T) int`** + Returns the index of the *first* occurrence of `x`, or `-1` if not found. Also generic over comparable types. + +- **`RemoveIndex[T comparable](s []T, index int) []T`** + Removes the element at `index`, returning a new slice. Works in O(1) time (amortized), using `append` to rebuild the slice. + +- **`Reverse[S ~[]E, E any](s S, inplace bool) S`** + Reverses the slice elements. If `inplace = true`, modifies the original; otherwise, copies first and returns a reversed copy. Uses type constraint `~[]E` for flexibility across slice aliases. + +All functions are designed to be: +- Type-safe via Go generics (no reflection), +- Efficient and idiomatic, +- Well-documented with clear parameter/return semantics. 
+ +Ideal for use in data processing, validation logic, or general-purpose slice manipulation. diff --git a/autodoc/docmd/pkg/obiutils/strings.md b/autodoc/docmd/pkg/obiutils/strings.md new file mode 100644 index 0000000..847ccea --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/strings.md @@ -0,0 +1,33 @@ +# `obiutils` Package Overview + +The `obiutils` package provides low-level, high-performance utilities for ASCII string and set manipulation in Go. + +### Core Components + +- **`AsciiSet[256]bool`**: A compact boolean lookup table indexed by byte value (0–255, covering the full byte range), optimized for membership tests. +- **Predefined Sets**: + - `AsciiSpaceSet`: Whitespace characters (`\t\n\v\f\r `) + - `AsciiDigitSet`, `AsciiUpperSet`, `AsciiLowerSet` + - Derived sets: `Alpha` (letters), `Alnum` (alphanumeric) + +### Key Functions + +- **Set Operations**: + - `AsciiSetFromString(s string)`: Build a set from characters in a literal. + - `.Contains(c byte)` / `.Union()` / `.Intersect()`: Efficient membership and set algebra. + +- **String Parsing & Transformation**: + - `UnsafeStringFromBytes([]byte) string`: Zero-copy conversion (⚠️ unsafe; use only when memory safety is externally guaranteed). + - `FirstWord(s string)`: Extract first non-whitespace token. + - `(AsciiSet).FirstWord(...) (string, error)`: Same as above but validates characters against a restriction set. + - `TrimLeft(s string)` (via method on *AsciiSet): Remove leading whitespace using space-aware logic. + - `LeftSplitInTwo(s string, sep byte)`: Split at first occurrence of a separator. + - `RightSplitInTwo(s string, sep byte)`: Split at last occurrence. + +### Design Goals + +- **Performance**: Avoid allocations where possible (e.g., `unsafe.String`, direct indexing). +- **Simplicity**: Focused on ASCII-only operations for speed and predictability. +- **Safety Trade-offs**: `UnsafeStringFromBytes` trades safety for efficiency; other functions are safe and bounds-checked. 
+ +Intended use: embedded systems, parsers, or performance-critical text processing where standard library overhead is undesirable. diff --git a/autodoc/docmd/pkg/obiutils/tar.md b/autodoc/docmd/pkg/obiutils/tar.md new file mode 100644 index 0000000..94c8a76 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/tar.md @@ -0,0 +1,24 @@ +# `TarFileReader` — Semantic Description + +The function `TarFileReader`, defined in the Go package `obiutils`, provides a targeted extraction capability for files within a TAR archive. + +- **Input**: + - `file`: A generic reader (`*Reader`) implementing the standard Go `io.Reader` interface — typically wrapping an archive file or stream. + - `path`: A string specifying the *exact* path (relative to archive root) of the desired file inside the TAR. + +- **Core Logic**: + - Instantiates a `tar.Reader` from the provided input stream. + - Iterates sequentially over TAR entries using `Next()`. + - Compares each entry’s header name (`header.Name`) with the requested `path`. + +- **Output**: + - On match: Returns a pointer to the *current* `tar.Reader`, positioned at the start of the requested file’s content (ready for subsequent reads). + - On failure: Returns `nil` and a formatted error `"file not found: "`. + +- **Semantics**: + - Enables *lazy*, on-demand access to a specific file inside a TAR archive — without decompressing the entire structure. + - Assumes exact path matching (no globbing, wildcards, or directory traversal). + - Does *not* handle symbolic links, hardlinks, or nested archives — only plain file entries. + +- **Use Case**: + Ideal for lightweight tools that need to inspect or extract a single known file from large TAR archives (e.g., config files, manifests), minimizing memory and I/O overhead. 
diff --git a/autodoc/docmd/pkg/obiutils/unsafe.md b/autodoc/docmd/pkg/obiutils/unsafe.md new file mode 100644 index 0000000..f06f300 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/unsafe.md @@ -0,0 +1,25 @@ +# `obiutils`: Unsafe String–Byte Conversions in Go + +This package provides low-level, zero-copy utilities for converting between `string` and `[]byte` in Go using the `unsafe` package. + +## Core Functions + +- **`UnsafeBytes(str string) []byte`** + Converts a `string` to a mutable byte slice **without copying**, by directly accessing the underlying memory. + ⚠️ *Unsafe*: Modifications to the returned slice may corrupt or alter the original string (undefined behavior). + Use only when performance is critical and immutability can be guaranteed. + +- **`UnsafeString(b []byte) string`** + Converts a `[]byte` to an immutable `string`, again **without copying**, by reinterpreting the byte slice’s memory as a string. + ⚠️ *Unsafe*: If `b` is later modified, the resulting string may become invalid (memory safety violation). + Requires that `b` remains immutable for the lifetime of the returned string. + +## Semantic Purpose + +These functions enable high-performance interop between strings and byte slices—critical in systems programming, serialization frameworks, or memory-constrained environments where allocation overhead must be avoided. + +## Risks & Best Practices + +- **Never mutate the returned slice or original input after conversion**. +- Prefer standard conversions (`[]byte(s)`, `string(b)`) unless profiling confirms a measurable bottleneck. +- Ensure inputs are valid and owned (e.g., not shared across goroutines without synchronization). 
diff --git a/autodoc/docmd/pkg/obiutils/xopen.md b/autodoc/docmd/pkg/obiutils/xopen.md new file mode 100644 index 0000000..8544ad1 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/xopen.md @@ -0,0 +1,38 @@ +# `obiutils` — Universal File I/O with Transparent Compression Support + +The `xopen`-based package in the `obiutils` module provides a unified interface for reading and writing files, streams, HTTP resources, or command outputs—**transparently handling multiple compression formats**: gzip, xz, zstd, and bzip2. + +## Key Functionalities + +- **`Ropen(f string)`** + Opens a file, stdin (`"-"`), HTTP(S) URL, or shell command (e.g., `"|gzip -dc file.gz"`) for **buffered reading**, auto-detecting compression via magic bytes. + +- **`Wopen(f string)` / `WopenFile(...)`** + Opens a file or stdout (`"-"`) for **buffered writing**, automatically compressing output based on extension (`.gz`, `.xz`, `.zst`, `.bz2`). + +- **Compression Detection** + Functions like `IsGzip()`, `IsXz()`, `IsZst()`, and `IsBzip2()` inspect the first bytes of a buffered reader to infer format. + +- **Path Utilities** + - `ExpandUser(path)` expands POSIX-style paths (`~`, `~/path`) to absolute ones. + - `Exists(path)` checks file existence after user expansion. + +- **Error Handling** + Defines semantic errors: `ErrNoContent`, `ErrDirNotSupported`. + +- **Buffered IO** + All readers/writers use a default buffer size of `65,536` bytes for performance. + +- **Resource Management** + `Close()` methods ensure proper cleanup of underlying readers/writers and compression streams. 
+ +## Supported Sources & Formats + +| Source | Format(s) | +|-------------------|------------------------| +| Local files | plain, `.gz`, `.xz`, `.zst`, `.bz2` | +| Stdin (`"-"`) | auto-detected | +| HTTP(S) URLs | transparent decompression on stream read | +| Pipe commands (`"|cmd"`) | output piped and auto-decompressed | + +This abstraction simplifies bioinformatics or data-processing pipelines where input sources vary widely, and compression is common. diff --git a/autodoc/docmd/pkg/obiutils/xopen_test.md b/autodoc/docmd/pkg/obiutils/xopen_test.md new file mode 100644 index 0000000..5c02288 --- /dev/null +++ b/autodoc/docmd/pkg/obiutils/xopen_test.md @@ -0,0 +1,19 @@ +# `obiutils` Package: Semantic Overview + +The `xopen.go` test suite (via GoCheck) validates utility functions for flexible file/stream I/O in Go. Key features: + +- **`IsGzip()`**: Detects gzip compression by inspecting the first two bytes (`0x1f 0x8b`) of a `bufio.Reader`. +- **`Ropen()`**: Unified reader opener supporting: + - Local files (plain or `.gz`) + - Standard input (`"-"`) — *note: currently unimplemented in tests* + - HTTP(S) URLs (via `net/http`) +- **`Wopen()`**: Unified writer opener for: + - Local files (`".gz"` triggers gzip compression) + - Standard output via `"-"` +- **`Exists()`**: Checks file/directory existence (supports `~` expansion). +- **`ExpandUser()`**: Expands shell-like paths (`~/...`) to absolute ones. +- **Tested robustness**: + - Handles missing files, invalid URLs (404), and malformed paths. + - Validates gzip detection accuracy on both plain and compressed data. + +All operations abstract away compression/format details, enabling uniform read/write semantics across local files, pipes (commented out), and remote HTTP resources. 
diff --git a/autodoc/docmd/pkg_obialign.md b/autodoc/docmd/pkg_obialign.md new file mode 100644 index 0000000..49648bb --- /dev/null +++ b/autodoc/docmd/pkg_obialign.md @@ -0,0 +1,76 @@ +# `obialign` Package: Semantic Overview + +The `obialign` package delivers high-performance, memory-efficient utilities for biological sequence alignment within the OBITools4 ecosystem. It targets amplicon and metagenomic data processing, emphasizing speed, numerical stability, and scalability. + +--- + +## Core Functionalities + +### 1. **Sequence Encoding & Decoding** +- `Encode4bits`: Converts IUPAC nucleotides (including ambiguous codes like R, Y, N) into compact 4-bit representations. +- Supports bitwise operations for rapid comparison (e.g., via `_FourBitsBaseCode`). +- Handles gaps (`.`/`-`) and invalid characters as `0b0000`. + +### 2. **Alignment Scoring & Probability Models** +- `_MatchRatio`, `_NucPartMatch`: Compute match likelihoods using bitwise overlap of encoded bases. +- Log-space helpers (`_Logaddexp`, `_Logdiffexp`) ensure numerical stability in probabilistic scoring. +- Quality-aware scores via precomputed matrices (`_NucScorePartMatch{Match,Mismatch}`), incorporating Phred scores and base composition priors. + +### 3. **Dynamic Programming (DP) Backtracking** +- `_Backtracking`: Reconstructs optimal alignment paths from precomputed matrices. + - Encodes diagonal runs and gap segments as alternating `(offset, length)` pairs. +- Optimized for batch reuse of path buffers and minimal allocations. + +### 4. **Longest Common Subsequence (LCS) with Error Tolerance** +- `FastLCSEGFScore`, `FastLCSScore`: Compute LCS under bounded error (`maxError`) and optional end-gap-free mode. + - Uses diagonal banding for efficiency. +- Designed for rapid similarity filtering (e.g., UMI/OTU clustering). + +### 5. **Single-Edit Distance Detection** +- `D1Or0`: Determines if two sequences are identical or differ by exactly one edit (substitution/indel). 
+ - Early termination on length mismatch or multiple divergences. + - Critical for error correction and dereplication. + +### 6. **Local Pattern Matching** +- `LocatePattern`: Finds optimal approximate match of a short query (e.g., primer) in longer sequence. + - End-gap-free alignment, using DP with mismatch/gap penalty `-1`. + - Returns start/end positions and error count. + +### 7. **Paired-End Read Alignment** +- `PEAlign`, `_FillMatrixPeLeftAlign`, etc.: Global alignment of paired-end reads with affine gap penalties. + - Supports three modes: `PELeftAlign`, `PERightAlign`, and `PECenterAlign` (for overlaps). + - Integrates k-mer pre-screening (`obikmer.Index4mer`) for fast overlap estimation. + - Quality-aware scoring via `_PairingScorePeAlign`. + +### 8. **Consensus & Alignment Reconstruction** +- `BuildAlignment`, `_BuildAlignment`: Reconstruct aligned sequences from DP path, reusing buffers. +- `BuildQualityConsensus`: Generates consensus with quality-aware base selection: + - Mismatches resolved by higher-quality call or IUPAC ambiguity. + - Optional mismatch statistics recording. + +### 9. **Memory & Performance Optimization** +- `PEAlignArena`: Reusable memory arena for matrices, paths, and buffers. + - Reduces GC pressure in high-throughput pipelines. +- Compact `uint64` encoding for scores, path lengths, and flags (`encodeValues`, `_incscore`). + - Enables fast comparisons during DP. + +--- + +## Design Principles + +- **IUPAC-aware**: Handles ambiguous nucleotides via `obiseq.SameIUPACNuc`. +- **Thread-safe initialization**: `_InitDNAScoreMatrix` uses mutex guards. +- **No allocations in hot paths**: Buffers reused across calls (arena pattern). +- **End-gap flexibility**: Critical for read merging and primer trimming. 
+ +--- + +## Use Cases + +| Functionality | Application | +|---------------|-------------| +| `FastLCSEGFScore`, `D1Or0` | OTU/ASV clustering, UMI deduplication | +| `LocatePattern`, `PEAlign` | Primer trimming, read merging in metabarcoding | +| `BuildQualityConsensus`, `_Backtracking` | Consensus generation post-merge | + +Designed for integration into large-scale NGS pipelines—especially where speed, memory footprint, and numerical robustness are critical. diff --git a/autodoc/docmd/pkg_obiapat.md b/autodoc/docmd/pkg_obiapat.md new file mode 100644 index 0000000..edd0f94 --- /dev/null +++ b/autodoc/docmd/pkg_obiapat.md @@ -0,0 +1,88 @@ +# `obiapat`: High-Performance Approximate Pattern Matching for Biological Sequences + +The `obiapat` Go package delivers **fast, memory-safe approximate pattern matching** over biological sequences (DNA/RNA), leveraging a C-based implementation of the **Apat algorithm**. Designed for NGS preprocessing (e.g., primer detection, adapter trimming), it supports fuzzy matching with mismatches/indels, reverse-complement search, circular topology handling, and efficient non-overlapping match filtering—all while integrating seamlessly with the OBITools4 ecosystem. + +## Core Concepts + +- **`ApatPattern`**: Compiled pattern object (≤64 bp) supporting: + - IUPAC ambiguity codes (`W`, `R`, `[AT]`) + - Negated bases (`!A` = "not A") + - Fixed-position anchors (`#`) +- **`ApatSequence`**: Lightweight wrapper around `obiseq.BioSequence`, enabling optimized pattern scanning with optional circular indexing and memory recycling. + +## Public API + +### Pattern Construction & Transformation +- **`MakeApatPattern(pattern string, errormax int, allowsIndel bool) (*ApatPattern, error)`** + Compiles a pattern string into an executable automaton. Supports: + - `errormax`: Max allowed errors (substitutions only if `allowsIndel=false`; indels included otherwise). 
+ - Pattern syntax: e.g., `"A[T]C!GT#"` → matches "A", then any A/T, then C, allows 1 mismatch at position `!G`, requires exact match at anchored `#T`. +- **`ReverseComplement() *ApatPattern`** + Returns a new pattern representing the reverse complement (essential for strand-agnostic DNA searches). +- **`Len() int`** + Returns the pattern’s length in bases. + +### Matching & Search Operations + +- **`FindAllIndex(seq *ApatSequence, start, end int) [][3]int`** + Returns all valid matches in `[start_pos, end_pos, error_count]` format within `seq[start:end)`. + - Supports partial sequence scans (e.g., for sliding windows). +- **`IsMatching(seq *ApatSequence, start, end int) bool`** + Fast boolean check: does the pattern match *anywhere* in `seq[start:end)` within error tolerance? +- **`BestMatch(seq *ApatSequence, start, end int) (start, end, errors int)`** + Finds the *lowest-error* match in a region. For indel patterns, performs local realignment to refine alignment boundaries. +- **`FilterBestMatch(seq *ApatSequence, start, end int) [][3]int`** + Returns **non-overlapping matches**, prioritizing lower-error occurrences (greedy selection from best to worst). +- **`AllMatches(seq *ApatSequence, start, end int) [][3]int`** + Computes all valid matches (including indel-aware realignment), then filters to non-overlapping set using `FilterBestMatch`. + +### Resource Management +- **`Free()`** + Explicitly releases C-level resources. Finalizers auto-cleanup, but manual `Free()` is recommended in hot loops for predictable memory use. + +## PCR Simulation Module (`PCRSim` family) + +Implements *in silico* PCR with configurable primer tolerance and amplicon constraints: + +- **`PCRSim(seq obiseq.BioSequence, opts ...Option) []Amplicon`** + Simulates PCR on a single sequence. 
Options include: + - `OptionForwardPrimer(pattern string, errormax int)` / `OptionReversePrimer(...)` + - `OptionMinLength(n)`, `OptionMaxLength(n)` → filter amplicons by size + - `OptionWithExtension(len int, strict bool)` → add flanking regions (trim if `strict=false`) + - `OptionCircular(bool)` → handle circular DNA topology +- **`PCRSlice(seqs []obiseq.BioSequence, opts ...Option) [][]Amplicon`** + Batch PCR across multiple sequences. +- **`PCRSliceWorker(opts ...Option) func(int, obiseq.BioSequence) (int, interface{})`** + Returns a reusable worker for parallel execution via `obiseq.MakeISliceWorker`. + +### Output Format +Each amplicon includes: +- Coordinates, primer positions/errors/directions +- Flanking extensions (if requested) +- Original sequence metadata preserved + +## Predicate Generator: `IsPatternMatchSequence` + +Returns a **reusable function** for sequence filtering: +```go +func IsPatternMatchSequence( + pattern string, errormax int, + bothStrand bool, allowIndel bool +) obiseq.SequencePredicate +``` +- Internally builds `ApatPattern` + reverse complement (if needed). +- Predicate logic: + ```go + func(seq *obiseq.BioSequence) bool { + return pattern.IsMatching(...) || (!bothStrand && false) + || rcPattern.IsMatching(...) + } + ``` +- Ideal for high-throughput read filtering (e.g., barcode detection, primer contamination checks). + +## Implementation Highlights + +- **C interoperability** via `cgo` with custom memory management (no Go heap copies). +- **Finalizers + manual `Free()`** prevent leaks in long-running pipelines. +- Uses `unsafe.SliceData` for zero-copy sequence access during matching. +- Logging via **Logrus** (errors at `ErrorLevel`, debug amplicon details at `DebugLevel`). 
diff --git a/autodoc/docmd/pkg_obichunk.md b/autodoc/docmd/pkg_obichunk.md new file mode 100644 index 0000000..f7305e2 --- /dev/null +++ b/autodoc/docmd/pkg_obichunk.md @@ -0,0 +1,103 @@ +# `obichunk`: High-Performance Chunking and Dereplication of Biological Sequences + +The `obichunk` package provides scalable, configurable infrastructure for preprocessing large-scale biological sequence data (e.g., FASTA/FASTQ). It enables efficient grouping, sorting, deduplication, and batched streaming of sequences—critical for metabarcoding, metagenomics, or any high-throughput NGS workflow. + +--- + +## Core Functionalities + +### `ISequenceChunk` +Unified entry point for sequence chunking, supporting both **in-memory** and **on-disk** execution modes. +- Accepts an `obiiter.IBioSequence` iterator and a classifier (`obiseq.BioSequenceClassifier`). +- Mode selection via `onMemory` flag: routes to either `ISequenceChunkOnMemory` or `ISequenceChunkOnDisk`. +- Optional parameters: + - `dereplicate`: deduplicate identical sequences per batch. + - `na`: defines placeholder for missing/ambiguous characters (e.g., `"N"`, `"?"`). + - `statsOn`: enables metadata tracking (e.g., sample IDs, primer names) for statistics. + - `uniqueClassifier`: optional secondary classifier to assign unique labels. + +Returns an iterator over processed sequences (`obiiter.IBioSequence`), supporting streaming pipelines and downstream integration. + +--- + +### `ISequenceChunkOnDisk` +Efficiently splits sequences into **temporary on-disk batches** (`.fastx` files), ideal for large datasets. +- Automatically manages a temp directory (`obiseq_chunks_*`) and cleans up post-processing. +- Uses `find` to discover all generated chunk files recursively. +- Asynchronous streaming: batches are yielded via an iterator as they’re written, decoupling production and consumption. +- Optional per-batch dereplication using composite keys (sequence + classification). +- Logs batch count and start events for monitoring. 
+ +Internally leverages: +- `obiiter.MakeIBioSequence()` to build output iterator. +- `obiformats.WriterDispatcher` for parallel file writing. +- A dedicated goroutine to read, classify/dereplicate, and emit batches. + +--- + +### `ISequenceChunkOnMemory` +Performs **in-memory parallel chunking** of sequences into classification-based batches. +- Routes each sequence to a bucket (flux) using the classifier. +- Maintains one `BioSequenceSlice` per classification group in memory (thread-safe via mutex). +- Emits batches **only after full input consumption**, preserving deterministic batch order (0, 1, …). +- Parallel processing: each flux handled in its own goroutine. +- Fails fast on internal errors (e.g., channel issues) via `log.Fatalf`. + +Ideal for RAM-sufficient workloads requiring low-latency, ordered batch output. + +--- + +### `Options` System +Configurable pipeline behavior via functional options pattern. +- Immutable configuration builder: `MakeOptions([]WithOption)` applies setters to internal struct. +- Key options: + - **Categorization**: `OptionSubCategory(...)` appends sample/marker labels; `PopCategories()` retrieves first. + - **Missing values**: `OptionNAValue(na)` customizes placeholder (default: `"?"`). + - **Statistics**: `OptionStatOn(...)` registers fields for metadata tracking. + - **Batching**: + - `OptionBatchCount(n)` sets number of batches (e.g., for hashing). + - `OptionsBatchSize(size)` defines items per batch. + - **Concurrency**: `OptionsParallelWorkers(n)`. + - **Sorting strategy**: + - `OptionSortOnDisk()` enables disk-backed sorting. + - `OptionSortOnMemory()` (default) uses RAM-based sort. + - **Singleton filtering**: + - `OptionsNoSingleton()` excludes singleton reads (count = 1). + - `OptionsWithSingleton()` allows them. + +Defaults drawn from `obidefault`, ensuring reproducibility and ease of use. + +--- + +### `ISequenceSubChunk` +Parallel, class-based sorting and re-batching of sequence batches. 
+- Input: iterator over `BioSequenceBatch`, classifier, and worker count. +- For each batch: + - If size >1: sequences are sorted *in-place* by classification code (via custom `sort.Interface`). + - Consecutive sequences with same class are regrouped into new batches. +- Uses atomic counters (`nextOrder`) to assign globally increasing order IDs across workers—ensuring deterministic inter-batch ordering. +- Preserves input-order *within* each new batch. + +Use case: preparing sorted, class-homogeneous batches for downstream tasks (e.g., consensus calling or alignment). + +--- + +### `IUniqueSequence` +End-to-end **dereplication** pipeline: groups identical sequences, aggregates counts and metadata. +- Input iterator + optional `Options`. +- Parallelization via configurable workers (falls back to single-threaded if disk sorting enabled). +- **Splitting phase**: + - Uses `HashClassifier` to partition input deterministically (controlled by `BatchCount`). +- **Storage selection**: + - In-memory: via `ISequenceChunkOnMemory`. + - On-disk: uses `ISequenceSubChunk` + external sort (single worker required). +- **Uniqueness logic**: + - Composite classifier: sequence identity + optional annotations (sample, primer). + - NA handling for missing annotation fields. +- **Singleton filtering**: optionally excludes reads with count =1 (`NoSingleton()`). +- **Parallel deduplication**: + - Workers process chunks via `ISequenceSubChunk` + per-group aggregation. +- **Merging**: + - Aggregates results via `IMergeSequenceBatch`, preserving counts, stats, and ordering. + +Scalable from small datasets to terabyte-scale NGS runs. 
diff --git a/autodoc/docmd/pkg_obicorazick.md b/autodoc/docmd/pkg_obicorazick.md new file mode 100644 index 0000000..81425e6 --- /dev/null +++ b/autodoc/docmd/pkg_obicorazick.md @@ -0,0 +1,70 @@ +# `obicorazick`: Aho-Corasick-Based Sequence Analysis Package + +`obicorazick` is a high-performance Go library for rapid pattern detection in biological sequences (e.g., FASTA/FASTQ), designed to scale efficiently with large pattern sets. Built on the Aho-Corasick algorithm, it enables concurrent scanning of sequences against thousands to millions of patterns—ideal for primer screening, contamination checks, or taxonomic classification. + +## Public API + +### `AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker` + +Constructs a **sequence worker function** that scans input sequences for matches against the provided `patterns`, using *multiple* Aho-Corasick automata compiled in parallel (batched internally to manage memory). + +- **Input**: + - `slot`: Name of the attribute field where match counts will be stored (e.g., `"primer_hits"`). + - `patterns`: List of DNA/RNA patterns (strings) to search for. + +- **Behavior**: + - Splits `patterns` into batches of ≤10⁷ items (configurable via environment). + - Compiles one Aho-Corasick matcher per batch in parallel (using `obidefault.ParallelWorkers()`). + - For each sequence: scans both the forward strand and its reverse complement. + - Records three counts as attributes on the sequence: + ```text + → total matches (forward + rev-comp) + _Fwd → forward-strand-only matches + _Rev → rev-comp-specific (i.e., not found on forward) matches + ``` + - Logs match counts at debug level (via Logrus). + +- **Use case**: Annotating sequences with pattern-hit statistics for downstream analysis (e.g., reporting primer coverage per read). 
+ +--- + +### `AhoCorazickPredicate(minMatches int, patterns []string) obiseq.SequencePredicate` + +Returns a **boolean predicate function** that tests whether sequences contain ≥ `minMatches` occurrences of any pattern. + +- **Input**: + - `minMatches`: Minimum number of total matches required to pass the predicate. + - `patterns`: List of patterns (same format as above). + +- **Behavior**: + - Compiles a *single* Aho-Corasick matcher (no batching—assumes pattern set is moderate-sized or memory-safe). + - Scans only the forward strand (for efficiency in filtering contexts where rev-comp is unnecessary). + - Returns `true` if match count ≥ `minMatches`; otherwise `false`. + +- **Use case**: Filtering sequences—e.g., retain only reads containing ≥2 barcode primers, or discard those matching known contaminants. + +--- + +## Implementation Notes (Non-Exported) + +While not part of the public API, internal behavior includes: +- **Batching logic**: Splits patterns to avoid memory exhaustion during automaton construction. +- **Parallel compilation**: Uses goroutines + sync.WaitGroup, respecting `GOMAXPROCS`. +- **Progress feedback**: Optional CLI progress bar (via `progressbar/v3`) when enabled globally. +- **Logging**: Info/debug messages via Logrus (e.g., “Built 3 matchers in parallel” or “Sequence X: 5 total matches”). + +## Typical Workflows + +1. **Annotation pipeline**: + ```go + worker := AhoCorazickWorker("contam", contaminantDB) + annotatedSeqs := obiseq.Map(worker, inputSequences) + ``` + +2. **Filtering pipeline**: + ```go + filter := AhoCorazickPredicate(1, barcodePatterns) + filteredSeqs := obiseq.Filter(filter, inputSequences) + ``` + +Designed for speed and memory efficiency in large-scale NGS data processing. 
diff --git a/autodoc/docmd/pkg_obidefault.md b/autodoc/docmd/pkg_obidefault.md new file mode 100644 index 0000000..688bcf7 --- /dev/null +++ b/autodoc/docmd/pkg_obidefault.md @@ -0,0 +1,56 @@ +# Obidefault Package: Centralized Configuration Module + +The `obidefault` package provides a unified, runtime-configurable interface for core application-level settings in the Obitools ecosystem. It centralizes global state related to batching, compression, verbosity, progress reporting, quality handling, taxonomy resolution, and parallelism—enabling consistent behavior across modules without parameter passing or recompilation. + +## Batch Configuration + +Controls sequence batching for efficient processing: +- `SetBatchSize(n)`, `_BatchSize()` → Minimum sequences per batch (default: 1). +- `SetBatchSizeMax(n)`, `_BatchSizeMax()` → Hard upper limit on batch size (default: 2000). +- `SetBatchMem(n)`, `_BatchMem()` → Memory cap per batch in bytes (default: 128 MB); `0` disables memory-based batching. +- `_BatchMemStr()` stores the raw CLI string (e.g., `"256M"`) for parsing. +- Supports configuration via `--batch-size`/`OBIBATCHSIZE`, and `--batch-mem`. + +## Output Compression + +Toggles compression of output streams: +- `SetCompressOutput(bool)`, `CompressOutput()` → Enable/disable compression globally. +- Pointer access via `CompressOutputPtr()` for dynamic binding. + +## Warning Verbosity + +Suppresses warning messages when enabled: +- `SetSilentWarning(bool)`, `SilentWarning()` → Control warning output. +- Pointer access via `SilentWarningPtr()`. When true, all warnings should be suppressed (implementation-dependent). + +## Progress Bar Visibility + +Enables/disables progress bar rendering: +- `SetNoProgressBar(bool)`, `NoProgressBar()` → Disable/enable bars (default: enabled). +- `ProgressBar()` returns the inverse of `NoProgressBar()`. +- Pointer access via `NoProgressBarPtr()`. 
+ +## Quality Score Handling + +Configures FASTQ quality score parsing and encoding: +- `SetReadQualitiesShift(byte)`, `ReadQualitiesShift()` → Input offset (default: 33, Phred+33). +- `SetWriteQualitiesShift(byte)`, `WriteQualitiesShift()` → Output offset (default: 33). +- `SetReadQualities(bool)`, `ReadQualities()` → Enable/disable quality parsing (default: true). +- Enables format conversion and performance optimization. + +## Taxonomy Configuration + +Controls taxonomic identifier handling in OBIDMS workflows: +- `SetSelectedTaxonomy(string)`, `UseRawTaxids()`, etc. → Select taxonomy (e.g., `"NCBI"`), toggle raw/normalized IDs, alternative names. +- `SetFailOnTaxonomy(bool)`, `SetUpdateTaxid(bool)` → Control error behavior and auto-updates. +- Provides getters, setters, and pointer accessors for initialization-time configuration. + +## Parallelism Control + +Manages worker counts across read/write/ general operations: +- `SetWorkerPerCore(float64)`, `_ReadWorkerPerCore`/`_WriteWorkerPerCore` → Scaling factors (default: 1.0 / 0.25). +- `SetStrictReadWorker(n)`, `_MaxCPU` → Override with absolute worker counts. +- Functions: `ParallelWorkers()`, `Read/WriteParallelWorkers()` → Compute effective worker counts. +- Configurable via CLI flags (`--max-cpu`, `-m`) and `OBIMAXCPU` environment variable. + +> **Design Note**: All settings are *not* thread-safe; intended for use during initialization. Public API exposes only getters/setters/pointers—no internal mutation beyond controlled access. 
diff --git a/autodoc/docmd/pkg_obidist.md b/autodoc/docmd/pkg_obidist.md new file mode 100644 index 0000000..4ad8049 --- /dev/null +++ b/autodoc/docmd/pkg_obidist.md @@ -0,0 +1,52 @@ +# `obidist`: Efficient Symmetric Distance and Similarity Matrix Management + +The `obidist` Go package provides memory-efficient, symmetric matrix implementations for pairwise **distance** and **similarity** computations — ideal for clustering, phylogenetics, or any domain requiring fast access with minimal footprint. It enforces structural guarantees (symmetry, fixed diagonal) and offers safe, label-aware operations. + +## Core Types + +| Type | Description | +|------|-------------| +| `DistMatrix` | Symmetric *n×n* matrix for **distances**; diagonal entries are always `0.0`. | +| `SimilarityMatrix` | Symmetric *n×n* matrix for **similarities**; diagonal entries are always `1.0`. | + +Both types store only the upper triangle (`i < j`) to reduce memory from *O(n²)* to *n(n−1)/2*. All access (`Get`, `Set`) is automatically mirrored for symmetry. + +## Constructors + +| Function | Description | +|---------|-------------| +| `NewDistMatrix(n)` / `WithLabels(labels []string)` | Creates a distance matrix of size *n×n* (diag = 0). Labels are optional. | +| `NewSimilarityMatrix(n)` / `WithLabels(labels []string)` | Creates a similarity matrix of size *n×n* (diag = 1). Labels are optional. | + +> **Note**: Passing `labels` with length ≠ *n* panics; empty labels (`nil`) are allowed. + +## Core Operations + +| Method | Description | +|--------|-------------| +| `Get(i, j) float64` | Returns value at *(i,j)*; enforces symmetry (reads stored upper triangle). | +| `Set(i, j, v float64)` | Sets value at *(i,j)*; silently ignores diagonal assignments. | +| `Size() int` | Returns *n*, the matrix dimension. | +| `GetLabel(i) string`, `SetLabel(i int, label string)` | Read/write the *i*-th element’s label. | +| `Labels() []string` | Returns a **copy** of all labels (safe mutation). 
| +| `GetRow(i) []float64`, `GetColumn(j) []float64` | Returns full row/column as a **new slice** (symmetric copy). | + +> All index access panics on out-of-bounds (`i < 0` or `≥ n`). Diagonal writes (e.g., `Set(i, i, v)`) are silently ignored. + +## Analysis & Utility Methods + +| Method | Description | +|--------|-------------| +| `MinDistance() (val float64, i, j int)` | Returns smallest off-diagonal value and its indices. For *n ≤ 1*, returns `(0, -1, -1)`. | +| `MaxDistance() (val float64, i, j int)` | Returns largest off-diagonal value and its indices. For *n ≤ 1*, returns `(0, -1, -1)`. | +| `Copy() *DistMatrix` | Deep copy (including labels). Safe for concurrent use or immutability. | +| `ToFullMatrix() [][]float64` | Returns a dense *n×n* copy (upper/lower triangles + diagonal). Use sparingly for large matrices. | + +## Edge Cases & Guarantees + +- **Empty matrix** (*n = 0*): All methods behave safely (e.g., `Size()` → `0`, min/max → `(0, -1, -1)`). +- **Singleton matrix** (*n = 1*): Only diagonal exists → min/max return `(0, -1, -1)`. +- **Label integrity**: `Labels()` and row/column copies use defensive duplication. +- **No normalization enforced** on similarity values (e.g., `[-∞, +∞]` allowed), but diagonals are *always* fixed. + +Designed for correctness-first scientific workflows, with rigorous unit tests covering bounds checks and symmetry. diff --git a/autodoc/docmd/pkg_obiformats.md b/autodoc/docmd/pkg_obiformats.md new file mode 100644 index 0000000..edfde5d --- /dev/null +++ b/autodoc/docmd/pkg_obiformats.md @@ -0,0 +1,121 @@ +# `obiformats` Package — Semantic Overview + +The **`obiformats`** package provides a unified, extensible framework for parsing and writing biological sequence data in standard bioinformatics formats (FASTA/FASTQ, EMBL, GenBank, CSV, EcoPCR), while supporting streaming, batching, parallelism, and format-agnostic workflows. + +## Core Objectives + +1. 
**Format-Agnostic Input**: Automatically detect and parse diverse sequence formats via MIME-type inference. +2. **Streaming & Scalability**: Enable memory-efficient ingestion of large NGS datasets through chunked, concurrent parsing. +3. **Structured Output**: Support flexible export to FASTA/FASTQ, JSON, CSV, Newick, and taxonomy-aware formats. +4. **Interoperability**: Integrate seamlessly with OBITools4 abstractions (`obiseq.BioSequence`, `obiiter.IBioSequence`, `obitax.Taxon`). +5. **Extensibility**: Allow new readers/writers to be plugged in via functional interfaces and options. + +--- + +## Public Functionalities (Grouped by Domain) + +### 📥 **Sequence Reading & Parsing** + +| Function | Format(s) Supported | +|---------|---------------------| +| `ReadSequencesFromFile`, `ReadSequencesFromStdin` | Auto-detected (FASTA/FASTQ/EMBL/GenBank/EcoPCR/CSV) | +| `ReadFasta`, `ReadFastq` | FASTA, FASTQ (with rope/buffered variants) | +| `ReadEMBL`, `ReadGenbank` | EMBL, GenBank (rope-aware for large files) | +| `ReadCSV`, `ReadEcoPCR` | Tabular/amplicon outputs (e.g., EcoPCR v1/v2) | +| `LoadCSVTaxonomy`, `LoadNCBITaxDump` | Taxonomic data (CSV, NCBI dump dir/tar) | + +- **Concurrent Parsing**: Configurable worker count (`OptionsParallelWorkers`) with ordered batch output. +- **Rope-Based Parsing**: Zero-copy parsing for large files (`FastaChunkParserRope`, `EmblChunkParserRope`). +- **Header Parsing**: JSON (`ParseFastSeqJsonHeader`) and legacy OBI-style (`ParseOBIFeatures`). +- **Quality Handling**: Phred offset adjustment, optional `U→T` conversion. 
+ +### 📤 **Sequence Writing & Formatting** + +| Function | Format(s) Supported | +|---------|---------------------| +| `WriteFasta`, `FormatFastq` | FASTA/FASTQ (single/batch, parallel I/O) | +| `WriteJSON` | Structured JSON with annotations (batched + ordered writes) | +| `FormatFastaBatch`, `WriteFastqToFile` | Optimized batch formatting with compression | +| `CSVTaxaIterator`, `CSVSequenceRecord` | Taxonomic/sequence CSV export (configurable columns) | +| `WriteNewick`, `Tree.Newick` | Taxonomy → Newick tree (with optional annotations) | + +- **Compression Support**: Automatic gzip/bgzip via `obiutils.CompressStream`. +- **Paired-End Handling**: Split forward/reverse reads to separate files. +- **Ordered Output**: Preserves sequence order across parallel writes (`WriteFileChunk`). +- **Format-Aware Dispatching**: `WriteSequence()` auto-selects FASTQ/FASTA based on quality presence. + +### 🧬 **Taxonomy & Metadata Handling** + +| Function | Purpose | +|---------|--------| +| `LoadCSVTaxonomy`, `LoadNCBITarTaxDump` | Load taxonomies from CSV/NCBI dumps | +| `DetectTaxonomyFormat`, `LoadTaxonomy` | Auto-detect and load taxonomy from diverse sources | +| `CSVTaxaIterator`, `WriteNewick` | Export taxonomies to CSV or Newick | +| Taxon annotation extraction (e.g., `taxid`, path, rank) | via structured metadata fields | + +- **Root Enforcement**: Ensures presence of NCBI root (`taxid=1`) during loading. +- **Alias Resolution**: Merged taxids mapped to current IDs (`AddAlias`). +- **Flexible Output Fields**: CSV/Newick support configurable metadata (scientific name, taxid, rank, path). + +### ⚙️ **Configuration & Options** + +- `Options` encapsulates all runtime settings via functional setters (`WithOption`, e.g., `BatchSize(1024)`, `OptionsCompressed(true)`). 
+- Key options include: + - I/O: file append/truncate, compression (`OptionsCompressed`) + - Parsing: header parser toggle, quality read flag + - Export: CSV columns (`CSVId`, `CSVTaxid`), NA value, separator + - Taxonomy: include path/root/rank (`OptionWithoutRootPath`, `WithTaxid`) + - Performance: parallel workers, buffer size +- Defaults ensure safe behavior; options are composable and immutable. + +### 🧵 **Streaming & Chunking Primitives** + +| Type/Function | Purpose | +|---------------|---------| +| `PieceOfChunk`, `FileChunk` | Rope-based buffers for zero-copy streaming | +| `ReadFileChunk()` | Chunk file by record boundaries (not fixed size) | +| `EndOfLastFastaEntry`, `EndOfLastFastqEntry` | Find last complete record in buffer (for safe splitting) | +| `ropeScanner`, `_readline__` | Line-by-line scanner over ropes (no full materialization) | +| `WriteFileChunk()` | Ordered, thread-safe chunk reassembly | + +- Designed for **large-file resilience**: avoids full file load; splits only at valid boundaries. +- Integrates with `obiiter` for push-style streaming iterators. + +### 🔍 **Format Detection & Discovery** + +| Function | Role | +|---------|------| +| `OBIMimeTypeGuesser`, `NGSFilterCsvDetector` | Content-based MIME detection (e.g., FASTA via `>`, EcoPCR via `#@ecopcr-v2`) | +| `DetectTaxonomyFormat` | Detects NCBI dump, CSV, FASTA/FASTQ as taxonomy sources | +| `OBIMimeNGSFilterTypeGuesser` | Distinguishes legacy vs. CSV NGS filter configs | + +- Uses `github.com/gabriel-vasile/mimetype` for robust format sniffing. +- Preserves unread bytes to allow downstream parsers. + +### 📋 **Specialized Parsers & Writers** + +- `ReadCSVFromStdin`, `_ParseFastqFile`: Convenience wrappers for stdin/file I/O. +- `JSONRecord()`, `FormatFastaBatch()`: Optimized serialization with minimal allocations. +- `_parse_json_*` helpers: High-performance JSON parsing using `jsonparser`. +- `WriteFastaToFile`, `_UnescapeUnicodeCharactersInJSON()`: Robust output handling. 
+ +--- + +## Design Principles + +- **Streaming First**: All parsers return `obiiter.IBioSequence` — lazy, batched iterators. +- **Functional Abstraction**: Format handling via `IBatchReader`, `FormatHeader` — decoupled from core logic. +- **Extensibility**: New formats added via `ReadSequencesFromFile()` extension points and MIME registration. +- **Fail-Safe Defaults**: Empty files → empty iterator; missing root taxon → fatal error. +- **Ordered Semantics**: Despite parallelism, batches preserve global order via atomic counters (`nextCounter`). + +--- + +## Integration Highlights + +- **Dependencies**: Uses `obiseq`, `obiiter`, `obitax`, and utilities (`obiutils`/`obidefault`) for core data models. +- **Logging**: Structured logs via `logrus` (format detection, errors, progress). +- **Error Handling**: Panics on unrecoverable issues; graceful fallbacks (e.g., `ReadEmptyFile`). +- **Performance**: Rope-based parsing, zero-copy where possible (`unsafe.String`, buffered writes). + +> ✅ `obiformats` enables scalable, reproducible NGS data processing — from raw ingestion to structured export. diff --git a/autodoc/docmd/pkg_obifp.md b/autodoc/docmd/pkg_obifp.md new file mode 100644 index 0000000..4a35633 --- /dev/null +++ b/autodoc/docmd/pkg_obifp.md @@ -0,0 +1,82 @@ +# `obifp`: Semantic Overview of Public API + +The `obifp` package provides a family of fixed-size, arbitrary-precision unsigned integer types—`Uint64`, `Uint128`, and `Uint256`—designed for high-precision arithmetic where overflow safety, bitwise control, and type consistency are critical (e.g., cryptography, genomics with OBITools). All types share a unified interface (`FPUint[T]`) and enforce strict correctness via panics on overflow/underflow or division-by-zero. + +## Core Principles + +- **Explicit precision**: No silent truncation; narrowing casts emit warnings (via `obilog.Warnf`). +- **Panic-on-error semantics**: Arithmetic operations (`Add`, `Sub`, `Mul`, etc.) panic on overflow/underflow. 
+- **Bit-level fidelity**: Shifts and bitwise operations operate across full bit-width with carry propagation. + +--- + +## Unified Interface: `FPUint[T]` + +All three types (`Uint64`, `Uint128`, `Uint256`) implement this generic interface: + +- **Construction & Initialization** + - `Zero() T`: Returns the additive identity. + - `Set64(v uint64) T`: Initializes from a native 64-bit value (zero-extended). + - `OneUint[T]`: Helper to construct the value *1*. + +- **Downcasting & Utility** + - `AsUint64() uint64`: Extracts the least-significant limb (assumes higher limbs are zero; warns if not). + - `IsZero() bool`: Checks for equality with zero. + +- **Logical & Bitwise Operations** + - `And(v T)`, `Or(v T)`, `Xor(v T)` — bitwise logic between two values of same type. + - `Not() T` — inverts all bits (one’s complement of the unsigned value). + - `LeftShift(n uint) T`, `RightShift(n uint) T` — multi-limb shifts with carry handling; warns if shift ≥ full bit-width. + +- **Arithmetic** + - `Add(v T)`, `Sub(v T)` — with carry/borrow propagation; panics on overflow/underflow. + - `Mul(v T)` — full-width multiplication (uses hardware-optimized limb-wise ops); panics on overflow. + - *Division (`Div`, `Mod`) is implemented only for concrete types (see below).* + +- **Comparison** + - `Cmp(v T) int` — returns `-1`, `0`, or `+1`. + - Overloaded operators: `<`, `<=`, `>`, `>=` (all returning `bool`). + +--- + +## Concrete Types & Specialized Features + +### ✅ `Uint64` +- Native Go `uint64` wrapper with strict overflow checking. +- Uses `math/bits.Add64`, `Mul64` internally for correctness. +- Supports conversion to larger types: `Uint128()`, `Uint256()`. + +### ✅ `Uint128` +- Internally: two limbs (`w0`, `w1`). +- **Arithmetic**: + - Full support: `Add(v)`, `Sub(v)`, `Mul(v)` (128×128), and scalar variants: `Add64`, `Mul64`. + - Division & Modulo: + - `Div(v)`, `Mod(v)` — integer division with remainder. + - `QuoRem(v Uint128) (q, r Uint128)` — combined quotient/remainder.
+ - `Div64`, `Mod64` for division by 64-bit scalar. +- **Bitwise**: Full support (`And`, `Or`, `Xor`, `Not`), plus shifts. +- **Conversion**: + - Safe upcast to `Uint256`. + - Downcast to `uint64` via `AsUint64()` (warns if high limb ≠ 0). + +### ✅ `Uint256` +- Internally: four limbs (`w0` to `w3`) — supports values up to $2^{256} - 1$. +- **Arithmetic**: + - `Add(v)`, `Sub(v)` — limb-wise with carry/borrow. + - `Mul(v)` — schoolbook multiplication across limbs; panics on overflow. + - `Div(v)`: Long division implementation (repeated subtraction of shifted multiples); panics on zero divisor. +- **Shifts**: Multi-limb shifts with carry propagation across all limbs. +- **Conversion**: + - Downcast to `Uint128()` / `AsUint64()`, with overflow warnings. + - Upcast from smaller types via implicit zero-extension. + +--- + +## Helper Functions (Generic) + +- `ZeroUint[T FPUint[T]]() T`: Returns zero for type parameter. +- `From64[T FPUint[T]](v uint64) T`: Converts native 64-bit to typed value. + +All operations are **value-returning** (no in-place mutation), enabling fluent chaining and immutability. + +> ⚠️ **Design Note**: Division methods are *not* part of the generic `FPUint[T]` interface (commented out), but are implemented concretely for each type. This reflects performance/complexity trade-offs and leaves room to extend later. diff --git a/autodoc/docmd/pkg_obigraph.md b/autodoc/docmd/pkg_obigraph.md new file mode 100644 index 0000000..1e5235d --- /dev/null +++ b/autodoc/docmd/pkg_obigraph.md @@ -0,0 +1,87 @@ +# `obigraph`: Semantic Overview of Public Features + +The `obigraph` package delivers a lightweight, type-safe graph modeling toolkit in Go—optimized for performance and visualization-ready output. 
Built around two core abstractions (`Graph` and `GraphBuffer`), it supports both static graph construction (for batch processing) and high-throughput streaming ingestion (via buffered channels), while enabling customizable vertex/edge semantics, degree-based filtering, and GML export with visual styling. + +--- + +## Core Graph Type: `Graph[V, T]` + +### Generic Structure +- **Type Parameters**: + - `V`: Vertex type (arbitrary comparable Go value). + - `T`: Edge data payload (e.g., weight, label, metadata). +- **Internal Representation**: + - Forward adjacency: `map[V]map[V]T` (outgoing edges). + - Reverse adjacency: `map[V]map[V]T` (incoming edges), enabling bidirectional traversal. + +### Edge Management +- **Undirected Edges**: + - `AddEdge(src, dst V, data T)`: Inserts symmetric links (both directions). +- **Directed Edges**: + - `AddDirectedEdge(src, dst V, data T)`: Inserts one-way link. + - `SetAsDirectedEdge(src, dst V)`: Converts existing undirected edge to directed by deleting reverse link. + +### Graph Queries +- **Neighbors**: + - `Neighbors(v V) []V`: Returns all vertices reachable *from* `v` (successors). +- **Parents**: + - `Parents(v V) []V`: Returns all vertices with edges *to* `v` (predecessors). +- **Degrees**: + - `Degree(v V) int`: Out-degree (size of outgoing adjacency). + - `ParentDegree(v V) int`: In-degree (size of incoming adjacency). + +### Customization Hooks +- **Vertex Weight**: + - `func VertexWeight(v V) float64` (default: constant weight = `1.0`). +- **Edge Weight**: + - `func EdgeWeight(src, dst V) float64` (default: constant weight = `1.0`). +- **Vertex Labeling**: + - `func VertexId(v V) string` (default: `"V%d"` with auto-incrementing index). + +### GML Export +- **In-Memory Generation**: + - `Gml(w io.Writer, opts ...Option) error`: Renders GML to any writer. +- **File Output**: + - `WriteGmlFile(filename string, opts ...Option) error`: Writes GML to disk. 
+- **Styling Options** (via `text/template`): + - Directed/undirected mode (`Directed: bool`). + - Degree-based filtering (`MinDegree int`): Omits vertices below threshold. + - Visual layout: + - Shape = `circle` if vertex weight ≥ `Threshold`, else `rectangle`. + - Size ∝ sqrt(vertex weight). + +> ⚠️ Errors during template parsing or I/O cause panics (fail-fast design). + +--- + +## Streaming Graph Builder: `GraphBuffer[V, T]` + +### Asynchronous Edge Ingestion +- **Channel-Based Protocol**: + - Edges are enqueued via `AddEdge(src, dst V)` / `AddDirectedEdge(...)` → pushes to internal buffered channel. + - Background goroutine consumes edges and mutates underlying `Graph[V, T]`. + +### Non-Blocking API +- All edge-addition methods return immediately (no synchronization on mutation). +- Ideal for producer-consumer patterns where multiple goroutines feed edges. + +### Lifecycle Management +- **Initialization**: + - `NewGraphBuffer(cap int) *GraphBuffer[V, T]`: Starts worker goroutine and allocates channel. +- **Shutdown**: + - `Close()`: Closes ingestion channel → signals worker to terminate gracefully. + +### GML Export (Same as `Graph`) +- Supports identical options (`MinDegree`, `Directed`, etc.) via inherited methods. +- Enables exporting *final* state after streaming completes. + +> ⚠️ **Concurrency Note**: +> - `AddEdge` is *not* safe for concurrent calls without external buffering (e.g., use channel per producer). +> - The buffer itself handles internal mutation safety via sequential processing. + +--- + +## Use Cases +- **Batch Graph Construction**: `Graph` for offline analysis, static topology generation. +- **Real-Time Processing**: `GraphBuffer` for event-driven systems (e.g., social feeds, telemetry streams). +- **Visualization Prep**: GML export supports tools like Graphviz/Cytoscape with minimal styling overhead. 
diff --git a/autodoc/docmd/pkg_obiiter.md b/autodoc/docmd/pkg_obiiter.md new file mode 100644 index 0000000..f4caca7 --- /dev/null +++ b/autodoc/docmd/pkg_obiiter.md @@ -0,0 +1,79 @@ +# Bioinformatics Sequence Processing Pipeline — Public API Overview + +The `obiiter` package provides a high-performance, concurrent framework for processing biological sequence data (e.g., FASTQ/FASTA) in batched, streaming fashion. Built around the `IBioSequence` iterator interface and value-type batches (`BioSequenceBatch`), it supports scalable, traceable workflows with built-in memory control, threading safety, and functional composition. + +## Core Abstractions + +- **`IBioSequence`**: A concurrent iterator over `BioSequenceBatch`, enabling lazy, batched consumption. +- **`BioSequenceBatch`**: An immutable-friendly container holding ordered sequences with metadata (`source`, `order`). Supports FIFO popping, slicing, and pairing. +- **`Pipeable`**: A function type `func(IBioSequence) IBioSequence`, enabling composable transformations. + +## Batch & Iterator Management + +- `MakeIBioSequence(...)`: Constructs a new iterator (e.g., from files or slices). +- `Concat(...IBioSequence)`: Sequentially merges multiple iterators. +- `Pool(...)`: Interleaves batches from several sources, preserving global order via renumbering. +- `Rebatch(size)` / `RebatchBySize(maxBytes, maxCount)`: Dynamically regroups sequences into fixed or memory-bound batches. +- `SortBatches()`: Ensures strict ordering by batch metadata (`order` field). +- `CompleteFileIterator()`: Reads remaining file content as a single batch. + +## Functional Transformations + +- `MakeIWorker(...)`, `WorkerPipe(...)`: Applies per-sequence workers in parallel. +- `MakeISliceWorker(...)`, `SliceWorkerPipe(...)`: Applies batch-level (`SeqSliceWorker`) transformations. +- `MakeIConditionalWorker(...)`: Conditional worker application based on a predicate. 
+ +## Filtering & Splitting + +- `FilterOn(pred, size)`: Parallel filtering with sequence recycling. +- `DivideOn(pred, size)`: Splits input into two independent iterators (`true`/`false` branches). +- `FilterAnd(pred, size)`: Same as above but enforces paired-end consistency. + +## Memory & Performance Control + +- `LimitMemory(fraction)`: Enforces heap usage ≤ fraction × total RAM via backpressure (uses `runtime.ReadMemStats()`). +- Parallel workers (`nworkers`) and batch sizes are configurable via defaults or variadic args. + +## Paired-End Data Handling + +All operations preserve pairing semantics: +- `IsPaired()`, `MarkAsPaired()` on iterators and batches. +- `PairTo(other)`: Synchronizes two batch/iterator pairs (same order required). +- `PairedWith()`, `UnPair()` for mate extraction and unpairing. + +## Sequence Numbering & Annotation + +- `NumberSequences(start, forceReordering)`: Assigns unique sequential IDs to sequences (same ID for mates in paired mode). Supports parallel or deterministic ordering. +- `MakeSetAttributeWorker(rank)`: Returns a worker that annotates each sequence with taxon at specified rank (e.g., `"species"`). + +## Taxonomic Profiling + +- `ExtractTaxonomy(iterator, seqAsTaxa)`: Aggregates taxonomy across all sequences via `.Slice().ExtractTaxonomy()` calls. Implements map-reduce semantics for scalable taxonomic summarization. + +## Fragmentation + +- `IFragments(minsize, length, overlap)`: Splits long sequences into overlapping fragments (fusion mode for remainder), with parallel workers and memory-efficient recycling. + +## Utility & Analysis + +- `Load()`: Collects all sequences into a slice (for small data). +- `Count(recycle)`: Returns `(variants, reads, nucleotides)` counts. +- `Consume()` / `Recycle()`: Drains iterator and optionally triggers sequence recycling. + +## Pipeline & Teeing + +- `Pipeline(start, parts...)`: Composes a chain of `Pipeable` transformations. 
+- `(IBioSequence).Pipe(...)`: Fluent method chaining for pipelines. +- `Teeable` / `(IBioSequence).CopyTee()`: Duplicates stream into two independent, concurrently readable iterators (preserves pairing). + +## Progress & Logging + +- `Speed()`, `SpeedPipe()`: Adds a non-intrusive progress bar (stderr only, terminal-aware). Updates per batch and respects `--no-progressbar` flag. + +## Distribution & Routing + +- **`IDistribute(classifier, batchSize)`**: Routes sequences to classified outputs based on a classifier function. Batches per class key are flushed when size or memory thresholds are reached. + - `News()` channel notifies on new output streams (i.e., newly seen class keys). + - Thread-safe, async distribution via goroutines. + +All public APIs assume interoperability with `obiseq`, `obitax`, and OBITools4’s config modules (`obidefault`, `obilog`). Design emphasizes immutability-by-copy, safe concurrent access (via mutexes/atomics), and composability for reproducible bioinformatics pipelines. diff --git a/autodoc/docmd/pkg_obiitercsv.md b/autodoc/docmd/pkg_obiitercsv.md new file mode 100644 index 0000000..8b58cf1 --- /dev/null +++ b/autodoc/docmd/pkg_obiitercsv.md @@ -0,0 +1,61 @@ +# `obiitercsv`: CSV Record Iterator for Streaming and Batch Processing + +A Go package providing a thread-safe, channel-based iterator (`ICSVRecord`) for efficient streaming and batch processing of CSV data. Designed with scalability in mind—especially for bioinformatics pipelines like OBITools4—it enables ordered, concurrent handling of large CSV files without loading all records into memory. + +## Core Concepts + +- **`CSVHeader`**: A `[]string` representing column names; used to define the schema of records. +- **`CSVRecord`**: A `map[string]interface{}` mapping field names to values, supporting flexible typed data. 
+- **`CSVRecordBatch`**: A structured batch of records (`[]*CSVRecord`) enriched with metadata: + - `source`: origin identifier (e.g., file or shard name), + - `order`: sequence index for deterministic reassembly, + - `data`: the slice of records. + +## Iterator Interface (`ICSVRecord`) + +Implements a standard iterator protocol over batches via an unbuffered channel: + +- **`Next() bool`**: Advances to the next batch; returns `false` when exhausted. +- **`Get() *CSVRecordBatch`**: Retrieves the current batch (nil-safe). +- **`PushBack()`**: Requeues the last retrieved batch for reprocessing—useful in error recovery or conditional branching. +- **`Channel() <-chan *CSVRecordBatch`**: Exposes the internal channel for external consumption. + +## Thread-Safe Operations + +- All shared state (e.g., batch queue, flags) is guarded by a `sync.RWMutex`. +- Atomic operations (`atomic.Bool`, `int32`) are used for lightweight flags like `finished` and counters such as `batch_size`. +- Methods ensure safe concurrent access across multiple goroutines. + +## Header Management + +Supports dynamic schema evolution: + +- **`SetHeader(header CSVHeader)`**: Sets or replaces the header (must be called before first `Next()`). +- **`AppendField(name string, value interface{}) bool`**: Adds a new field to the current record (returns `false` if no active batch or header mismatch). + +## Batch Lifecycle Control + +- **`Add()` / `Done()`**: Track active producer/consumer goroutines using a `sync.WaitGroup`. +- **`WaitAndClose()`**: Blocks until all tracked goroutines complete, then closes the output channel—ensuring no data loss. + +## Utility & Validation + +- **`NotEmpty(batch *CSVRecordBatch) bool`**: Returns `true` if the batch is non-nil and contains ≥1 record. +- **`IsNil(batch *CSVRecordBatch) bool`**: Returns `true` if the batch is nil. 
+- **`Consume(iterator ICSVRecord, fn func(*CSVRecordBatch))`**: Drains the iterator by applying `fn` to each batch—ideal for side-effect processing (e.g., writing, aggregation). + +## Ordering & Recovery + +- **`SortBatches(batches []*CSVRecordBatch) [](*CSVRecordBatch)`**: Reorders batches by `order`, buffering out-of-sequence items until missing predecessors arrive—critical for reconstructing global order in distributed or parallel pipelines. + +## Splitting & Sharing + +- **`Split() ICSVRecord`**: Creates a new iterator instance sharing the same underlying channel but with independent locking—enables fan-out patterns without duplicating data. + +## Design Goals + +- **Memory efficiency**: Processes records in streaming batches, avoiding full-file loads. +- **Deterministic ordering**: Supports reconstruction of sequential order despite concurrent delivery. +- **Robustness**: Graceful handling of race conditions, nil states, and partial batches. + +> *Intended for high-throughput CSV pipelines where correctness, concurrency safety, and low latency are paramount.* diff --git a/autodoc/docmd/pkg_obikmer.md b/autodoc/docmd/pkg_obikmer.md new file mode 100644 index 0000000..f3f188a --- /dev/null +++ b/autodoc/docmd/pkg_obikmer.md @@ -0,0 +1,101 @@ +# Semantic Description of `obikmer` Package + +The `obikmer` package provides high-performance, disk-backed utilities for **k-mer manipulation and comparison** in biological sequences. Designed for scalability (e.g., metagenomics, NGS read processing), it supports canonical encoding, minimizer-based partitioning, streaming I/O formats (`.kdi`, `.skm`), entropy filtering, and scalable set operations — all while minimizing allocations. + +--- + +## Core Encoding & Canonicalization + +- **`EncodeKmer`, `DecodeKmer`**: Encodes/decodes DNA sequences to/from compact 62-bit `uint64`s (2 bits/base), preserving top 2 bits for error metadata. 
+- **`EncodeCanonicalKmer`, `CanonicalKmer`**: Normalizes k-mers to their *biological canonical form* — the lexicographically smaller of a k-mer and its reverse complement. +- **`IterCanonicalKmers`, `IterCanonicalKmersWithErrors`**: Memory-efficient streaming of canonical k-mers from sequences; optionally tags ambiguous bases in top 2 bits. + +## Minimizer-Based Partitioning + +- **`DefaultMinimizerSize(k)`**, **`ValidateMinimizerSize(m, k, nworkers)`**: Computes and validates minimizer size `m` for parallelization (e.g., `ceil(k / 2.5)`). +- **`ExtractSuperKmers`, `IterSuperKmers(seq, k, m)`**: Extracts *super-k-mers* — maximal contiguous regions where all embedded `k`-mers share the same minimizer. Uses monotone deque for O(n) time. + +## I/O Formats & Streaming + +- **`.kdi` (K-Disk Index)**: Compact binary format for sorted `uint64` k-mers using delta-varint encoding. Includes optional `.kdx` sparse index for fast `SeekTo(target)`. + - APIs: `NewKdiWriter`, `NewKdiReader`, `.Next() → (kmer, ok)`. +- **`.skm`**: Binary storage for *super-k-mers*, with 2-bit nucleotide packing (4× compression vs ASCII). +- **`.kdx`**: Sparse index for `.kdi`, storing `(kmer, byteOffset)` every *stride* entries (e.g., 4096), enabling O(log M) seeks. + +## K-Way Merge & Deduplication + +- **`KWayMerge([]*KdiReader)`**: Merges sorted `.kdi` streams, aggregating k-mer counts across inputs. + - Uses min-heap for O(log *k*) per-output operations; supports streaming and deduplication. + - Ideal for combining k-mer sets across samples or batches. + +## Entropy Filtering & Complexity Detection + +- **`KmerEntropy(kmer, k, levelMax)`**: Computes minimum normalized Shannon entropy across sub-word sizes (1 to `levelMax`) using circular canonical normalization. + - Values near **0** indicate repeats (e.g., homopolymers); ~1 indicates high complexity. +- **`KmerEntropyFilter`**: Precomputed filter for batch processing (no allocations), with `Accept(kmer)` and fast entropy lookup.
+ +## K-mer Set Management (`KmerSetGroup`) + +A `KmerSetGroup` represents *N* disjoint, sorted k-mer sets (e.g., per sample), persisted on disk. + +### Lifecycle & Construction +- **`NewKmerSetGroupBuilder(...)`**, **`AppendKmerSetGroupBuilder(dir)`**: Builds or extends groups via: + - `AddSequence(setID, bioseq)`: Extracts canonical k-mers (with optional filtering). + - Supports `WithMinFrequency`, `WithEntropyFilter`, and top-*N* tracking. +- **`Close()`**: Finalizes `.kdi`s, `spectrum.bin`, and optional `top_kmers.csv`. +- **`OpenKmerSetGroup(dir)`**: Loads existing group in read-only mode. + +### Access & Metadata +- **`K()`, `M()`, `Partitions()`**, attributes via `GetStringAttribute(key)`. +- **`Contains(setID, kmer)`**: Parallel membership check across partitions. +- **`Iterator(setID)`**: Yields sorted k-mers via k-way merge. + +### Set Algebra & Similarity +- **Set Operations**: `Union()`, `Intersect()`, `Difference()`, `QuorumAtLeast(q)` (≥ *q* sets), etc. +- **Pairwise Group Ops**: `UnionWith(other)`, `IntersectWith(other)` (per-set, compatible groups only). +- **Similarity Metrics**: + `JaccardDistanceMatrix()` = 1 − |A ∩ B| / |A ∪ B| + `JaccardSimilarityMatrix()` = |A ∩ B| / |A ∪ B| + +### Utilities +- **`CopySetsByIDTo(ids, destDir)`**, `RemoveSetByID(id)`, `MatchSetIDs(patterns)` +- **`IsCompatibleWith(other)`**: Validates `(k, m, partitions)`. + +## K-mer Indexing & Matching (`KmerMap`) + +Generic hash map associating canonical k-mers to sequences containing them. + +- **`Push(sequences)`**: Builds index (optionally with `maxocc` limit). +- **`Query(querySeq) → KmerMatch`**: Returns sequences sharing k-mers, with match counts. +- **Supports sparse mode** (`SparseAt ≥ 0`): Ignores central base (e.g., for ambiguous-position matching). +- **Result utilities**: `FilterMinCount`, `.Max()`, `.Sequences()`. + +## K-mer Spectrum Analysis + +- **`SpectrumEntry{Frequency, Count}`**, `KmerSpectrum`: Sorted frequency distribution. 
+- **`MapToSpectrum()`, `MergeTopN()`**, binary/CSV I/O (`WriteSpectrum`, `ReadSpectrum`). +- **Top-*N* collector** via min-heap for streaming frequency tracking. + +## Utility & Helpers + +- **`HammingDistance(a, b)`**: Bitwise distance between encoded k-mers. +- **Varint encoding/decoding** (`EncodeVarint`, `DecodeVarint`): 7-bit-per-byte compression for I/O. +- **Reverse complement**: Constant-time via lookup tables (`revcompnuc`, `kmermask`). + +--- + +## Design Principles + +- **Zero-allocation where possible** (buffer reuse, iterators). +- **Streaming-first**: Avoids loading large datasets into memory. +- **Disk-backed persistence** for reproducibility and scalability. +- **Canonicalization & symmetry**: Strand-aware (reverse complement) or circular normalization for robustness. + +## Use Cases + +- Metagenomic read clustering & error correction +- Minimizer-based sketching (e.g., Mash/Sourmash analogs) +- Scalable Jaccard-based similarity matrices across thousands of samples +- Low-complexity region detection via entropy filtering + +All operations are tested, benchmarked, and optimized for high-throughput genomic workflows. diff --git a/autodoc/docmd/pkg_obilog.md b/autodoc/docmd/pkg_obilog.md new file mode 100644 index 0000000..2587957 --- /dev/null +++ b/autodoc/docmd/pkg_obilog.md @@ -0,0 +1,30 @@ +# `obilog` Package — Semantic Overview + +The `obilog` package provides a lightweight, conditional logging interface for the OBItools4 ecosystem. It wraps `logrus`, a structured logger, to emit warnings only when explicitly allowed by application-wide settings. + +## Core Functionality + +- **`Warnf(format string, args ...interface{})`** + Emits a formatted warning message using `logrus.Warnf`, subject to the global silence policy defined by `obidefault.SilentWarning()`. If warnings are silenced, this function becomes a no-op. 
+ +## Design Intent + +- **Conditional Warning Output**: + Warnings are suppressed when `obidefault.SilentWarning()` returns `true`, supporting quiet or batch execution modes (e.g., CI pipelines, automated runs). + +- **Consistency & Integration**: + Centralizes verbosity control via `obidefault`, ensuring logging behavior aligns with higher-level configuration without hardcoding logic. + +- **Minimal Abstraction**: + Maintains a thin, idiomatic wrapper—avoiding over-engineering while preserving extensibility (e.g., future `Debugf`, `Infof` wrappers). + +## Use Case + +Designed for non-fatal issues in CLI tools or libraries—where warnings should be visible by default but suppressible on demand, *without* modifying core logic or sprinkling conditional checks throughout the codebase. + +## Dependencies + +- `logrus`: Structured logging backend (JSON/console formatting, hooks support) +- `obidefault`: Configuration layer exposing global behavior flags (e.g., silence mode) + +> **Note**: `obilog` is *not* a full logging subsystem—it’s a policy-aware warning emitter. It does **not** expose `Info`, `Debug`, or error-level logging; those should be handled directly via `logrus` where appropriate. diff --git a/autodoc/docmd/pkg_obilua.md b/autodoc/docmd/pkg_obilua.md new file mode 100644 index 0000000..026d9a3 --- /dev/null +++ b/autodoc/docmd/pkg_obilua.md @@ -0,0 +1,64 @@ + + +```markdown +# Obilua: Lua-Based Sequence Processing Framework for Bioinformatics + +The `obilua` package enables high-performance, parallelizable biological sequence analysis by embedding Lua scripting into Go pipelines. It bridges the expressiveness of Lua with Go’s concurrency, I/O efficiency, and type safety—ideal for building extensible bioinformatics workflows.
+ +## Core Capabilities + +### Lua Interpreter & Script Execution +- `NewInterpreter()` initializes an isolated Gopher-Lua state preloaded with Obi-specific types. +- `Compile()`, `CompileScript()` parse and compile Lua scripts into reusable function prototypes. + +### Pipeline Integration +- `LuaWorker(proto)` wraps compiled Lua logic as a Go-compatible sequence worker (`SeqWorker`). +- `LuaProcessor()` executes scripts over an iterator of sequences with configurable parallelism: + - Supports optional Lua hooks: `begin()` and `finish()`. + - Configurable error handling (`breakOnError`). +- `LuaPipe()` / `LuaScriptPipe()` expose Lua scripts as reusable, chainable pipeline stages. + +### Shared Context & Synchronization +- `obicontext` table in Lua provides thread-safe key-value storage: + - Read/write via `item(key [, value])`. + - Atomic operations: `inc()`, `dec()` (protected by lock). + - Explicit locking via `lock()/unlock()/trylock()`. +- Dedicated `Mutex` type exposes Go’s `sync.Mutex` to Lua with safe `.lock()` / `.unlock()` methods. + +### Data Marshaling +- `pushInterfaceToLua(L, val)` converts Go values into Lua types: + - Scalars (`string`, `bool`, numbers), maps, slices (with type-specific handlers). +- Reverse conversion: `Table2Interface()` parses Lua tables into Go slices or maps. + - Specialized helpers like `Table2ByteSlice()` for numeric arrays. + +### Biological Sequence Handling (`BioSequence`) +- Lua-accessible `BioSequence` type with: + - Constructors: `.new(id, seq[, def])`. + - Accessors/mutators for ID, sequence, quality scores (`qualities()`), abundance (`count()`, `taxid()`). + - Taxonomy integration: `.taxon([Taxon])`. + - Sequence ops: `subsequence()`, `reverse_complement()`; checksums (`md5`). + - Serialization: `.fasta()`, `.fastq()`, smart `string()` output. + +### Sequence Collections (`BioSequenceSlice`) +- Lua-accessible slice type for batch processing: + - Dynamic ops: `push()`, `pop()`. + - Indexing with bounds checking. 
+ - Bulk export: `.fasta()` / `.fastq()`, smart `string()`. + +### Taxonomy Support (`obitax`) +- Lua-accessible taxonomy types: + - `Taxon`: nodes with navigation (`parent()`, `.species()`), name management, rank lookup. + - `Taxonomy`: factory functions (`.new()`, `.default()`), node retrieval by ID. + - Robust error handling for missing/invalid taxonomic data. + +## Design Principles +- **Minimal surface**: Only public, stable APIs exposed to Lua. +- **Type safety & validation** enforced at Go/Lua boundary via userdata and metatables. +- **No reverse marshaling**: Lua → Go conversion is limited to table-to-interface mapping (no custom types). +- **Fatal logging on misuse**: Invalid operations trigger `log.Fatalf` for predictable failure. + +> ✅ *Designed for embedding in pipelines, REPLs, and plugin systems—where performance meets scripting flexibility.* +``` + + + + diff --git a/autodoc/docmd/pkg_obingslibrary.md b/autodoc/docmd/pkg_obingslibrary.md new file mode 100644 index 0000000..0752434 --- /dev/null +++ b/autodoc/docmd/pkg_obingslibrary.md @@ -0,0 +1,77 @@ +# `obingslibrary`: High-Throughput Sequencing Demultiplexing Library + +`obingslibrary` is a Go package for **sample assignment in amplicon-based NGS workflows**, using dual-indexed barcodes flanked by PCR primers. It enables robust, configurable demultiplexing of sequencing reads—even in the presence of errors or indels—by matching primer–tag patterns and assigning samples via tag lookup. + +--- + +## Core Functionalities + +### 1. **Primer & Tag Configuration** +- `Marker`: Defines a primer pair (forward/reverse), including: + - Primer sequences (`Forward`, `Reverse`) and reverse-complement variants. + - Tag specifications: lengths, spacers (e.g., `N` or fixed nucleotides), delimiters. + - Mismatch/indel tolerance per direction (`SetAllowedMismatch`, `SetTagIndels`).
+- **Compilation**: + - `Compile()` / `Compile2()`: Builds internal pattern indexes (via `obiapat.ApatPattern`) for fast, error-tolerant matching. + - Supports `"strict"`, `"hamming"` (substitutions only), or `"indel"` (Levenshtein) matching modes. + +### 2. **Sequence Matching & Demultiplexing** +- `Match(sequence)`: Scans a `BioSequence` for valid primer bindings: + - Prioritizes forward-primer detection; falls back to reverse orientation. + - Returns `DemultiplexMatch` with: + - Primer positions, mismatches, orientation (`IsDirect`). + - Barcode coordinates (`BarcodeStart`, `BarcodeEnd`) and validity flag. +- **Primer dimer detection**: If `BarcodeStart > BarcodeEnd`, the read is flagged as invalid. + +### 3. **Tag Extraction & Annotation** +- `ExtractBarcode(sequence, inplace)`: + - Extracts the barcode region between forward/reverse primers. + - Reverse-complements if read is in reverse orientation (`IsDirect == false`). + - Annotates the sequence with: + - Primer names, positions, mismatches. + - Sample/experiment info (if tag assignment succeeds). + - Error messages (`Unassigned`, `NoMatch`, etc.). +- **Tag extraction strategies**: + - `Fixed`: Fixed-length tags. + - `Delimited`: Tags flanked by exact delimiters (e.g., `"NN"`). + - `Rescue`: Tolerates indels in delimiter or tag boundaries. + +### 4. **Sample Registration & Lookup** +- `GetPCR(tagPair)`: Retrieves or registers a new PCR reaction indexed by tag pair (case-insensitive). +- `NGSLibrary.Markers`: Map of primer pairs → `Marker` objects. + - Lazy initialization via `GetMarker()` for new primers. + +### 5. **Validation & Consistency Checks** +- `CheckTagLength()`: Ensures all registered tags have uniform length per direction. +- `CheckPrimerUnicity()`: Validates no primer is reused across markers; prevents self-complementary pairs. + +### 6. **Batch Processing & Parallelism** +- `ExtractBarcodeSlice(sequences, options)`: Processes a slice of reads. 
+ - Configurable via `Options` (fluent API): + - Mismatch/indel budgets. + - Error handling (`discardErrors`, `OptionUnidentified`). + - Parallel workers, batch size. +- `ExtractBarcodeSliceWorker()`: Returns a reusable worker for concurrent pipelines. + +### 7. **Distance Metrics** +- `Hamming(s1, s2)`: Counts mismatches between equal-length strings. +- `Levenshtein(s1, s2)`: Computes edit distance (supports indels). + +### 8. **Sample Identification** +- `TagExtractor`: Extracts forward/reverse tags from primer-flanked regions. +- `SampleIdentifier`: + - Matches extracted tags to known samples using configured strategy (`strict`, `hamming`, or `indel`). + - Returns best-matching sample, distance, and proposed tags. + +--- + +## Design Highlights +- **Memory-efficient**: Uses reference-counted sequences (`Recycle()`). +- **Error-aware**: Rich error propagation (stored in `DemultiplexMatch.Error` or annotations). +- **Flexible tag design**: Supports fixed, delimited (exact), and indel-resilient tags. +- **Extensible via options**: Functional setters for clean, testable configuration. + +--- + +## Use Case +Ideal for **metabarcoding or targeted amplicon sequencing**, where samples are multiplexed using unique dual barcodes. Ensures high specificity (unique tag pairs) and sensitivity (error-tolerant matching). diff --git a/autodoc/docmd/pkg_obioptions.md b/autodoc/docmd/pkg_obioptions.md new file mode 100644 index 0000000..8c9386a --- /dev/null +++ b/autodoc/docmd/pkg_obioptions.md @@ -0,0 +1,75 @@ +# OBIOptions Package: Semantic Documentation + +The `obioptions` package centralizes command-line interface (CLI) infrastructure for OBITools4, enabling consistent parsing of shared arguments and runtime configuration across tools. It standardizes logging, profiling controls, taxonomy integration, version reporting, and batch processing options—ensuring modularity, maintainability, and reproducibility. 
+ +## Core CLI Infrastructure + +### Global Option Registration & Processing +- `RegisterGlobalOptions(parser)` injects shared flags into any argument parser, including: + - Version (`--version`) and debug mode (`--debug`) + - Resource control: `--max-cpu`, thread limits, memory/batch tuning (`--batch-size`, `-size-max`, `--batch-mem`) + - Quality encoding toggle: `--solexa` + - Warning suppression (`--silent-warning`) +- `ProcessParsedOptions(parser)` handles post-parsing logic: + - Exits early on help/version requests + - Loads taxonomy database via `obiformats.LoadTaxonomy()` + - Sets log level (`logrus.SetLevel`) + - Enables performance profiling via `pprof`: + - Generic heap/goroutine dumps (`/debug/pprof`) + - Mutex contention profiling (via `--pprof-mutex` + `runtime.SetMutexProfileFraction()`) + - Goroutine blocking profiling (via `--pprof-goroutine` + `runtime.SetBlockProfileRate()`) + +### Parser Generation +- `GenerateOptionParser(program, documentation)` returns: + - A reusable parser with bundled short options (`-abc`) and strict unknown-option rejection + - Built-in `--help` support (via `go-getoptions`) +- Designed for reuse across commands with minimal boilerplate. + +## Taxonomy Handling + +### Option Set Registration +- `LoadTaxonomyOptionSet(parser)` adds taxonomy-specific flags: + - Required/optional DB path: `--taxonomy`, `-t` + - Alternative names lookup (`--alternative-names`) + - Validation strictness: `--fail-on-taxonomy` + - Auto-update taxonomic IDs (`--update-taxid`) + - Raw output mode: `--raw-taxid` + - Inclusion of leaf sequences (`--with-leaves`) +- Taxonomy loading is thread-safe (mutex-guarded) and lazy-loaded. 
+ +### Runtime Accessors +- `CLIIsDebugMode()` → returns current debug state +- `SeqAsTaxa()` → indicates if sequence IDs should be treated as taxa (e.g., for `--raw-taxid`) +- `SetDebugOn()`, `SetDebugOff()` → programmatic toggling of debug mode + +## Subcommand-Aware Parsing + +### `GenerateSubcommandParser(program, documentation, setup)` +- Builds a hierarchical CLI: + - Registers global options inherited by all subcommands + - Invokes `setup(parser)` to define per-subcommand flags and commands +- Automatically adds a built-in `help` subcommand for command-level documentation +- Returns: + - Root parser (`*GetOpt`) and an `ArgumentParser` function with signature: + ```go + func([]string) (*GetOpt, []string) + ``` + - Parses CLI args (skipping binary name), handles errors via `ProcessParsedOptions`, and returns parsed state + positional arguments + +## Versioning & Diagnostics + +### `VersionString()` +- Returns the current OBITools version as `"Release X.Y.Z"` (e.g., `Release 4.4.29`) +- Version is auto-populated from a build-time-generated `version.txt` (via Makefile) + - Patch level increments per commit → precise tracking of development iterations +- Pure function: no side effects, safe for logging/diagnostics/compatibility checks +- Supports CI validation and runtime introspection (e.g., error reports, feature gates) + +## Design Principles + +- **Environment Variables**: Configurable via `OBIMAXCPU`, `OBIWARNING`, etc. 
+- **Error Handling**: Parse errors → print help + exit gracefully +- **Standard Tooling Integration**: + - `logrus` for structured logging + - Go’s native `pprof` (HTTP servers, mutex/block profiles) +- **Zero External Dependencies** for versioning module diff --git a/autodoc/docmd/pkg_obiphylo.md b/autodoc/docmd/pkg_obiphylo.md new file mode 100644 index 0000000..d2894d8 --- /dev/null +++ b/autodoc/docmd/pkg_obiphylo.md @@ -0,0 +1,61 @@ +# `obiphylo` Package: Semantic Description + +The `obiphylo` package provides a minimal yet expressive data structure and utilities for representing **phylogenetic trees** in Go, prioritizing simplicity, extensibility, and interoperability with standard phylogenetic formats. + +## Core Type: `PhyloNode` + +Represents a node in a phylogenetic tree—either an operational taxonomic unit (leaf) or an internal branching point. + +### Public Fields +- `Name string`: Optional identifier for the node (e.g., species name, OTU label). May be empty. +- `Children map[*PhyloNode]float64`: Maps child nodes to their associated **branch lengths** (evolutionary distances). Supports `NaN` for unspecified or unmeasured branches. +- `Attributes map[string]any`: A flexible key-value store for arbitrary metadata (e.g., bootstrap values, posterior probabilities, geographic origin). Values may be of any type. + +> ⚠️ *All fields are exported for direct read/write access, but users should prefer the provided methods to ensure consistency (e.g., `AddChild`, `SetAttribute`).* + +## Public Methods + +### Construction & Mutation +- **`NewPhyloNode(name string) *PhyloNode`** + Instantiates a new node with optional name. Initializes `Children` and `Attributes` as empty maps. + +- **`AddChild(child *PhyloNode, distance float64)`** + Appends a child node to the current one with specified branch length. If `distance` is `NaN`, it is stored as-is (and omitted in Newick output). 
+ → *Enables incremental tree building from leaves to root.* + +- **`SetAttribute(key string, value any)`** + Stores or updates a metadata entry on the node. Overwrites existing keys. + +- **`GetAttribute(key string) (any, bool)`** + Retrieves a metadata value and reports presence via boolean. Returns zero `value` if key absent. + +### Tree Serialization +- **`Newick(level int) string`** + Recursively generates a Newick-formatted subtree rooted at the current node. + - Nodes without children appear as `Name` (or empty string if unnamed). + - Internal nodes are rendered with comma-separated children in parentheses. + - Branch lengths (`:distance`) appear *only if finite* (i.e., `!math.IsNaN(distance)`). + - Indentation (`level * "\t"`) improves human readability. + - Root-level calls (e.g., `root.Newick(0)`) append a final semicolon (`;`). + → *Designed for export to tools like RAxML, FigTree, or IQ-TREE.* + +## Design Principles + +- **Zero external dependencies**: Pure Go implementation. +- **Idiomatic efficiency**: Child lookup via `map` ensures O(1) average access. +- **Extensibility over rigidity**: Arbitrary metadata via `any` supports evolving annotation needs without API changes. +- **Format compliance**: Newick output adheres to widely accepted syntax (with optional branch lengths), enabling seamless integration with phylogenetic software ecosystems.
+ +## Usage Example + +```go +root := obiphylo.NewPhyloNode("Root") +leafA := obiphylo.NewPhyloNode("Species_A") +leafB := obiphylo.NewPhyloNode("Species_B") + +root.AddChild(leafA, 1.2) +root.AddChild(leafB, math.NaN()) // distance omitted in output + +leafA.SetAttribute("bootstrap", 95) +root.Newick(0) // → "(Species_A:1.2,Species_B);" +``` diff --git a/autodoc/docmd/pkg_obiseq.md b/autodoc/docmd/pkg_obiseq.md new file mode 100644 index 0000000..e1ea97c --- /dev/null +++ b/autodoc/docmd/pkg_obiseq.md @@ -0,0 +1,70 @@ + + + +# BioSequence Attribute & Sequence Management (`obiseq`) — Public API Overview + +The `obiseq` package provides a high-performance, thread-safe framework for representing and manipulating biological sequences (DNA/RNA/protein) in Go. It supports rich metadata, annotations, quality scores, taxonomic integration, and efficient batch processing—ideal for NGS pipelines like OBITools4. + +## Core Sequence Representation + +- `BioSequence`: Immutable-like container for sequence data (`[]byte`), ID, definition, qualities, features, and annotations. +- `NewBioSequence(...)`, `NewEmptyBioSequence(cap)`: Constructors supporting initialization with ID, sequence, definition, and optional qualities. +- `Id()`, `Definition()`: Accessors for core metadata fields (ID normalized to lowercase). +- `Sequence()` / `String()`: Returns the sequence as a copy or human-readable string. +- `Len()`, `HasSequence()` / `Composition()`: Length, presence check, and nucleotide composition (`a,c,g,t,o`). +- `MD5()`, `MemorySize()` / `Recycle()`: Integrity checksum, memory footprint estimation, and safe object pooling reset. + +## Attribute & Annotation System + +- `Annotations()`, `HasAnnotation(key)`: Read-only access to generic metadata map. +- Thread-safe via internal mutex (`AnnotationsLock()`).
+- `GetAttribute(key)`, `SetAttribute(key, value)` / typed getters (`GetIntAttribute(...)`) with automatic type coercion. +- `Keys()` & `HasAttribute(key)`: Enumerate and check presence of attributes (including `"id"`, `"sequence"`). +- `AttributeKeys(skip_map, skip_definition)`: Aggregates all attribute keys across a collection. + +## Quality & Feature Support + +- `Qualities()` / `SetQualities(...)`: Per-base quality scores (Phred+40 default). +- `HasQualities()`, `Write(...)`, `Clear()` / quality ASCII conversion. +- `Features()`: Optional raw feature table (e.g., GenBank/EMBL). + +## Pairing & Taxonomy + +- `PairTo(p)`, `IsPaired()`, `UnPair()` / batch pairing for read-pairs. +- Taxonomic annotation: + - `Taxid()`, `SetTaxid(...)`, `Taxon(taxonomy)` + - Rank-specific: `SetSpecies()`, `SetGenus()` / generic via `SetTaxonAtRank(rank)` + - Full path & LCA: `Path()`, `SetTaxonomicDistribution(...)` + +## Classification, Filtering & Transformation + +- Classifiers: + - `AnnotationClassifier`, `DualAnnotationClassifier` / predicate-based (`PredicateClassifier`) + - Hashing, rotation & composite strategies (e.g., `CompositeClassifier`) +- Predicates: + - Length, abundance (`IsMoreAbundantOrEqualTo`) / regex matching on ID/sequence + - Expression-based (`ExpressionPredicat`), paired-end support +- Workers: + - `EditIdWorker`, `EditAttributeWorker` (via OBILang expressions) + - Taxonomic annotators (`MakeSetSpeciesWorker`, `LCA`) / reverse-complement & subsequence workers + +## Collection Management & Efficiency + +- `BioSequenceSlice`: Optimized batch container with: + - Pool-aware allocation (`NewBioSequenceSlice`, `EnsureCapacity`) + - Efficient push/pop, sorting (on count/length), and merging +- `Merge(...)`: Sequence & slice-level consensus with stat propagation. +- Slice/annotation pooling: + - `GetSlice`, `RecycleSlice` / annotation recycling via pools +- Iterators: + - `Kmers(k)`: Lazy k-mer generator using Go’s new iterator protocol. 
+ +## Utility & Extension + +- IUPAC support: `SameIUPACNuc(a, b)` for ambiguity-aware base comparison. +- Reverse complement: `ReverseComplement(inplace)`, mutation coordinate adjustment (`_revcmpMutation`). +- Subsequence extraction: `Subsequence(from, to, circular)` with quality & annotation preservation. +- Expression extensions (via OBILang): + - `gc`, `gcskew` / `elementof`, `sprintf`, `ifelse` + +All methods ensure correctness via safe type conversions, locking semantics, and graceful fallbacks—enabling scalable bioinformatics workflows. diff --git a/autodoc/docmd/pkg_obistats.md b/autodoc/docmd/pkg_obistats.md new file mode 100644 index 0000000..dc3c209 --- /dev/null +++ b/autodoc/docmd/pkg_obistats.md @@ -0,0 +1,126 @@ + + + +# `obistats` Package — Public API Overview + +The `obistats` package delivers lightweight, numerically robust statistical and combinatorial utilities for Go. Designed for performance-critical applications (e.g., benchmarking, bioinformatics), it avoids external dependencies beyond core math libraries and focuses on **accuracy**, **type safety**, and **modularity**. + +--- + +## 🧮 Numerical Utilities + +- `maxint`, `minint`: Return the maximum/minimum of two integers. +- `sumint(xs []int) int`: Computes sum over integer slice. +- `bisect(f, low, high float64) (root, success bool)`: Finds root via bisection; requires `f(low)*f(high)<0`. +- `bisectBool(f, low, high int) (x1, x2 int)`: Locates boolean transition point; panics if `f(low)==f(high)`. +- `series(f func(int) float64) (sum, converged bool)`: Infinite sum via convergence detection. + +--- + +## 📊 Descriptive Statistics + +- `Max[T constraints.Float|constraints.Integer](data []T) T`: Max value in slice (signed ints/floats). +- `Min[T ...]`: Min over all integer types.
+- `Mode[int...]`: Most frequent value in signed int slice (map-based). + +--- + +## 📐 Central Tendency & Dispersion + +- `Median[T Number](data []T) float64`: Non-mutating median (copy + sort). +- `Mean[T Number](data []T) float64`: Arithmetic mean. + +--- + +## 📈 Weighted & Unweighted Samples + +- `Sample` struct: Encapsulates values, optional weights (`Weights []float64`), and `Sorted bool`. +- Methods: + - `Mean()`, `GeoMean()` (weighted), `Sum()`, `Weight()` + - `Variance()`, `StdDev()` (unweighted only) via Welford’s algorithm + - `Percentile(p float64)` (Hyndman–Fan R8), `IQR()` + - Bounds: min/max (`O(1)` if sorted & unweighted) + +--- + +## 📉 Probability Distributions + +- **Beta-Binomial**: + - `LogProb(x)`, `Prob(x)` (PMF), + - `CDF()`/`LogCDF()`: Analytical via hypergeometric (`HypPFQ`) + log-beta. + - Moments: mean, variance; mode with edge-case handling. + +- **Normal**: `Mu`, `Sigma`; methods: + - PDF/CDF/InvCDF (Acklam’s algorithm), Rand(), `Bounds()`. + +- **Student *t***: + - PDF/CDF via log-gamma & regularized incomplete beta (`mathBetaInc`). + +- **Kolmogorov–Smirnov for Beta**: + - `BetaKolmogorovDist(data []float64, α, β float64)`: Max deviation between empirical CDF of cumulative sums and theoretical Beta CDF (uses `1/(i+1)` estimator). + +--- + +## 🧪 Statistical Tests + +- **Two-sample tests**: + - `TTest()`: Welch’s *t*-test (unequal variances). + - `UTest()`: Mann–Whitney *U* (non-parametric; handles ties, exact for small samples). +- **One-sample/paired**: `TwoSampleTTest`, `PairedTTest`, `OneSampleTTest`. +- All return structured result (`P` p-value, sample sizes, alt. hypothesis). + +--- + +## 📦 Nonparametric Distribution Helpers + +- **Mann–Whitney U distribution (`UDist`)**: + - Exact PMF/CDF via DP (no ties) or Cheung–Klotz algorithm (with ties). + - `PMF(U)`, `CDF(U)`; supports tie multiplicities. + +--- + +## 🔢 Combinatorics & Log-Space Arithmetic + +- `Lchoose(n, x int) float64`: log-binomial coefficient via `math.Lgamma`.
+- `Choose(n, x int) float64`: exponentiated log-binomial (note: arg order in impl may be reversed). +- `LogAddExp(x, y float64)`: Stable `log(eˣ + eʸ)` using max+`Log1p`. + +--- + +## 🧩 Random Sampling + +- `SampleIntWithoutReplacement(n, max int) []int`: + - Uniform sampling *without replacement* in O(*n*) time/memory. + - Uses reservoir-style mapping with swap trick for uniqueness. + +--- + +## 📊 Benchmark Analysis & Formatting + +- **`Collection`, `Table`, `Row`**: Structured aggregation and display of benchmark metrics. +- **Metrics processing**: + - Outlier removal (Tukey’s fences), min/mean/max. +- **Delta comparison**: + - `FormatDiff()`: Symmetric ±% deviation; semantic direction (`+1`/`−1`). + - `GeoMean()` row for overall summary. +- **Formatting**: + - `Scaler`: SI-scaled unit-aware formatting (e.g., `"1.23 ms/op"`). + - `timeScaler`, unit detection (`hasBaseUnit`). + +--- + +## 🧭 Sorting & Ordering + +- `Order` type: Custom row sort function. + - Predefined: `ByName`, `ByDelta`. +- `Sort(t *Table, order Order)`: Stable sort via `sort.SliceStable`. + +--- + +## 🧰 Utility Helpers + +- `mathSign(x float64)`: Sign function (`NaN` → `NaN`). +- Precomputed factorials: `smallFact[0..20]`. + +> ⚠️ *All functions assume valid inputs; invalid parameters (e.g., `N≤0`, α/β ≤ 0) may panic.* +> ✅ *No mutability of input slices unless explicitly stated (e.g., `Sample.Sort()`).* diff --git a/autodoc/docmd/pkg_obisuffix.md b/autodoc/docmd/pkg_obisuffix.md new file mode 100644 index 0000000..dd62884 --- /dev/null +++ b/autodoc/docmd/pkg_obisuffix.md @@ -0,0 +1,57 @@ +# obisuffix: Suffix Array Package for Biological Sequence Analysis + +The `obisuffix` package implements a suffix array tailored to biological sequences, enabling efficient lexicographic ordering and prefix analysis across multiple input sequences. 
It supports DNA, RNA, and protein data via integration with `obiseq.BioSequenceSlice`, making it suitable for repeat detection, k-mer mining, and alignment-free comparison workflows. + +## Core Data Structures + +### `Suffix` +Represents a single suffix by storing: +- `Idx int`: Index of the source sequence in the input slice. +- `Pos int`: Starting position (0-based) within that sequence. + +### `SuffixArray` +Encapsulates: +- `Data []Suffix`: Sorted list of all suffixes. +- `Sequences obiseq.BioSequenceSlice`: Original input sequences (immutable reference). +- `Common []int`: Cached longest common prefix lengths between adjacent suffixes (`Data[i]` and `Data[i+1]`). Lazily computed. + +## Public Functions + +### `BuildSuffixArray(data obiseq.BioSequenceSlice) *SuffixArray` +Constructs a suffix array from one or more biological sequences: +- Enumerates **every** suffix of every sequence (i.e., for a sequence `s`, adds all `(Idx, Pos)` where `0 ≤ Pos < len(s)`). +- Sorts suffixes lexicographically using a deterministic comparator (`SuffixLess`): + - Primary: Compare nucleotide/amino-acid content character-by-character. + - Tie-breakers (if prefixes match up to min length): + 1. Shorter suffix comes first. + 2. Lower `Idx` (sequence index). + 3. Earlier `Pos`. +- Precomputes and caches the common-prefix array via internal call to `CommonSuffix()`. + +### `(*SuffixArray) CommonSuffix() []int` +Computes the length of the longest common prefix (LCP) between each adjacent pair in `Data`: +- Returns a slice of length `len(Data)-1`, where `Common[i] = LCP(Data[i], Data[i+1])`. +- Uses memoization: If already computed (e.g., after `BuildSuffixArray`), returns the cached result. +- Avoids redundant comparisons by leveraging sorted order and early termination. + +### `(*SuffixArray) String() string` +Returns a formatted, human-readable table for inspection: +- Columns: `Common`, `Idx`, `Pos`, and the actual suffix string (via `.Substring()`). 
+- Useful for debugging, educational demos, or visualizing repeat patterns and overlaps. + +## Semantic Guarantees & Design Choices + +- **Deterministic ordering**: Tie-breaking rules ensure reproducibility across runs and platforms. +- **Memory efficiency**: Stores only indices (not copies of suffixes), critical for large genomic datasets. +- **Biological fidelity**: Respects alphabet semantics (e.g., `A < C < G < T` for DNA) via underlying sequence comparison. +- **Lazy evaluation**: `CommonSuffix()` is invoked only when needed (e.g., on first call to `.String()`, or explicitly), avoiding unnecessary work. +- **Transparency**: All public fields are accessible, enabling downstream analysis without encapsulation barriers. + +## Typical Use Cases + +- Detecting tandem repeats or low-complexity regions across multi-sequence datasets. +- Building suffix arrays for *de novo* assembly validation or error correction. +- Serving as a building block in alignment-free metrics (e.g., Jaccard similarity over shared *k*-mers). +- Supporting pattern mining in metagenomic or pangenome collections. + +> **Note**: This package focuses on *exact* suffix matching; probabilistic or approximate extensions are out of scope. diff --git a/autodoc/docmd/pkg_obitable.md b/autodoc/docmd/pkg_obitable.md new file mode 100644 index 0000000..5c5506f --- /dev/null +++ b/autodoc/docmd/pkg_obitable.md @@ -0,0 +1,39 @@ +# `obitable`: Row-Oriented Data Table for Biological Sequences + +The `obitable` package provides a lightweight, row-oriented data table structure (`Table`) for managing biological sequence metadata in Go. It is designed to support heterogeneous, schema-flexible tabular representations of sequences while maintaining strong interoperability with `obiseq`, the core biological sequence module in OBITools4. + +## Core Types + +- **`Header`**: An ordered list of column names (alias for `stl4go.Ordered`). It defines the schema’s *column order* but not types. 
+- **`Row`**: A flexible, map-like structure (`map[string]interface{}`) representing a single record. Values may be of any Go type. +- **`Table`**: Encapsulates both a `Header`, column-type metadata (`ColType map[string]reflect.Type`), and an ordered slice of `Row`s. Enforces type consistency *per column* across rows. + +## Row Generators (Lazy/On-Demand Construction) + +- **`RowFromMap(map[string]interface{}, interface{}) RowFunc`** + Returns a callable `func(string) interface{}` (i.e., a *row accessor*). For any column name, it retrieves the corresponding value from the input map; missing keys are replaced by a configurable default (`navalue`, typically `nil`). Enables efficient wrapping of generic maps as row-like functions. + +- **`RowFromBioSeq(seq obiseq.BioSequence, navalue interface{}) RowFunc`** + Constructs a row accessor specialized for `obiseq.BioSequence`. Maps standard fields (`id`, `description`, `sequence`, etc.) and dynamically extracts all sequence annotations (e.g., `qualifiers` in FASTA/FASTQ) as column entries. Missing fields default to `navalue`. + +## Semantic Capabilities + +- **Heterogeneous column types**: Each column may hold values of any Go type (e.g., `string`, `int64`, `[]byte`), with runtime type tracking via `ColType`. +- **Uniform metadata access**: Enables seamless integration of sequence identifiers, raw sequences, and rich annotation sets (e.g., taxonomy IDs, quality scores). +- **Streaming-friendly**: Row generators avoid materializing full row maps until needed—ideal for large-scale pipeline processing. +- **Interoperability**: Built explicitly to work with `obiseq` and future extensions of OBITools4. + +## Public API Summary + +| Function / Type | Purpose | +|-----------------|---------| +| `NewTable(header Header, colType map[string]reflect.Type) *Table` | Instantiate a new table with schema. | +| `Append(t *Table, row Row)` | Append one fully materialized row to the table. 
| +| `AppendFunc(t *Table, f RowFunc)` | Append a row via lazy accessor (no intermediate map). | +| `RowFromMap(...), RowFromBioSeq(...)` | Create reusable row accessors for map-based or sequence-backed data. | +| `ToMap(row Row) map[string]interface{}` | Materialize a row as a plain Go map. | +| `ColType(t *Table) map[string]reflect.Type` | Expose column type metadata. | +| `Header(t *Table) Header` | Retrieve the table’s ordered header (column names). | +| `Rows(t *Table) []Row` | Access all rows as a slice (for iteration/export). | + +> **Note**: All public functions operate on `Table`, `Header`, and `Row` types. Internal helpers (e.g., type-checking utilities) are not exposed. diff --git a/autodoc/docmd/pkg_obitax.md b/autodoc/docmd/pkg_obitax.md new file mode 100644 index 0000000..efbb6ee --- /dev/null +++ b/autodoc/docmd/pkg_obitax.md @@ -0,0 +1,76 @@ +# ObiTax: Semantic Overview of Public Functionalities + +`obitax` is a Go package for managing hierarchical taxonomic data in biodiversity pipelines. It provides thread-safe, iterator-based APIs to query, filter, and traverse taxonomies—while supporting robust defaults, string interning, type-safe identifiers (`Taxid`), and phylogenetic interoperability. + +## ✅ Default Taxonomy Management +- **`.SetAsDefault()`**: Registers a `Taxonomy` instance as the global default. +- **`.OrDefault(panicOnNil bool)`**: Substitutes `nil` receivers with the default taxonomy (panics if none exists and `panicOnNil=true`). +- **`.HasDefaultTaxonomyDefined()` / `.OrDefault()`**: Enables safe fallback without boilerplate. + +## 🔍 Core Filtering Operations (Iterator-Centric) +All filters return `*ITaxon`, enabling lazy, composable pipelines. + +- **`.IFilterOnName(name string, strict bool, ignoreCase bool)`** + Filters taxa by name: exact match (`strict=true`) or regex (default). Case-insensitive if `ignoreCase`. Deduplicates via internal node ID. 
+ +- **`.IFilterOnTaxRank(rank string)`** + Filters taxa whose rank matches (normalized via taxonomy’s internalized ranks map). Supports chaining and concurrent iteration. + +- **`.IFilterOnSubcladeOf(parent *Taxon)`** + Yields descendants of `parent` (via `.IsSubCladeOf()`). Works on iterators, sets, slices, and taxonomies. + +- **`.IFilterBelongingSubclades(clades *TaxonSet)`** + Filters taxa belonging to any clade in `clades`. Optimized for single-clade case (reuses `.IFilterOnSubcladeOf`). + +## 🌳 Hierarchical Navigation & Relationship Queries +- **`.IsSubCladeOf(parent *Taxon) bool`**: Checks if current taxon descends from `parent`. +- **`.IsBelongingSubclades(clades *TaxonSet) bool`**: Checks if current taxon—or any ancestor—is in `clades`. +- **`.IPath() *ITaxon`**: Iterates upward from taxon to root (breadth-first via `.IPath()`). +- **`.TaxonAtRank(rank string)` / shortcuts (e.g., `.Species()`, `.Genus()`)**: Traverse ancestors to find first match at given rank. + +## 🧠 String Interning & Deduplication +- **`InnerString.Innerize(value string) *string`**: Thread-safe deduplication of strings (e.g., names, ranks). Returns shared pointer for equality checks. +- **`.Slice() []string`**: Snapshot of all interned strings (read-only). + +## 🔢 Taxonomic Identifiers (`Taxid`) +- **`FromInt(int)` / `FromString(string) *string`**: Validates and normalizes IDs (e.g., `"tx:12345"` → interned `"12345"`). Enforces code prefix, filters to ASCII digits/letters. + +## 📜 Taxon String Parsing +- **`ParseTaxonString(taxonStr string)`**: Parses `"code:taxid [name]@rank"` into structured components. Validates brackets, colons, and field presence. + +## 🧬 Taxonomy & Node Model +- **`Taxon`**: Encapsulates node ID, parent/children links, scientific name (and alternatives), rank, and metadata. + - `.Name(class)`, `.ScientificName()`: Flexible name access (case-insensitive matching via `IsNameEqual`/regex). 
+ - `.SetMetadata(key, value)`, `.GetMetadata(key)` / iteration: Extensible annotations. + - `.String()`: Human-readable `"code:id [name]@rank"` format. + +- **`Taxonomy`**: Manages full hierarchy: + - `.AddTaxon()` / `.InsertPathString()`: Build trees incrementally. + - `.Root()`/`.SetRoot()` / `.HasRoot()`: Root node control (required for LCA). + - `.AsPhyloTree()` → `obiphylo.PhyloNode`: Export to phylogenetic format. + +- **`TaxonSet`**: Efficient set of `*TaxNode`s with alias support: + - `.Alias(id, taxon)`: Non-canonical ID mapping. + - `.Sort()` → topologically sorted slice (parents before children). + - `.AsPhyloTree(root)`. + +- **`TaxonSlice`**: Ordered, type-safe path representation: + - `.String()` → `"id@name@rank|..."` (leaf-to-root). + - Enforces taxonomy coherence; panics on mismatch. + +## 🧮 Lowest Common Ancestor (LCA) +- **`.LCA(t2 *Taxon) (*Taxon, error)`**: Computes most specific shared ancestor of two taxa in same rooted taxonomy. Uses path-based backward traversal. + +## 🔄 Iterator Composition & Utilities (`ITaxon`) +- **`.Next()`, `.Get()` / `.Finished()`**: Standard iteration control. +- **`.Push(taxon)`, `.Close()`**, and **`Split() / Concat(...)`**: Goroutine-driven streaming, parallel consumption. +- **`.ISubTaxonomy()` / `.ITaxon(taxid)`**: Breadth-first subtree traversal from root or given ID. +- **`.AddMetadata(name, value)`**: Wraps iterator to inject metadata into each taxon. +- **`.Consume()`**: Exhausts an iterator (e.g., for side-effect-only pipelines). + +## 🛡️ Safety & Robustness +- Nil-safe accessors (no panics unless explicitly configured). +- Explicit error messages for invalid inputs, cross-taxonomy queries, or unrooted hierarchies. +- Interning reduces memory footprint and accelerates equality checks. + +> Designed for scalability in large-scale metabarcoding, biodiversity informatics, and phylogenetic pipelines. 
diff --git a/autodoc/docmd/pkg_obitools_obiannotate.md b/autodoc/docmd/pkg_obitools_obiannotate.md new file mode 100644 index 0000000..674dc9e --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiannotate.md @@ -0,0 +1,42 @@ +# `obiannotate`: Semantic Description of Public Features + +The `obiannotate` package delivers modular, composable sequence annotation workers for biological sequences (FASTA/FASTQ) within the OBITools4 ecosystem. Each worker returns an `obiseq.SeqWorker`, enabling declarative pipeline construction via chaining or conditional execution. All functionality is exposed through both programmatic and CLI interfaces. + +## 1️⃣ Attribute Management +Workers manipulate sequence annotations (metadata slots) with fine-grained control: +- **`DeleteAttributesWorker(keys)`**: Removes specified annotation keys; silently skips missing ones. +- **`ToBeKeptAttributesWorker(keys)`**: Retains only listed keys; discards all others. +- **`ClearAllAttributesWorker()`**: Strips *all* annotations from each sequence. +- **`RenameAttributeWorker(mapping)`**: Renames keys using a dict (e.g., `{"old": "new"}`); skips records if source key is absent. + +## 2️⃣ Sequence Editing +Direct manipulation of sequence content and derived metadata: +- **`CutSequenceWorker(start, end)`**: Extracts subsequence from `start` to `end` (1-based; supports negative indices). Fails with error or discards sequence on invalid bounds. +- **`AddSeqLengthWorker()`**: Adds `seq_length = len(sequence)` annotation. +- **`EvalAttributeWorker(expr, target_slot=None)`**: Evaluates Python expressions (e.g., `"seq_length > 200"`) to set annotations; used internally by `EditAttributeWorker`. + +## 3️⃣ Taxonomic Annotation +Enriches sequences with taxonomic context using NCBI taxonomy: +- **`AddTaxonAtRankWorker(rank)`**: Adds taxon name at specified rank (e.g., `"species"`) to slot `taxon_at_rank`. +- **`AddTaxonRankWorker()`**: Infers and annotates taxonomic rank (e.g., `"species"`). 
+- **`AddScientificNameWorker()`**: Adds `scientific_name = "Homo sapiens"`-style label. +- **`AddTaxonomicPathWorker()`**: Adds full lineage path (semicolon-separated). + +## 4️⃣ Pattern Matching +Detects DNA motifs with tolerance for mismatches/indels: +- **`MatchPatternWorker(pattern, max_errors=0, allow_indel=False)`**: + - Scans both strands via reverse-complement. + - Annotates: `slot_location` (start/end), `slot_match`, and `slot_error`. + - Uses **Aho-Corasick** for efficient multi-pattern search (file-based via `obicorazick.AhoCorasickWorker`). + +## 5️⃣ CLI-Driven Pipeline Construction +Bridges command-line flags to composable workers: +- **`CLIAnnotationWorker(args)`**: Builds a composite worker from CLI flags (e.g., `--pattern`, `--taxonomic-rank`). +- **`CLIAnnotationPipeline(args)`**: Wraps the worker in a conditional pipeline (using `obigrep` predicates) and parallelizes via multiprocessing. + +## 6️⃣ Utility & Validation +- **`CLIHasPattern(pattern)`**: Returns a worker that filters sequences matching `pattern`. +- **`CLICut(start, end)`**: Returns a cut worker for CLI usage. +- All workers validate inputs (e.g., malformed `--cut` triggers fatal exit with log). + +All public features are **stateless**, composable via `ChainWorkers`, and designed for high-throughput, scriptable annotation workflows. diff --git a/autodoc/docmd/pkg_obitools_obiclean.md b/autodoc/docmd/pkg_obitools_obiclean.md new file mode 100644 index 0000000..5bf622d --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiclean.md @@ -0,0 +1,86 @@ +# Obiclean: PCR Amplicon Error Correction & Chimera Detection + +Obiclean is a Go package for cleaning high-throughput amplicon sequencing data. It corrects PCR/sequencing errors by leveraging abundance-weighted sequence relationships and optionally detects chimeric artifacts using graph-based heuristics. Built for scalability, it integrates with OBITools4’s data model and supports IUPAC ambiguity codes. 
+ +## Core Concepts + +- **`seqPCR`**: Represents a sequence in one sample, with fields for raw count (`Count`) and post-clustering weight (`Weight`), plus graph edges, annotations, and cluster membership. +- **Directed similarity graphs**: Edges point from more abundant (father) to less abundant (son) sequences differing by ≤ *d* nucleotides. +- **Abundance-weighted correction**: Less abundant sequences are penalized unless supported by strong graph evidence. + +--- + +## Public Functionalities + +### 1. **Graph Construction** + +- `BuildSeqGraph(samples, distance)`: Builds a mutation graph across samples. + - Compares all sequence pairs within/between samples. + - Adds directed edges only if father has higher weight and differs by ≤ `distance` mismatches. + - Uses parallel workers (`buildSamplePairs`) for one-error edges and `FastLCSScore` for multi-error extensions. + +- `FilterGraphOnRatio(samples, ratio)`: Removes spurious edges violating a power-law decay model: + `weight_ratio < (ratio)^distance`. Ensures only statistically plausible edges remain. + +--- + +### 2. **Annotation & Status Assignment** + +- `annotateOBIClean(samples)`: Populates per-sequence annotations: + - `"obiclean_head"`: `true` if the sequence has no incoming edges (i.e., is a cluster head). + - `"obiclean_singletoncount"`, `"internalcount"`, `"headcount"`: Global counts of sequences in each status across all samples. + +- `ObicleanStatus(seq) string`: Returns one of: + - `"s"`: Singleton (no edges). + - `"h"`: Hub (has outgoing → sons, but no incoming father) — likely erroneous ancestor. + - `"i"`: Internal (has both parents and children) — intermediate error variant. + +- `Status(seq, sample)` / `Weight(seq, sample)`: Get/set per-sample status (`h/i/s`) and weight annotations. + +--- + +### 3. **Clustering & Head Selection** + +- `GetCluster(seq, sample)`: Retrieves or initializes cluster membership (e.g., `"cluster_42"`). 
+- `GetMutation(seq) map[string]int`: Returns mutation counts (e.g., `"A->T@42": 3`). +- `Mutation(samples)`: Populates mutation annotations from graph edges. + +--- + +### 4. **Chimera Detection** + +- `AnnotateChimera(samples)`: Flags chimeric sequences per sample: + - Filters candidates to *head* sequences only. + - For each candidate `s`, scans more abundant parents for prefix/suffix matches: + - Uses IUPAC-aware comparisons (`commonPrefix`, `commonSuffix`). + - Skips near-identical pairs (one edit difference via `oneDifference`). + - Flags as chimera if: + ``` + maxPrefixLen + maxSuffixLen ≥ L + AND not fully contained in one parent (maxSuffix < L) + ``` + - Annotation format: + `"parent_left/parent_right@(overlap)(start)(end)(len)"`. + +--- + +### 5. **Filtering & Output Control** + +- CLI-style filters (applied post-processing): + - `OnlyHead`: Keep only `"obiclean_head"` sequences. + - `NotAlwaysChimera`: Exclude sequences flagged chimera in *all* samples. + - `MinSampleCount(n)`: Retain sequences present ≥ *n* times across samples. + +- Optional exports: + - `SaveGMLGraphs(samples)`: Writes per-sample graphs in GML (node shapes/colors encode abundance/status). + - `EmpiricalDistCsv(samples)`: Exports substitution statistics (e.g., A→C rates at position *i*) to compressed CSV. + - `EstimateRatio(samples, minStatCount)`: Collects distance-1 substitution events for downstream modeling. + +--- + +## Design Highlights + +- **IUPAC-compliant comparisons**: Nucleotide equality via `obiseq.SameIUPACNuc`. +- **Annotation-driven**: No in-place mutation; all metadata stored via `BioSequence.Annotations`. +- **Scalable parallelism**: Uses goroutines + channels for pairwise comparisons; integrates `progressbar`/Logrus. +- **Flexible thresholds**: Configurable via flags (`distance`, `ratio`, `min-sample-count`), defaulting to sensitivity-optimized values. 
diff --git a/autodoc/docmd/pkg_obitools_obicleandb.md b/autodoc/docmd/pkg_obitools_obicleandb.md new file mode 100644 index 0000000..67d12e0 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obicleandb.md @@ -0,0 +1,54 @@ +# `obicleandb` Package Overview + +The `obicleandb` package delivers semantic curation and trust scoring for biological sequences (e.g., DNA barcodes) within the OBITools4 ecosystem. It combines taxonomic consistency checks, alignment-based discrimination tests, and statistical confidence estimation to ensure high-fidelity sequence datasets for downstream analysis. + +## Core Functionalities + +### 1. **Input & Taxonomy Integration** +- Loads reference taxonomies (e.g., NCBI) via `obioptions.LoadTaxonomyOptionSet`. +- Parses heterogeneous inputs (FASTA/FASTQ) using `obiconvert.InputOptionSet`, supporting streaming and format auto-detection. +- Integrates taxonomic lineage information into sequence metadata for downstream filtering. + +### 2. **Taxonomy-Guided Dereplication & Filtering** +- `ICleanDB` orchestrates a pipeline that first filters sequences by required taxonomic ranks (e.g., species, genus). +- Dereplicates identical sequences *within* taxonomic groups (e.g., collapse duplicates per `taxid`), preserving only one representative per unique sequence–taxon pair. +- Ensures minimal taxonomic resolution before scoring (e.g., requires at least genus-level assignment). + +### 3. **Sequence Trust Scoring** +- `SequenceTrust`: Computes *local* confidence as + \[ + s = 1 - \frac{1}{n + 1} + \] + where `n` is the count of identical sequences sharing taxonomic labels—interpreting duplicates as empirical validation. +- `SequenceTrustSlice`: Computes *global* confidence via pairwise alignment distances (LCSS scores) among group members. + - Normalizes observed intra-group distance by the median pairwise distance across all groups (`obicleandb_median`). + - Estimates effective sample size (`obicleandb_trusted_on`) using group composition and redundancy. 
+ +### 4. **Higher-Rank Discrimination (Mann–Whitney U Test)** +- `MakeSequenceFamilyGenusWorker` tests whether a sequence’s alignment scores to conspecifics are significantly better than to outgroups at genus/family level. +- Uses `obialign.FastLCSScore` for rapid approximate alignment scoring on grouped sequences. +- Outputs a *p*-value stored in `obicleandb_trusted`, indicating confidence that the sequence belongs to its assigned higher-rank taxon. + +### 5. **Efficient Distance Storage** +- `diagCoord` implements compact triangular indexing for pairwise distance matrices, reducing memory footprint by ~50% while enabling fast lookup. + +### 6. **Pipeline Orchestration** +- `ICleanDB` unifies all steps: input → taxonomy loading → filtering/dereplication → trust scoring. +- Returns an iterator of cleaned, annotated sequences with standardized attributes. + +## Output Attributes + +| Attribute | Description | +|----------|-------------| +| `obicleandb_trusted` | Final confidence score (probability of correct taxonomic assignment) | +| `obicleandb_trusted_on` | Effective group size used for scoring (accounts for redundancy) | +| `obicleandb_level` | Taxonomic rank used in discrimination test (`genus`, `family`, or `"none"`) | +| `obicleandb_median` | Median pairwise LCSS distance used as normalization baseline | + +## Design Principles + +- **Modularity**: Workers (e.g., `SequenceTrust`, `MakeSequenceFamilyGenusWorker`) are composable and reusable. +- **Parallelism**: Batched processing via `obidefault` settings for scalability across large datasets. +- **Robustness**: Gracefully handles sparse taxonomy, small group sizes, and missing labels. + +This package enables rigorous pre-processing of metabarcoding datasets—critical for reducing false positives in OTU/ASV inference and ecological interpretation. 
diff --git a/autodoc/docmd/pkg_obitools_obiclust.md b/autodoc/docmd/pkg_obitools_obiclust.md new file mode 100644 index 0000000..2b06943 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiclust.md @@ -0,0 +1,76 @@ +# `obiclust` Package: Semantic Overview + +The `*obiclust*` package provides object-oriented implementations for clustering algorithms, emphasizing modularity, extensibility, and semantic clarity—while `*obiclust/obiclean*` extends this to biological sequence data (e.g., amplicons, OTUs/ASVs), integrating alignment-aware similarity and abundance-sensitive heuristics. + +## Core Clustering Infrastructure (`obiclust`) + +### Abstract Base Class: `Clusterer` +- Defines a unified interface for all clustering algorithms. +- Public methods: + - `fit(X, sample_weight=None)`: Learns cluster structure from data. + - `predict(X)`: Assigns each sample to the nearest cluster (returns NumPy array of labels). + - `cluster_centers_`: Immutable attribute storing learned centroids. +- Designed for subclassing: custom clusterers override `_fit()` and `_predict()`. + +### Concrete Algorithms +- **`KMeans`** + - Configurable initialization: `kmeans++`, random. + - Parameters: max iterations, convergence tolerance (`tol`). +- **`HierarchicalClustering`** + - Agglomerative strategy with linkage options: `single`, `complete`, `average`. +- *(Optional extensions)* DBSCAN, GaussianMixture via composition or inheritance. + +### Semantic Data Handling +- Input validation: numeric-only matrices, non-empty inputs. +- Outputs are immutable NumPy arrays (labels/centers). +- Supports per-sample weights during fitting. + +### Evaluation & Validation +- Built-in metrics: Silhouette score, Davies–Bouldin index, WCSS. +- Cross-validation helper (`select_k`, `tune_linkage`) for hyperparameter selection. + +### Serialization & Typing +- `to_dict()` / `from_dict()`: Enables JSON persistence and reproducibility. +- Fully typed (PEP 484), Google-style docstrings, and usage examples included. 
+ +### Design Principles +- **Readability**: Method names reflect intent (e.g., `assign_clusters`, not `_step2`). +- **Separation of concerns**: Core logic decoupled from plotting, I/O, or preprocessing. +- **Minimal dependencies**: NumPy (required), SciPy (optional for metrics). + +## Biological Sequence Clustering (`obiclust/obiclean`) + +### Distance/Similarity Mode +- Switches between: + - **Similarity mode** (default): higher scores = more related. + - **Distance mode** (`--distance`): lower distances = closer. + +### Normalization Strategies +Controls how alignment scores are scaled before clustering: +- `NoNormalization`: raw score. +- `NormalizedByShortest` (`--shortest`) +- `NormalizedByLongest` (`--longest`) +- `NormalizedByAlignment` (default, via `--alignment`) — uses aligned length. + +### Clustering Strategy +- **Exact clustering** (`--exact`): optimal but computationally heavy. +- Greedy heuristic (default) for scalability. + +### Sample-Aware Processing +- Groups sequences by sample origin (`--sample`, `-s`). +- Filters low-sample-count variants via `--min-sample-count`. +- Ordering options: + - By length (`--length-ordered`) or abundance (`--abundance-ordered`). + - Optional ascending sort: `--ascending-sorting`. + +### Abundance Refinement +- **Ratio-based merging** (`--ratio`, `-r`): merges low-abundance sequences into high-abundance parents if their ratio ≤ threshold. +- **Head selection** (`--head`, `-H`): outputs only sequences flagged as “representative” in ≥1 sample. + +### Output & Diagnostics +- **Graph export** (`--save-graph`): DAG in GraphML format (for debugging). +- **Ratio table export** (`--save-ratio`): CSV of edge abundance ratios. +- Threshold control via `--distance`, `--threshold`. + +### Pipeline Integration +- Extends I/O options from `obiconvert`: seamless FASTA/FASTQ input/output, compatible with standard NGS pipelines. 
diff --git a/autodoc/docmd/pkg_obitools_obiconsensus.md b/autodoc/docmd/pkg_obitools_obiconsensus.md new file mode 100644 index 0000000..eb94ba3 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiconsensus.md @@ -0,0 +1,49 @@ +# `obiconsensus` Package: Semantic Overview + +The `obiconsensus` package delivers scalable, graph-based consensus and denoising tools for high-throughput biological sequence data within the OBITools4 ecosystem. It enables error correction, variant clustering, and consensus reconstruction from related amplicon or metagenomic reads—supporting both single-sample and multi-sample workflows. + +## Public API Summary + +### Core Algorithms & Utilities +- **`BuildConsensus()`**: + Constructs a consensus sequence via *de Bruijn graph* assembly of input reads. Automatically selects optimal `k`-mer size (fallback: longest common suffix analysis). Detects graph cycles and incrementally increases `k` until resolved. Optionally persists intermediate graphs (`*.gml`) and FASTA inputs. Output includes metadata: consensus flag, total read weight (summed abundances), `k`-mer size used, and graph statistics. + +- **`SampleWeight()`**: + Returns a closure that retrieves per-sequence sample abundances (e.g., read counts) from sequence annotations or statistics—enabling weighted graph operations. + +- **`SeqBySamples()`**: + Groups sequences by sample identifier, using a configurable annotation key (default: `"sample"`). Supports grouping based on either statistical attributes (`StatsOn`) or sequence metadata. + +- **`BuildDiffSeqGraph()`**: + Builds a *difference graph* where nodes represent unique sequences and edges encode single-nucleotide mutations (position + substitution). Uses `obialign.D1Or0` for exact alignment or approximate LCS-based distance scaling. Supports parallel edge computation and optional progress bar. 
+ +- **`MinionDenoise()`**: + Denoises sequences by identifying high-degree nodes (potential consensus hubs), building local consensuses via `BuildConsensus()`, and preserving low-degree nodes unchanged. Propagates sample annotations, weights, and metadata. + +- **`MinionClusterDenoise()`**: + Denoises via *weight-based clustering*: aggregates node weights (self + neighbors), selects local maxima as cluster heads, and builds consensus per neighborhood. + +- **`CLIOBIMinion()`**: + CLI orchestrator for end-to-end denoising: loads sequences, groups by sample (`--sample`), builds per-sample difference graphs (optional export via `--save-graph`), applies denoising (`MinionDenoise()` or `MinionClusterDenoise()`), optionally deduplicates output (`--unique`), and annotates sequence lengths. + +### Configuration & CLI Helpers +- **Clustering Mode**: `--cluster` (`-C`) enables graph-based clustering. +- **Distance Threshold**: `--distance` (`-d`, default: 1) sets max Hamming distance for edge inclusion. +- **K-mer Control**: `--kmer-size` (`SIZE`, default: -1 = auto-selected). +- **Sample Key**: `--sample` (`-s`, default: `"sample"`) defines the annotation field for sample grouping. +- **Filtering Options**: + - `--no-singleton`: excludes unique sequences. + - `--low-coverage` (default: 0) filters low-abundance sequences. +- **Output Options**: + - `--unique` (`-U`) enables deduplication (via `obiuniq`). + - `--save-graph DIR` exports graphs in GraphML. + - `--save-ratio FILE` writes edge abundance ratios as CSV. +- **Format Integration**: Works with `obiconvert` via unified input/output option sets (`InputOptionSet`, `OutputOptionSet`) for FASTA/FASTQ handling. +- **Getter Functions**: Typed accessors (e.g., `CLIDistStepMax()`, `CLIKmerSize()`) decouple argument parsing from core logic. + +## Design Principles +- **Parallelism**: Leverages goroutines and `sync.WaitGroup` for scalable graph construction. 
+- **Robustness**: Handles edge cases (e.g., single-sequence inputs) gracefully with logging. +- **Extensibility**: Modular architecture allows swapping alignment engines or graph representations. + +*Purpose: Accurate, reproducible consensus and denoising for NGS amplicon/metagenomic data at scale.* diff --git a/autodoc/docmd/pkg_obitools_obiconvert.md b/autodoc/docmd/pkg_obitools_obiconvert.md new file mode 100644 index 0000000..fffab47 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiconvert.md @@ -0,0 +1,62 @@ +# `obiconvert`: Semantic Overview of Public Functionalities + +The `obiconvert` package provides a robust, CLI-driven framework for converting and managing biological sequence data within the OBITools4 ecosystem. It enables format-agnostic input parsing, standardized output generation (FASTA/FASTQ/JSON), and configurable preprocessing—while preserving metadata semantics. + +## Input Handling + +- **`ExpandListOfFiles(check_ext bool, filenames ...string) []string`** + Expands file paths into a deduplicated list of eligible files. Supports local directories, symlinks (resolved), and remote URLs (`http(s)://`, `ftp://`). + When `check_ext=true`, filters files by extension: `.fasta[.gz]`, `.fastq[.fq][.gz]`, `.seq[.gz]`, `.gb[| gbff | dat ][.gz]`, and `.ecopcr[.gz]`. + +- **`CLIReadBioSequences(filenames ...string) obiiter.IBioSequence`** + Returns a lazy, streaming iterator over biological sequences from files or stdin. Automatically selects parsing strategy based on CLI flags: + - JSON-style (`--input-json-header`) + - OBI-compliant headers (`--input-OBI-header`, `--input-obi`) + - Heuristic auto-detection (default). 
+ + Configurable via CLI options: + - Parallel workers (`nworkers ≥ 2`) + - Batch size and memory limits + - `U→T` conversion for RNA (`--u-to-t`) + - Skip empty sequences (`--skip-empty`) + + Handles: + - Single/multiple files (with batched parallel reading) + - Paired-end input via `--paired-with` + - Fallback readers: FASTA, FASTQ, GenBank/EMBL, ecoPCR output, CSV + +- **`OpenSequenceDataErrorMessage(args ...string, err error)`** + Formats and logs user-friendly errors for input failures (stdin-only / single-file / multi-file), then exits with status `1`. + +## Output Handling + +- **`CLIWriteBioSequences(iter obiiter.IBioSequence, filenames ...string)`** + Writes sequences from an `IBioSequence` iterator to stdout or files, based on CLI options: + - **Format**: FASTQ (if quality scores present), FASTA, JSON (default), or generic sequence. + - **Header style**: Configured via `CLIOutputFastHeaderFormat()` → `"json"` or `"obi"`. + - **Compression**: Optional gzip (`--gzip`). + - **Paired-end output**: Automatically splits into `_R1`, `_R2` files via `BuildPairedFileNames`. + - **Parallelism**: Uses configurable workers (`WriteParallelWorkers()`). + +- **`BuildPairedFileNames(filename string) (string, string)`** + Generates paired-end filenames: `sample.fastq → sample_R1.fastq`, `sample_R2.fastq`. + +## Configuration & Integration + +- **`OptionSet(allow_paired bool)`** + Centralized CLI option setter. Enables modular setup for paired-end support and shared flags. + +- **Taxonomy Integration**: + Supports loading taxonomy via `obioptions.LoadTaxonomyOptionSet`. + +- **Progress Reporting**: + Displays a progress bar unless stderr is redirected or stdout pipes to another process. 
+ +## Design Principles + +✅ Lazy evaluation via iterators for memory efficiency +✅ Automatic format inference and parallel I/O scaling +✅ Symlink resolution, recursive globbing with extension filtering +✅ CLI-integrated configuration (header parsing mode, workers, batch size) + +All functionality is exposed through public functions and designed for composability with `obiformats`, `obiiter`, and `obidefault`. diff --git a/autodoc/docmd/pkg_obitools_obicount.md b/autodoc/docmd/pkg_obitools_obicount.md new file mode 100644 index 0000000..65e71b5 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obicount.md @@ -0,0 +1,55 @@ +# `obicount` Package Functional Overview + +The `obicount` package provides command-line interface (CLI) option parsing and internal state management for the `obicount` utility — a tool designed to compute biological sequence metrics from standard input formats (e.g., FASTA, FASTQ). Built on top of `go-getoptions`, it cleanly separates argument handling from core counting logic. + +## Core Functionalities + +### 1. **Counting Modes** +Three mutually exclusive or combinable counting modes are supported via CLI flags: + +| Flag | Long Form | Semantic Meaning | +|------|----------------|-----------------------------------------------| +| `-r` | `--reads` | Count total number of sequences (i.e., reads) | +| `-v` | `--variants` | Count unique sequence variants (distinct strings) | +| `-s` | `--symbols` | Sum of all symbol counts (i.e., total length across reads) | + +- **Default behavior**: If *none* of the above flags is provided, all three metrics are computed and reported — ensuring backward-compatible full-report output. + +### 2. **State Tracking** +Internal state variables track which metrics are active: + +- `__read_count__` +- `__variant_count__` +- `__symbol_count__` + +Each is set to `true` when its corresponding flag (`--reads`, etc.) appears on the command line, *or* in default mode (no flags), where all are enabled. + +### 3. 
**Public Query Functions** +Three exported helper functions allow runtime introspection of active metrics: + +| Function | Returns `true` if… | +|---------------------------------|----------------------------------------------------------| +| `CLIIsPrintingReadCount()` | Read count is enabled (explicitly requested or default) | +| `CLIIsPrintingVariantCount()` | Variant count is enabled (explicitly requested or default) | +| `CLIIsPrintingSymbolCount()` | Symbol count is enabled (explicitly requested or default) | + +These functions decouple counting logic from CLI parsing, enabling modular and testable design. + +### 4. **Semantic Guarantees** +- All query functions follow *inclusive semantics*: they return `true` both when the option is explicitly set and in default mode. +- This ensures intuitive behavior: no flags → full report; any flag subset → only requested metrics. + +### 5. **Separation of Concerns** +- The package handles *only* CLI parsing and state management. +- File I/O, sequence decoding (FASTA/FASTQ), counting algorithms, and output formatting reside in separate modules — promoting maintainability and reuse. + +## Usage Example (Conceptual) + +```bash +obicount -r input.fasta # prints only read count +obicount --variants input.fastq # prints unique variant count only +obicount -s # prints total symbol (length) sum +obicount input.fasta # prints all three metrics +``` + +This design supports extensibility, clarity, and robustness in biological sequence analysis pipelines. diff --git a/autodoc/docmd/pkg_obitools_obicsv.md b/autodoc/docmd/pkg_obitools_obicsv.md new file mode 100644 index 0000000..a3f2721 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obicsv.md @@ -0,0 +1,55 @@ +# Functional Overview of the `obicsv` Package + +The `obicsv` package enables efficient, configurable export of biological sequence data (e.g., FASTA/FASTQ) to CSV format. 
It supports selective column inclusion, parallel batch processing, compression, and seamless CLI integration—ideal for high-throughput NGS pipelines. + +## Core Capabilities + +| **Domain** | **Features** | +|-----------|--------------| +| **Column Selection & Formatting** | Toggle output fields (`CSVId`, `CSVSequence`, `CSVTaxon`, etc.); define custom attributes via `CSVKey`/`CSVKeys`; set separator (`CSVSeparator`) and NA placeholder (`CSVNAValue`). | +| **I/O & File Handling** | Write to stdout or file (append/truncate); support gzip compression (`OptionsCompressed`); configure batch size and full-file batching. | +| **Processing Strategy** | Parallel workers (default: `obidefault.ParallelWorkers()`); unordered iteration (`NoOrder`); progress tracking; skip empty sequences. | +| **Metadata Enrichment** | Auto-detect columns (`CSVAutoColumn`); integrate `obipairing`, taxonomic data, and abundance counts; support Phred+shifted quality scores. | +| **CLI Integration** | Command-line flags (`--ids`, `--sequence`, `--taxon`, etc.); extendable via helper functions (`CLIPrintId()`, `CLIHasToBeKeptAttributes()`). | + +## Public API Summary + +- **`MakeOptions([]WithOption)`** + Builder-style configuration of export behavior. Supported options: `CSVId`, `CSVTaxon`, `OptionsFileName`, `OptionAppendFile`, etc. + +- **`NewCSVSequenceIterator(IBioSequence, ...WithOption)`** + Wraps a sequence iterator into an async CSV record stream. Launches parallel workers, handles batching, and auto-detects attributes when enabled. + +- **`CSVSequenceHeader(Options)`** + Generates a CSV header row based on enabled columns and custom keys. + +- **`CSVBatchFromSequences(BioSequenceBatch, Options)`** + Converts a batch of sequences into `CSVRecord` entries per configured options. + +- **`WriteCSV(ICSVRecord, io.WriteCloser)`** + Writes CSV data to any writer with compression and parallelization support. 
+ +- **`WriteCSVToStdout()`, `WriteCSVToFile()`** + Convenience wrappers for common I/O targets. + +- **`FormatCVSBatch(CSVRecordBatch, string)`** + Renders a batch of records as an in-memory CSV buffer (header prepended only for first chunk). + +## Design Principles + +- **Streaming & Laziness**: Uses iterator patterns to avoid full data loading. +- **Parallelism**: Producer-consumer model with configurable concurrency (min 2 workers). +- **Resilience**: Graceful handling of missing fields via configurable NA values. +- **Extensibility**: Supports dynamic attributes (e.g., `obipairing` expands to 8 fields). + +## Usage Example +```go +opt := MakeOptions([]WithOption{ + OptionFileName("results.csv"), + CSVId(true), + CSVTaxon(false), + OptionsAppendFile(true), +}) +iter := NewCSVSequenceIterator(sourceIter, opt) +WriteCSV(iter, os.Stdout) // or file +``` diff --git a/autodoc/docmd/pkg_obitools_obidemerge.md b/autodoc/docmd/pkg_obitools_obidemerge.md new file mode 100644 index 0000000..483e231 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obidemerge.md @@ -0,0 +1,53 @@ +# `obidemerge` Package Documentation + +The **`obidemerge`** package enables *demerging* of biological sequences—i.e., splitting aggregated or merged sequence records into discrete, count-annotated variants based on metadata statistics. It supports both programmatic and CLI workflows for downstream processing in metabarcoding or amplicon-based pipelines. + +## Core Functionalities + +### 1. `MakeDemergeWorker(key string) SeqWorker` +- **Purpose**: Constructs a sequence processor that splits sequences by statistical metadata. +- **Behavior**: + - Scans the input sequence for a statistics map under attribute `key`. + - *Example*: If `"sample"` → `{ "S1": 5, "S2": 3 }`, two new sequences are generated. + - For each `(stat_key, count)` pair: + - Copies the original sequence data, + - Adds a new attribute: `key = stat_key`, + - Sets `.Count` to the corresponding integer value. 
+ - Removes original statistics from the input sequence after splitting. +- **Fallback**: If no stats are found for `key`, returns a single-element slice containing the unchanged sequence. + +### 2. `CLIDemergeSequences(iterator, slot string) SeqIterator` +- **Purpose**: CLI wrapper for batch demerging. +- **Behavior**: + - Applies `MakeDemergeWorker(slot)` to each sequence in the input iterator. + - Supports parallel processing (implementation-dependent). +- **Integration**: + - Designed to be used with the `--demerge` CLI flag (see below). + +### 3. CLI Integration via OptionSet +- **Flag**: `--demerge` (`-d`) + - Specifies the metadata slot to demerge (default: `"sample"`). +- **APIs**: + - `DemergeOptionSet(options *getoptions.Options)`: Registers the `-d/--demerge` flag. + - `CLIDemergeSlot() string`: Returns the selected slot name (e.g., `"sample"`), used by downstream workers. +- **Inheritance**: + - Extends `obiconvert.OptionSet`, inheriting standard conversion options (I/O formats, filters, etc.). + +## Semantic Workflow + +1. **Input**: Sequences with embedded statistical metadata (e.g., sample abundances, OTU counts). +2. **Demerge Operation**: Splits each sequence into multiple copies—each tagged with a unique metadata key and abundance. +3. **Output**: A new set of sequences where each variant is independently annotated, enabling: + - Accurate abundance-aware filtering, + - Per-variant downstream analysis (e.g., taxonomic assignment, diversity metrics). + +## Key Concept: *Demerging* +- **Definition**: Reversal of prior merging steps (e.g., OTU clustering, read pairing). +- **Purpose**: Restores granularity for statistical or ecological interpretation while preserving original sequence data. + +## Use Cases +- Post-clustering demerging of OTU/ASV tables. +- Splitting merged paired-end reads by sample or condition metadata. +- Preparing data for tools expecting discrete, count-labeled sequences. + +> **Note**: Only *public* APIs are documented. 
Internal helpers (e.g., slot validation, worker state) remain unspecified. diff --git a/autodoc/docmd/pkg_obitools_obidistribute.md b/autodoc/docmd/pkg_obitools_obidistribute.md new file mode 100644 index 0000000..d8ebc55 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obidistribute.md @@ -0,0 +1,58 @@ +# `obidistribute` Package: Semantic Description of Public Functionalities + +The `obidistribute` package enables flexible, scalable distribution of biological sequence data into multiple output files or directories. It supports annotation-based separation (e.g., sample IDs), batch splitting, and hash-sharded distribution — all while integrating with standard NGS formats (FASTA/FASTQ) and the broader `obitools4` ecosystem. + +## Core Functionalities + +### 1. **Sequence Distribution Strategy Selection** +- `CLIOutputFormat()` + Specifies output format: `"fastq"`, `"fasta"`, or generic sequence (e.g., FASTA-like). Controls how sequences are serialized to disk. + +- `CLISequenceClassifier()` + Selects the annotation key used for classification (e.g., `"sample_id"`, `"taxon"`). Sequences are grouped by the value of this annotation field. + +- `CLIDistributeByBatches()` / `-n` + Enables round-robin assignment of sequences into *N* fixed batches, regardless of metadata. + +- `CLIDistributeByHash()` / `-H` + Distributes sequences deterministically into *N* batches using a hash of the sequence ID or annotation — ensures reproducible sharding. + +- `CLIDirectoryMode()` / `-d` + When used with a classifier, organizes output files into subdirectories named after classification values. + +### 2. **Output Naming & File Management** +- `CLIFileNamePattern()` / `-p` + Defines a printf-style filename template (e.g., `"sample_%s.fastq"`), where `%s` is replaced by the classifier value or batch index. + +- `CLIAppendSequences()` / `-A` + Enables appending to existing files instead of overwriting them. 
+ +- `CLINAValue()` / `--na-value` + Sets the fallback label (default: `"NA"`) for sequences missing a classifier annotation. + +### 3. **Processing Configuration** +- `CLIDistributeSequence()` + Main entry point: orchestrates input iteration, classification, batching, and parallelized writing. Accepts an `obiiter.IBioSequence` iterator. + +- Parallel workers are derived from `obidefault.ParallelWorkers()` (minimum 2), divided by four. + +- Batch size and compression settings are inherited from `obidefault`. + +### 4. **Header & Format Handling** +- `CLIOutputFastHeaderFormat()` + Configures header serialization format: `"json"` (default) or `"obi"`. Controls metadata inclusion in output headers. + +- Built on top of `obiconvert.CLIOutputFastHeaderFormat()` and integrates with its header parsing/writing logic. + +### 5. **Validation & CLI Integration** +- Uses `getoptions` for option parsing; enforces mutual exclusivity of distribution modes. +- Validates filename pattern syntax and required arguments at startup. + +## Semantic Workflow + +1. User selects a distribution mode (`classifier`, `batches`, or `hash`) and optional directory nesting. +2. Sequences are read via an iterator; each is classified or assigned to a batch/shard. +3. Sequences are buffered in batches, compressed if configured, and written to output files using the selected format. +4. Filenames are generated dynamically per classification value or batch index, respecting append mode and NA fallbacks. + +This module is essential for demultiplexing, batch processing, and scalable data management in high-throughput sequencing pipelines — especially metabarcoding workflows. 
diff --git a/autodoc/docmd/pkg_obitools_obigrep.md b/autodoc/docmd/pkg_obitools_obigrep.md new file mode 100644 index 0000000..c2ccae1 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obigrep.md @@ -0,0 +1,51 @@ +# `obigrep`: Command-Line Sequence Filtering for OBITools4 + +`obigrep` delivers a robust, CLI-driven filtering engine for biological sequences (FASTA/FASTQ), enabling precise selection or exclusion of reads using diverse criteria—length, abundance, taxonomy, patterns (exact/fuzzy), metadata attributes—and paired-end logic. + +## Core Filtering Capabilities + +### Length & Abundance +- `--min-length`, `--max-length`: Filter by sequence length. +- `--min-count`, `--max-count`: Filter based on read abundance (count attribute). + +### Pattern Matching +- Exact regex via `--sequence`/`-s`, `--definition`/`-D`, or `--identifier`/`-I`. + - Case-insensitive by default. +- Approximate matching via `--pattern`, with options: + - `--pattern-error`: Max edit distance. + - `--allows-indels`: Allow insertions/deletions (default: mismatches only). + - `--only-forward`: Restrict to forward strand. + +### Taxonomic Filtering +- `--restrict-to-taxon`/`-r`: Keep only sequences matching given taxon(s). +- `--ignore-taxon`/`-i`: Exclude specific taxa. +- `--valid-taxid`: Enforce presence of valid NCBI taxids in records. +- `--require-rank`: Require specific taxonomic rank (e.g., *species*, *genus*). + +### Attribute & Metadata Filtering +- `--has-attribute`/`-A`: Retain sequences with a given attribute key. +- `--attribute=key=pattern`/`-a`: Match regex against a specific attribute value. +- `--id-list FILE`: Select sequences whose identifiers appear in the file. 
+ +### Custom Logic +- `--predicate`/`-p`: Evaluate arbitrary boolean expressions (e.g., `"attr['quality'] > 30 && len(sequence) < 500"`). + +### Paired-End Handling +- `--paired-mode`: Define how filters apply to read pairs: + - `"forward"`: Only forward read considered. + - `"and"`, `"or"`, `"xor"`, etc.: Logical combinations of forward/reverse filters. + +### Output Control +- `--save-discarded FILE`: Write rejected sequences to file. +- `--inverse-match`/`-v`: Globally invert selection (i.e., output *only* discarded reads). + +## Implementation Notes + +- Filters are composed into a single predicate using `CLISequenceSelectionPredicate()`. +- Paired-end logic is layered via `PairedPredicat()` when input files are paired (`CLIHasPairedFile()`). +- Filtering is executed via `iterator.FilterOn(...)` (in-place) or `DivideOn(...)` + async write to discarded file. +- Uses structured logging (`logrus`) and graceful error handling for robust CLI operation. + +## Semantic Role + +`obigrep` acts as the **semantic filter layer** in OBITools4 workflows—translating user CLI flags into type-safe, composable predicates that operate uniformly over `IBioSequence` iterators. It bridges high-level biological intent (e.g., “keep only *Bacillales* with ≥Q30 and no Ns”) to low-level filtering primitives. diff --git a/autodoc/docmd/pkg_obitools_obijoin.md b/autodoc/docmd/pkg_obitools_obijoin.md new file mode 100644 index 0000000..592de7c --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obijoin.md @@ -0,0 +1,62 @@ +# Semantic Description of `obijoin` Package + +The `obijoin` package enables efficient, declarative sequence joins in biological data pipelines. Built on OBITools4’s streaming architecture, it supports left-outer joins between sequence datasets using user-defined attribute keys — ideal for merging paired-end reads, annotating amplicons with metadata, or enriching references. 
+ +## Core Components & Functionalities + +### `IndexedSequenceSlice` +A composite structure combining a biological sequence slice (`BioSequenceSlice`) with precomputed indices. Each index maps attribute values (e.g., `"sample=S1"`, `"barcode=ATGC"`) to sets of matching sequence indices. Enables sublinear-time filtering via key-based intersection. + +### `Get(keys...)` +Performs multi-key *intersection* queries across indexes: returns sequences satisfying **all** provided attribute constraints (e.g., `Get("sample=S1", "barcode=ATGC")`). Keys must match *exactly*; supports arbitrary string attributes via `GetStringAttribute()`. + +### `BuildIndexedSequenceSlice()` +Constructs the index structure in **O(*n*)** time by scanning sequences once and grouping them per attribute. Accepts a `BioSequenceSlice` and returns an `IndexedSequenceSlice`. Handles any annotation attribute supported by the sequence system. + +### `MakeJoinWorker()` +Returns a functional `SeqWorker` implementing join logic: +- For each input sequence, extracts join keys (e.g., `sample`, `barcode`) from annotations. +- Uses the index to find matching partner sequences (`join_with`). +- Outputs one sequence per match, copying original data and enriching it with partner annotations. +- Optionally updates ID/sequence/quality fields *only if* corresponding flags (`--update-id`, etc.) are enabled. + +### `CLIJoinSequences()` +Top-level CLI entry point: +- Reads primary input (stdin or file). +- Loads secondary dataset (`--join-with`), builds index via `BuildIndexedSequenceSlice()`. +- Applies join using worker from `MakeJoinWorker()` with flags (`--by`, `-i/-s/-q`). +- Integrates seamlessly into OBITools4’s streaming iterator model. 
+ +## Join Semantics + +| Feature | Behavior | +|--------|----------| +| **Join type** | Left outer join (primary dataset fully preserved) | +| **Key matching** | Exact string equality; no regex/fuzzy logic implied | +| **Updates** | Controlled by flags: `-i/--update-id`, `-s/--update-sequence`, `-q/--update-quality` | +| **Metadata handling** | Partner annotations are appended unless fields are overwritten | + +## CLI Options + +- `-j/--join-with` *(required)*: Path to secondary sequence file (FASTA/FASTQ/TAB). +- `-b/--by`: Join key mapping, e.g. `"id=id"` or `"sample=well"`. Defaults to `["id"]`. +- `-i/--update-id`: Replace sequence identifiers with partner values. +- `-s/--update-sequence`: Overwrite nucleotide/amino acid sequences from partners. +- `-q/--update-quality`: Replace quality scores (FASTQ only). + +## Usage Example + +```bash +obijoin input.fastq \ + --join-with annotations.tsv \ + --by "id=name" \ + -i -s +``` +→ Joins `input.fastq` with TSV annotations, matching on `id == name`; updates IDs and sequences. + +## Design Principles + +- **Efficiency**: Indexing avoids repeated full scans; uses optimized `obiutils.Set[int]` for fast intersection. + -- **Extensibility**: Works with any annotation attribute supported by `BioSequence`. +- **Modularity**: CLI logic is configuration-only — no I/O or core algorithms embedded. +- **Composability**: Extends `obiconvert.OptionSet()`; inherits standard format options (`-f`, `-o`) and follows OBITools4 CLI conventions. 
diff --git a/autodoc/docmd/pkg_obitools_obik.md b/autodoc/docmd/pkg_obitools_obik.md new file mode 100644 index 0000000..14b9de3 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obik.md @@ -0,0 +1,113 @@ +# `obik`: K-mer Index Management Toolkit for Biological Sequences + +`obik` is a CLI tool from the OBITools4 ecosystem designed for building, inspecting, filtering, and manipulating **k-mer indices**—compact data structures encoding k-mer occurrences from biological sequences (e.g., FASTA/FASTQ). It enables scalable, parallelized processing of large-scale sequencing data for applications such as taxonomic profiling, contamination screening, and metagenomic analysis. + +All documented features are **public APIs**, accessible via subcommands. Internal implementation details (e.g., low-level k-mer engines) are omitted. + +--- + +## Core Subcommands + +### `obik index` +Builds or extends a k-mer set group from raw sequences: +- Configurable `k` (2–31) and optional minimizer size (`m`) for space-efficient hashing. +- Filters by k-mer frequency: `--minocc`, `--maxocc`. +- Entropy-based low-complexity filtering (`--entropy-threshold`, `--entropy-size`). +- Supports metadata tagging at group, set, and per-set levels (`--set-tag`, `--index-id`). +- Optionally saves top *N* frequent k-mers (`--save-freq-kmers`) for downstream analysis. +- Parallel sequence processing with atomic counters and thread-safe batching. + +### `obik ls` +Lists metadata of k-mer sets in an index: +- Accepts glob-like `--set PATTERN`s to filter target sets. +- Outputs structured metadata: set index, ID, and k-mer count (`count`). +- Supports multiple formats: CSV (default), JSON, YAML. +- No k-mers themselves are printed—only set-level summaries. + +### `obik summary` +Aggregates and reports comprehensive statistics: +- Structural info: k, m, partitions, total sets/unique kmers. +- Per-set stats: ID, count, disk usage (computed recursively). 
+- Optional pairwise **Jaccard distance matrix** for similarity analysis. +- Multi-format export (JSON/YAML/CSV) with full metadata preservation. + +### `obik cp` +Copies selected or all k-mer sets from a source index to a new destination: +- Requires `` and ``. +- Pattern-based selection via `--set PATTERN` (glob-style); fails if no match. +- Prevents overwrites unless `--force`. +- Uses atomic copy operations via `CopySetsByIDTo`, preserving original structure. + +### `obik mv` +Safely moves sets between indices: +- Copy-first, then delete strategy ensures atomicity. +- Supports `--set PATTERN` for selective moves; fails if no sets match patterns. +- Removes source sets in reverse order to avoid index renumbering issues. +- Logs progress and final counts for observability. + +### `obik rm` +Removes k-mer sets from an index: +- Requires at least one glob-like `--set PATTERN`. +- Validates existence and match success before deletion. +- Deletes sets in reverse order to preserve indices during bulk removals. +- Fails fast on errors, leaving index consistent. + +### `obik spectrum` +Exports k-mer frequency spectra per set: +- Computes histogram: how many distinct kmers occur *exactly N times*. +- Outputs sparse CSV (only non-zero frequencies), with per-set columns. +- Enables comparative analysis of redundancy/complexity across samples. + +### `obik filter` +Filters k-mers from an index using configurable criteria: +- Currently supports entropy-based filtering (`--entropy-threshold`, `--entropy-size`). +- Runs in parallel across partitions (per-worker filter instantiation for stateful filters). +- Preserves partitioning structure and `spectrum.bin` files. +- Logs per-set statistics (kept %, total processed). + +### `obik match` +Annotates query sequences with reference matches: +- Loads a k-mer index and selects target sets via patterns. +- Reads sequences (FASTA/FASTQ), prepares queries in parallel, and merges batches incrementally. 
+- Matches k-mers against reference sets using `MatchBatch`, attaches match positions as attributes (e.g., `"kmer_matched_ref_genome"`). +- Streams annotated output with paired-end integrity preserved. + +### `obik lowmask` +Masks or extracts low-complexity regions in sequences: +- Uses multi-scale entropy analysis (window sizes 1–`level_max`) on canonical k-mers. +- Three modes: **mask** (replace with `.` or custom char), **split**, and **extract low-complexity fragments**. +- Preserves metadata (e.g., entropy values) on output sequences. + +### `obik super` +Extracts *super k-mers* from overlapping reads: +- Merges contiguously overlapped kmers sharing a minimizer into longer, non-overlapping super-k-mers. +- Configurable `k` and `m`; parallelized via worker pipeline. +- Optimized for alignment-free analysis, read correction, and compression. + +--- + +## Shared Capabilities + +### Set Selection +- Glob-style pattern matching (`--set PATTERN`, repeatable). +- Resolves to exact set IDs using `MatchSetIDs`. + +### Output Formatting +- Structured output: CSV, JSON (`--json-output`), YAML (`--yaml-output`) across multiple commands. + +### Metadata Handling +- Group-, set-, and per-kmer metadata support (`--set-tag`, `metadata.toml`). +- Preserved during copy/move/filter operations. + +### Safety & Observability +- Structured logging (Logrus), progress bars (`progressbar`). +- Context-aware cancellation and timeout support. +- Detailed error wrapping with `%w`. + +### Parallelism +- Multi-worker pipelines (e.g., `nworkers` from system defaults). +- Thread-safe accumulation and atomic counters where needed. + +--- + +> **Note**: All commands assume a valid `KmerSetGroup` index structure (`.kdi`, `.toml`). No k-mer sequences themselves are printed—only metadata, counts, or match annotations. 
diff --git a/autodoc/docmd/pkg_obitools_obikmersim.md b/autodoc/docmd/pkg_obitools_obikmersim.md new file mode 100644 index 0000000..70ed362 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obikmersim.md @@ -0,0 +1,107 @@ +# `obikmersim`: K-mer–Based Sequence Similarity Analysis Package + +`obikmersim` is a high-performance OBITools4 package for **k-mer–driven sequence comparison and alignment**, tailored for biological read analysis (e.g., amplicons, metagenomes). It enables rapid matching of query sequences against reference databases using efficient k-mer indexing, followed by localized alignment with quality-aware consensus refinement. Designed for scalability and flexibility, it supports sparse k-mer representations, orientation detection (forward/reverse-complement), and configurable filtering thresholds. + +--- + +## Public API Overview + +### 1. **K-mer Indexing & Matching Workers** +#### `MakeCountMatchWorker(reference_sequences, k=21, min_count=2, sparse=False)` +- **Purpose**: Build a `KmerMap` from reference sequences and match queries via shared k-mers. +- **Functionality**: + - Indexes all *k*-mers (with optional sparsity mask) from reference sequences. + - For each query, retrieves candidate references sharing ≥ `min_count` k-mers. + - Returns annotated results: query ID, matched references, match count, *k*, and sparsity flag. +- **Use Case**: Fast pre-screening for taxonomic assignment or read clustering. + +#### `MakeKmerAlignWorker(count_match_worker, delta=50, penalty_scale=1.0, gap_factor=-2)` +- **Purpose**: Perform *k*-mer–seeded local alignment with quality-aware consensus. +- **Functionality**: + - Uses shared k-mers from `count_match_worker` to seed alignment candidates. + - Runs local pairwise alignments (via internal aligner) and builds quality-weighted consensus (`ReadAlign`, `BuildQualityConsensus`). 
+ - Computes: + - `% identity` + - Residual similarity (k-mer–aware alignment score) + - Alignment length & orientation (`+`/`−`) + - Filters output by `min_identity=80%`, optional min alignment length. +- **Use Case**: Precise read assignment, error correction via consensus. + +--- + +### 2. **CLI Configuration Options** +#### `KmerSimCountOptionSet` +- Defines CLI arguments for k-mer counting/matching: + - `--kmer-size` (int, default=21) + - `--sparse` (bool): Enable sparse k-mer masking + - `--reference `: Reference FASTA/FASTQ path(s) + - `--min-count` (int, default=2): Minimum shared k-mer count + - `--self`: Perform self-comparison (query = reference) + +#### `KmerSimMatchOptionSet` +- Extends counting options with alignment scoring parameters: + - `--delta` (int, default=50): Max k-mer separation for seeding + - `--penalty-scale` (float, default=1.0): Mismatch/gap scaling factor + - `--gap-factor` (float, default=−2): Gap penalty coefficient + - `--fast-absolute`: Use fast absolute scoring (no dynamic programming) + +#### Composite Sets +- `CountOptionSet` / `MatchOptionSet`: Combine k-mer options with generic I/O conversion settings (e.g., via `obiconvert`). + +--- + +### 3. **CLI Helpers & Accessors** +#### `CLIKmerSize(args)` +- Returns parsed k-mer size from CLI args. + +#### `CLIReference(args, format="fasta")` +- Loads reference sequences into memory (supports batched/parallel reading). + +#### `CLISelf(args)` +- Returns boolean flag for self-comparison mode. + +--- + +### 4. **Core CLI Wrappers** +#### `CLILookForSharedKmers(args)` +- Orchestrates k-mer counting/matching pipeline: + - Builds `count_match_worker` + - Iterates over query sequences (from stdin or file) + - Outputs match annotations in structured format. + +#### `CLIAlignSequences(args)` +- Runs full alignment pipeline: + - Uses `count_match_worker` to seed candidates + - Invokes `kmer_align_worker` + - Outputs aligned pairs with identity, orientation, and quality metrics. 
+ +--- + +## Key Technical Features +- **Sparse K-mers**: Mask positions (e.g., Ns or degenerate bases) via bitmasks. +- **Orientation Handling**: Auto-detect reverse-complement matches during seeding/alignment. +- **Fast Heuristic Scoring**: Preliminary alignment score estimation before full path resolution (reduces compute). +- **Quality-Aware Consensus**: Integrates base quality scores during alignment refinement. +- **Configurable Filtering**: Thresholds on identity, length, and k-mer support. + +--- + +## Typical Workflows +| Workflow | Tools Used | +|---------|------------| +| Taxonomic screening of amplicons | `CLILookForSharedKmers` + sparse mode | +| Read error correction via reference consensus | `CLIAlignSequences` with quality-aware alignment | +| *In silico* PCR specificity check | `CLISelf()` + min-count filtering | +| Large-scale metagenomic read assignment | Batched parallel execution with `CLIReference` | + +--- + +## Output Format +Results are returned as structured records (e.g., dictionaries or dataclasses) with fields: +- `query_id`, `reference_ids` +- `match_count`, `kmer_size`, `sparse_mode` +- For alignments: + `%identity`, `alignment_length`, `orientation` (`+1`/`−1`) + `residual_similarity`, `consensus_quality` + +All public functions are documented with type hints and include unit tests. diff --git a/autodoc/docmd/pkg_obitools_obilandmark.md b/autodoc/docmd/pkg_obitools_obilandmark.md new file mode 100644 index 0000000..2102d09 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obilandmark.md @@ -0,0 +1,49 @@ +# `obilandmark` Package: Semantic Documentation + +The `obilandmark` package implements a **reference-free, landmark-based embedding and indexing pipeline** for biological sequences within the OBITools4 ecosystem. 
It enables scalable, low-dimensional representation of sequence libraries by projecting them into a distance space defined by curated landmark sequences—ideal for clustering, classification, and fast similarity search in metabarcoding or metagenomics workflows. + +## Public Functionalities + +### `MapOnLandmarkSequences(library, landmarks)` +Projects each sequence in a biological library onto Euclidean coordinates using pre-selected landmark sequences. +- **Input**: A sequence `library` (e.g., FASTA/FASTQ iterator) and a list of landmark sequences. +- **Algorithm**: Computes pairwise alignment scores between each sequence and all landmarks using `FastLCSScore`, converting them into distance-based coordinates. +- **Output**: A matrix of shape `(n_sequences, n_landmarks)` where each row is a point in landmark space (`seqworld`). +- **Features**: Parallel execution (configurable workers), progress bar, and buffered streaming for large datasets. + +### `CLISelectLandmarkSequences(options)` +Main orchestration function that performs landmark selection, embedding, and annotation in a single CLI-driven pipeline. +- **Landmark Selection**: Iteratively selects `n` landmarks (default: 200) via k-means clustering on initial random samples, minimizing cluster inertia over two refinement rounds. +- **Embedding**: Calls `MapOnLandmarkSequences()` to compute coordinates for all sequences in the library. +- **Annotation**: Augments each sequence record with: + - `landmark_coord`: full coordinate vector (distances to all landmarks), + - optional `landmark_id` for sequences selected as landmark representatives. +- **Taxonomic Indexing**: If taxonomy is provided, builds a `GeomIndexSequence` per sequence—enabling efficient taxonomic search via geometric proximity. + +### `LandmarkOptionSet(options)` +Registers CLI options specific to landmark configuration. +- Adds the `-n` / `--center` flag (type: integer), defaulting to **200**, controlling the number of landmarks selected. 
+ +### `OptionSet(options)` +Aggregates option sets required by the pipeline: +- Input/output handling (`obiconvert.InputOptionSet`, `.OutputOptionSet`) +- Taxonomy loading support (optional, via `obioptions.LoadTaxonomyOptionSet`) +- Landmark-specific options (`LandmarkOptionSet`) + +### `CLINCenter()` +Returns the integer value of `-n / --center`, i.e., the number of landmarks to select (default: 200). + +## Design Principles + +- **Scalability**: Uses buffered I/O and parallel workers to process large sequence libraries efficiently. +- **Modularity**: Integrates with core OBITools4 modules (`obialign`, `obistats`, `obiutils`, `obitax`, `obirefidx`). +- **CLI-first**: Designed for batch processing pipelines; defaults ensure sensible behavior out-of-the-box. +- **Extensibility**: Annotation schema supports future enhancements (e.g., `landmark_class` via commented stubs). + +## Use Cases + +- Reference-free sequence clustering and dimensionality reduction +- Fast similarity search via geometric indexing in taxonomic space +- Preprocessing for machine learning on sequence libraries (e.g., classification, anomaly detection) + +> **Note**: Only public interfaces are documented. Internal helpers (e.g., clustering utilities, alignment wrappers) remain implementation details. diff --git a/autodoc/docmd/pkg_obitools_obimatrix.md b/autodoc/docmd/pkg_obitools_obimatrix.md new file mode 100644 index 0000000..cfc9ee7 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obimatrix.md @@ -0,0 +1,40 @@ +# `obimatrix` Package: Semantic Overview + +The `obimatrix` package enables high-performance construction, manipulation, and export of biological sequence count matrices (e.g., OTU/ASV tables) in the OBITools4 ecosystem. Built around a sparse matrix representation, it supports flexible attribute handling, parallelized input processing, and multiple output formats—ideal for downstream ecological or bioinformatic analysis. 
+ +## Core Functionalities + +### Matrix Construction & Management +- **`MakeMatrixData()` / `NewMatrixData(naVal string, fixedCols []string)`**: + Initializes a new `MatrixData` instance with configurable NA placeholder and fixed column headers (e.g., `"id"`, `"count"`). +- **`Update(seq obiseq.BioSequence, mapKey string)`**: + Populates the matrix using a biological sequence’s annotations. Extracts per-taxon counts or arbitrary map attributes (e.g., sample IDs), inserting them into the sparse matrix under `row = seq.ID`, with dynamic column detection. +- **`TransposeMatrixData(md *MatrixData)`**: + Flips rows/columns: original columns become new `"id"` attributes; preserves metadata and NA handling. + +### Merging & Parallelization +- **`MergeMatrixData(a, b *MatrixData)`**: + Combines two matrices row-wise; panics on duplicate sequence IDs to prevent silent overwrites. +- **`IMatrix(iter obiseq.Iterator, mapKey string)`**: + Builds a full matrix in parallel from an iterator of sequences. Auto-detects extra sample columns if enabled (via `--auto-cols`), supporting dynamic batch processing. + +### Export & CLI Integration +- **`CLIWriteCSVToStdout(md *MatrixData)`**: + Outputs a wide-format CSV: rows = sequences, columns = fixed attributes + detected samples. Handles Phred encoding (ASCII 33/64) for quality strings and supports transpose via `--transpose`. +- **`CLIWriteThreeColumnsToStdout(md *MatrixData)`**: + Outputs a long-format CSV with columns: `sample`, sequence ID, and value—suited for tools expecting tidy data. 
+- **CLI Option Aggregation**: + Integrates with `getoptions` to expose flags like: + - `-m, --map-attribute`: grouping key (default: `"merged_sample"`) + - `--value-name`, `--sample-name`: column headers (defaults: `"count"`, `"sample"`) + - `-t, --transpose`: toggle row/column orientation + - `--allow-empty`, `--strict-attributes`: control handling of missing annotations + +### Robustness & Flexibility +- **NA Handling**: Replaces absent mapping attributes with a configurable placeholder (default: `"0"`). +- **Strict Mode**: Panics on type mismatches or uncastable values (e.g., non-numeric counts in numeric context). +- **Attribute Extensibility**: Supports arbitrary metadata (taxonomic labels, quality strings) via dynamic column inference. + +## Design Philosophy + +Focused on **speed**, **type safety**, and **reproducibility** for amplicon sequencing workflows. The package avoids implicit defaults beyond core conventions, favoring explicit CLI configuration and clear error signaling for data integrity. diff --git a/autodoc/docmd/pkg_obitools_obimicrosat.md b/autodoc/docmd/pkg_obitools_obimicrosat.md new file mode 100644 index 0000000..7e6c08d --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obimicrosat.md @@ -0,0 +1,54 @@ +# `obimicrosat`: Microsatellite Detection Module for OBITools4 + +This Go package provides a modular, CLI-integrated framework to detect and annotate simple sequence repeats (SSRs), also known as microsatellites, in biological DNA sequences. It is designed for integration into sequence processing pipelines—especially those focused on marker discovery, PCR primer design, or genomic feature annotation. + +## Core Capabilities + +### 1. 
**Flexible Microsatellite Detection** +- Detects tandem repeats of DNA motifs (units) with user-defined constraints: + - Unit length range (`minUnitLength` to `maxUnitLength`, typically 1–6 bp) + - Minimum repeat count (`minUnits`) + - Total microsatellite length threshold (`minLength`) +- Uses robust regex-based scanning via `regexp2`, followed by precise boundary refinement. + +### 2. **Canonical Unit Normalization** +- Determines the *lexicographically smallest* rotation of the detected unit. +- Optionally computes its reverse complement to define orientation (`direct` or `reverse`). +- If enabled, reorients the full microsatellite region to its canonical (smallest) form. + +### 3. **Flanking Sequence Validation** +- Ensures sufficient unique sequence on both sides of the repeat (`minflankLength`). +- Stores flanking regions as `microsat_left` and `microsat_right`. + +### 4. **Structured Annotation Output** +Each detected microsatellite enriches the input `BioSequence` with standardized attributes: +- `microsat_unit_length`, `microsat_unit_count` +- `seq_length` (full repeat region length), `microsat` (repeat sequence) +- Positions: `microsat_from`, `microsat_to` +- Canonical unit: `microsat_unit_normalized` +- Orientation flag (`direct`/`reverse`) and flanks + +### 5. **CLI Integration & Pipeline Compatibility** +- `MicroSatelliteOptionSet()` registers all detection parameters for CLI use (via `go-getoptions`). +- Supported flags: + - `-m, --min-unit-length`: min unit size (default: `1`) + - `-M, --max-unit-length`: max unit size (default: `6`) + - `--min-unit-count`: min repeat count (default: `5`) + - `-l, --min-length`: total SSR length threshold (default: `20`) + - `-f, --min-flank-length`: required flanking length (default: `0`) + - `-n, --not-reoriented`: disable sequence reorientation +- Helper functions (e.g., `CLIMinUnitCount()`, `CLIReoriented()`) expose runtime config. 
+ +- `MakeMicrosatWorker()` returns a reusable `SeqWorker` for parallel, iterator-based processing. +- `CLIAnnotateMicrosat()` integrates the worker into a conversion pipeline, filtering sequences without qualifying SSRs. + +### 6. **Dependencies & Ecosystem Integration** +- Built on `obitools4` core types (`BioSequence`, iterators, default annotation schema). +- Uses only one external dependency: `github.com/dlclark/regexp2` for advanced regex support. +- Fully compatible with existing `obiconvert.OptionSet`; extends it via `OptionSet()`. + +## Use Cases +- Identification of polymorphic SSR markers for population genetics. +- Preprocessing step in PCR primer design tools (to avoid repeat-rich regions). +- Quality control: flagging low-complexity sequences in NGS data. + +> **Note**: Only *public* APIs are documented. Internal helpers (e.g., `min_unit`, rotation logic) remain implementation details. diff --git a/autodoc/docmd/pkg_obitools_obimultiplex.md b/autodoc/docmd/pkg_obitools_obimultiplex.md new file mode 100644 index 0000000..20b4a5b --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obimultiplex.md @@ -0,0 +1,54 @@ +# `obimultiplex`: Semantic Description + +The `obimultiplex` package enables **high-throughput demultiplexing of PCR amplicon sequencing data**, assigning reads to samples using molecular barcodes (tags) and primer sequences. It supports flexible matching, configurable error tolerance, parallel processing, and optional output of unassigned reads—making it suitable for scalable NGS preprocessing pipelines. + +## Core Functionalities + +### 1. **NGSFilter Configuration Parsing** +- Reads experiment definitions from a CSV file (`--tag-list` / `-s`) conforming to the `NGSFilter` schema. +- Each row defines: sample name, forward/reverse primer sequences, and one or more barcode (tag) sequences. +- Supports optional metadata columns for custom annotations. + +### 2. 
**Barcode & Primer Matching Engine** +- Uses `obingslibrary` to instantiate a multi-barcode extraction worker. +- Implements three matching modes: + - `strict`: exact sequence match only; + - `hamming`: allows mismatches up to a threshold (`--allowed-mismatches` / `-e`); + - `indel`: extends hamming to permit insertions/deletions (`--with-indels`). +- Default tolerance: ≤2 mismatches; configurable via CLI or programmatic options. + +### 3. **Read Assignment & Annotation** +- Assigns each input read to a sample based on successful tag + primer matching. +- Reads failing assignment are flagged with the `"obimultiplex_error"` attribute (unless retained). +- Optional error annotation preserves mismatch/indel details in output metadata. + +### 4. **Unidentified Read Handling** +- If `--unidentified` / `-u` is specified, unassigned reads are written to the given file. +- Uses `obiconvert.CLIWriteBioSequences` in a background goroutine for non-blocking I/O. + +### 5. **Parallel & Batched Processing** +- Leverages `obidefault` to configure worker threads and batch sizes. +- Applies `.MakeISliceWorker(...)` for concurrent barcode extraction across reads. + +### 6. **Template Generation** +- The `--template` option prints a minimal, commented CSV example to stdout for rapid setup. + +## CLI Interface Summary + +| Option | Alias | Description | +|--------|-------|-------------| +| `--tag-list` / `-s` | | Path to NGSFilter CSV config (required) | +| `--allowed-mismatches` / `-e` | | Max mismatches allowed (default: `2`) | +| `--with-indels` | | Allow indel errors in matching (default: false) | +| `--unidentified` / `-u` | | Output file for unassigned reads (optional) | +| `--keep-errors` / `--conserved-error` | | Retain error info in output (default: false) | +| `--template` | | Print sample CSV template to stdout | + +## Design Principles + +- **Composability**: Integrates with `obiconvert.OptionSet()` for modular pipeline building. 
+- **Extensibility**: Extra CSV columns are preserved as read annotations (key-value pairs). +- **Logging & Feedback**: Reports worker count, error handling mode, and output file usage via `logrus`. +- **Dependencies**: Built on top of `obitools4` (`obiformats`, `obingslibrary`) and standard Go CLI tooling. + +> **Note**: Only *public* APIs (e.g., `IExtractBarcode`, CLI options, CSV schema) are documented. Internal helpers and low-level workers remain opaque. diff --git a/autodoc/docmd/pkg_obitools_obipairing.md b/autodoc/docmd/pkg_obitools_obipairing.md new file mode 100644 index 0000000..77eee46 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obipairing.md @@ -0,0 +1,62 @@ +# `obipairing` Package — Functional Overview + +The `obipairing` package enables robust merging of paired-end next-generation sequencing (NGS) reads within the OBITools4 ecosystem. It bridges input parsing, alignment configuration, and consensus assembly—supporting both high-accuracy overlap-based merging and lightweight fallback concatenation when overlaps are unreliable. + +## Public API Summary + +### CLI Interface (`obipairing/cli.go`) +- **Input specification**: + `--forward-reads` (`-F`) and `--reverse-reads` (`-R`) flags accept FASTQ/FASTA file paths. +- **Alignment tuning**: + - `_Delta` (`--delta`, default `5`) — buffer for refining initial overlap detection. + - `_MinOverlap` (`--min-overlap`, default `20`) — minimum overlap length. + - `_MinIdentity` (`--min-identity`, default `90`) — minimum % identity for valid alignment. + - `_GapPenalty` (`--gap-penalty`, default `2`) — gap cost multiplier vs mismatches. + - `_PenaltyScale` (`--scale`, default `1`) — global scoring scaling factor. +- **Alignment mode control**: + - Fast heuristic enabled by default; `--exact-mode` disables it. + - Absolute scoring in fast mode via `--fast-absolute`. +- **Output customization**: + `--without-stat` omits alignment statistics from consensus headers. 
+- Extends generic I/O options inherited from `obiconvert` for pipeline compatibility. + +### Core Assembly Functions (`obipairing/assemble.go`) +- **`JoinPairedSequence(seqA, seqB *obiseq.BioSequence, inplace bool) (consensus *obiseq.BioSequence)`** + Concatenates forward and reverse reads with a `..........` (10-dot) separator. + - Quality scores for dots set to Phred `Q=0` if both inputs are quality-tracked. + - Supports in-place recycling (`inplace=true`) to reduce allocations. + +- **`AssemblePESequences(...)`** + Performs high-fidelity paired-end assembly: + - Uses `obialign.PEAlign` with a two-stage process: + 1. **Fast heuristic** (`FAST`) to locate candidate overlap region. + 2. **Dynamic programming refinement**, extended by `_Delta`. + - Validates alignment against thresholds (`minOverlap`, `minIdentity`). + Falls back to join if criteria unmet. + - Optionally annotates output with alignment metadata: + ```go + "mode" → "alignment" or "join" + "ali_length" → overlap length + "score_norm" → normalized alignment score + "identity" → % identity over overlap + "directionality"→ orientation (e.g., FR) + ``` + - Supports in-place reuse (`inplace`) and absolute/relative scoring via `fastModeRel`. + +- **`IAssemblePESequencesBatch(...)`** + Parallelizes assembly over batches of read pairs: + - Consumes iterators from `PairWith` (e.g., via `obiiter`). + - Launches configurable workers (`nworkers`) and channel buffer size. + - Internally reverse-complements the second read before alignment (`seqB.ReverseComplement()`). + - Yields assembled consensus sequences via an iterator. + +### Configuration & Parameter Access +- Getter functions (`CLI*`) expose parsed CLI parameters (e.g., `CLIMinOverlap()`, `CLIGapPenalty()`), enabling downstream alignment modules to reuse CLI-defined settings. 
+ +### Annotation Semantics +Each assembled sequence carries annotations describing the assembly mode and, when applicable: +- Alignment scores (`ali_score`, `score_norm`) +- Overlap metrics (`ali_length`, `identity`) +- Fast-mode metadata (e.g., `"pairing_fast_score"`) when heuristic alignment is used. + +Designed for scalability, low memory footprint, and integration with `obiseq`, `obiiter`, and alignment backends in OBITools4. diff --git a/autodoc/docmd/pkg_obitools_obipcr.md b/autodoc/docmd/pkg_obitools_obipcr.md new file mode 100644 index 0000000..6098f85 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obipcr.md @@ -0,0 +1,42 @@ +# `obipcr`: In-Silico PCR Simulation CLI Package + +The `obipcr` package provides a robust, configurable command-line interface for simulating *in silico* PCR amplifications on biological sequences. It enables flexible primer design, mismatch-tolerant binding, amplicon filtering by length and completeness, support for circular genomes, and optimized handling of large input datasets. + +## Core Features + +### Primer Definition & Matching +- **Forward/Reverse Primers**: Required inputs (`--forward`, `--reverse`) supporting degenerate nucleotide patterns (e.g., IUPAC ambiguity codes) via integration with `obitools4/pkg/obiapat`. +- **Mismatch Tolerance**: Configurable per-primer mismatch budget (`--allowed-mismatches`, `-e`) using pattern-based alignment via `MakeApatPattern`. + +### Amplicon Filtering & Constraints +- **Length Bounds**: Enforces minimum (`--min-length`, `-l`) and maximum (`--max-length`, `-L`) amplicon sizes (excluding primers). +- **Completeness Check**: Option (`--only-complete-flanking`) restricts output to amplicons where both primer-binding sites are fully contained in the input sequence. + +### Topology & Extension Handling +- **Circular DNA Support**: Activated via `--circular` (`-c`) to allow primers binding across sequence termini. 
+- **Flanking Extension**: Optional inclusion of upstream/downstream regions (`--delta`, `-D`) beyond primer sites for realistic amplicon modeling. + +### Scalability & Performance +- **Fragmentation Strategy**: Long sequences (> `max-length × 1000`) are split into overlapping segments (~`max-length × 1000 bp`) to accelerate PCR search (`--fragmented`). +- **Parallel Execution**: Leverages `obidefault.ParallelWorkers()` for concurrent processing. +- **Memory Control**: Limits memory usage to ≤50% of available RAM (`LimitMemory(0.5)`). + +## Public API + +### CLI Option Registration +- `PCROptionSet()`: Registers all PCR-specific flags with the underlying option parser. +- `OptionSet()`: Extends above by integrating standard conversion options (`obiconvert.OptionSet`). + +### Safe Value Accessors +- Getter functions (e.g., `CLIForwardPrimer()`, `CLIMinLength()`) provide typed, validated access to parsed options—including compiled nucleotide patterns and error-checked ranges. + +### Main Execution Entry Point +- `CLIPCR(seqIter)`: Performs *in silico* PCR over an input sequence iterator, returning amplified fragments as a new batched output iterator. Configured entirely via CLI options. + +## Design Principles + +- **Fail-Fast Validation**: All required parameters (e.g., primers) are validated at parse time; missing values trigger immediate fatal errors. +- **Pattern-Centric Matching**: Mismatch-tolerant binding is implemented via robust pattern-matching primitives (`obiapat`), not naive string comparison. +- **Modular Architecture**: Clear separation between CLI parsing, algorithm configuration (`PCRSliceWorker`), and execution orchestration ensures maintainability. + +This package is ideal for building scalable amplicon-based metagenomics pipelines with high precision and tunable sensitivity. 
diff --git a/autodoc/docmd/pkg_obitools_obirefidx.md b/autodoc/docmd/pkg_obitools_obirefidx.md new file mode 100644 index 0000000..9320a38 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obirefidx.md @@ -0,0 +1,120 @@ +# `obirefidx` Package: Semantic Overview + +The `obirefidx` package implements a **taxonomic reference indexing pipeline** for high-throughput sequencing data, optimized for family-level classification. It combines *k*-mer-based pre-filtering with alignment-aware similarity scoring to build compact, taxonomically annotated reference indexes—enabling fast and accurate read assignment in metabarcoding workflows. + +--- + +## Public Functionalities + +### 1. **Reference Database Indexing Pipeline** + +#### `IndexSequence(seqidx int, references []obiio.BioSeq, kmers obikmer.Table4mer, taxa map[string]TaxonID, taxo TaxonomySlice) (map[int]string)` +Computes a **taxonomic error-profile** for one query sequence against all references: +- Uses cached LCA lookups to group references by shared taxonomic ancestors. +- Filters candidate sets using 4-mer overlap counts (fast). +- Performs local alignment (`FastLCSScore` or `D1Or0`) to compute substitution+indel error counts. +- Builds a strictly increasing vector of minimal errors per taxonomic rank (e.g., genus, family). +- Outputs a map: `error_count → "Taxon@Rank"` (e.g., `{0: "Homo@genus", 3: "Primates@order"}`). + +> ✅ *Key insight*: Taxonomic resolution degrades predictably with alignment error. + +--- + +#### `IndexReferenceDB(iter obiio.SequenceIterator) (obiio.BatchedSequenceIterator)` +Processes an entire reference database into indexed batches: +- Validates sequences: skips those without valid taxonomic IDs. +- Precomputes 4-mer frequency tables for all sequences (via `obikmer.Table4mer`). +- Parallelizes indexing over chunks of 10 sequences using worker goroutines. +- Calls `IndexSequence` for each sequence and attaches the result (`obitag`) to a copy. 
+- Returns an iterator over batches, optionally displaying progress. + +> ✅ *Design note*: Memory reuse and batched I/O ensure scalability to large databases. + +--- + +### 2. **Clustering & Deduplication** + +#### `MakeStartClusterSliceWorker(chunkSize int, identityThreshold float64) (func([]obiio.BioSeq) []ClusterSlice)` +Performs **greedy hierarchical clustering** at family-level identity (hardcoded ≥90%): +- Uses LCSS alignment with error tolerance derived from `identityThreshold`. +- For each sequence, outputs: + - `clusterid`: ID of its cluster centroid (head). + - `clusterhead`: boolean flag indicating if it *is* the head. + - `clusteridentity`: alignment-based identity to the centroid. + +> ✅ *Use*: Reduces redundancy before indexing—only centroids are re-indexed for efficiency. + +--- + +### 3. **Taxonomy & Geography-Aware Indexing** + +#### `GeomIndexSesquence(seqidx int, references []obiio.BioSeq, taxa map[string]TaxonID, taxo TaxonomySlice) (map[int]string)` +Computes a **spatially-aware taxonomic index**: +- Retrieves geographic coordinates (lat/long) of the query sequence; fails if missing. +- Computes Euclidean squared distances to all others in parallel. +- Sorts neighbors by distance while preserving original indices (`obiutils.Order`). +- Iteratively updates the LCA between query and neighbors, recording: + - `distance → "Taxon@Rank"` map. +- Stops early upon reaching root taxonomy. + +> ✅ *Use case*: Models taxonomic uncertainty bands based on nearest neighbors’ location + taxonomy. + +--- + +### 4. **Worker Utilities & Taxonomy Annotation** + +#### `MakeSetFamilyTaxaWorker()`, `MakeSetGenusTaxaWorker()`, etc. +Helper workers to annotate sequences with family/genus/species taxonomy: +- Uses `Taxonomy.LCA()` and cached taxon IDs to assign ranks. +- Parallelized over sequence batches (10 seqs/worker). +- Ensures all indexed sequences carry full taxonomic context. + +--- + +### 5. 
**CLI Integration** + +#### `OptionSet(options *getoptions.GetOpt)` +Configures CLI options for the `obiuniq` tool: +- Delegates to `obiconvert.OptionSet(false)` (no verbose logging). +- Enables only options relevant for reference deduplication. +- Ensures consistent, minimal interface across OBITools4 tools. + +--- + +## Technical Highlights + +| Feature | Description | +|--------|-------------| +| **Parallelization** | Goroutines with `obidefault.ParallelWorkers()` for indexing, distance computation & clustering. | +| **Memory Efficiency** | Reused buffers (`matrix`), batched processing, and sequence deduplication reduce RAM footprint. | +| **Caching** | LCA lookups, 4-mer tables, and alignment matrices are cached to avoid recomputation. | +| **Logging & Validation** | Structured logging via `logrus`; panics on critical errors (e.g., missing taxonomy). | +| **Progress Tracking** | Optional progress bar via `progressbar/v3` during large DB processing. | + +--- + +## Output Format + +Indexed sequences carry a map: +```go +map[int]string // error_count → "Taxon@Rank" +``` +Example: +```json +{ + 0: "Homo@genus", + 2: "Hominoidea@superfamily", + 5: "Primates@order" +} +``` +Enables **rank-specific classification thresholds** (e.g., “assign to genus if ≤2 errors”). + +--- + +## Use Cases + +- **Metabarcoding classification**: Rapid assignment of reads to reference families. +- **Reference curation**: Cluster & deduplicate large databases before indexing. +- **Ecological inference**: Estimate taxonomic uncertainty from spatial proximity + taxonomy. + +> 📌 *Design principle*: Align with OBITools4’s philosophy—modular, parallelizable, and taxonomically aware. 
diff --git a/autodoc/docmd/pkg_obitools_obiscript.md b/autodoc/docmd/pkg_obitools_obiscript.md new file mode 100644 index 0000000..842e474 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiscript.md @@ -0,0 +1,50 @@ +# `obiscript` Package: CLI Scriptable Processing Pipeline + +The `obiscript` package provides a high-level, modular interface for embedding custom Lua scripts into OBITools4’s sequence processing pipelines. It enables users to define bioinformatics workflows using a lightweight, embeddable scripting language—without sacrificing performance or composability. + +## Public API Overview + +### `CLIScriptPipeline() Pipeable` +Returns a reusable, parallelized pipeline stage that executes user-provided Lua scripts on sequence data. Internally uses `obilua.LuaScriptPipe()` with parallelism enabled (default worker count from `obidefault.ParallelWorkers`). Accepts a script path via the pipeline configuration (typically set through `CLIScriptFilename()` in CLI usage). Designed to integrate seamlessly with other pipeable stages from the `obiiter` framework. + +### Script Lifecycle Hooks (Exposed via Lua API) +The embedded ObiLua runtime expects the user script to define three optional functions: + +- **`begin()`** + Called once before processing any sequences. Used for initialization (e.g., counters, file handles). Optional. + +- **`worker(sequence)`** + Invoked for each input sequence. Provides full access to metadata via `sequence:attribute(name, value?)`, supports in-place modification of tags/IDs, and allows interaction with global context (`obicontext`) for cross-sequence state management. + +- **`finish()`** + Called after all sequences have been processed. Typically used to output summary statistics or cleanup resources. + +### CLI Integration + +- **`--script FILE`, `-S FILE`** + Specifies the Lua script to execute. The file must exist and be syntactically valid. + +- **`--template`** + Outputs a minimal, self-contained Lua script template to stdout. 
Includes stubs for `begin()`, `worker(...)`, and `finish()` with inline documentation. + +- **Shared Options** + Reuses configuration sets from core OBITools4 modules: + - `obiconvert.DataIOOptionSet`: input/output format, file paths. + - `obigrep.SequenceSelectionOptionSet`: filtering/sorting logic. + +## Semantic Role + +`obiscript` abstracts the complexity of embedding and orchestrating Lua scripts in a streaming, parallelizable context. It decouples *workflow logic* (Lua) from *pipeline orchestration* (Go), enabling: +- Rapid prototyping of NGS processing steps. +- Custom annotation, filtering, assembly, or reporting without recompilation. +- Consistent CLI behavior across OBITools4 tools. + +## Use Cases + +| Scenario | Example | +|---------|---------| +| Read filtering + renaming | Filter low-quality reads and prepend sample ID to sequence names | +| Annotation injection | Add UMI or barcode info from external metadata file per read | +| Summary reporting | Count reads passing filters, write stats to log at end | + +> *Designed for extensibility: users extend functionality by writing Lua, not Go.* diff --git a/autodoc/docmd/pkg_obitools_obisplit.md b/autodoc/docmd/pkg_obitools_obisplit.md new file mode 100644 index 0000000..d7b7f7a --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obisplit.md @@ -0,0 +1,48 @@ +# `obisplit` Package: Semantic Description + +The `obisplit` package enables **targeted splitting of biological sequences** using user-defined pattern pairs (e.g., primers, barcodes), supporting approximate matching and robust annotation of resulting fragments—ideal for demultiplexing in metabarcoding or amplicon sequencing pipelines. + +## Core Concepts + +- **`SplitSequence`**: Represents a pattern pair (forward/reverse) with an associated group name. Used to define searchable molecular tags. +- **`Pattern_match`**: Encapsulates a detected pattern instance, including name, genomic coordinates (1-based), error count, and orientation. 
+ +## Pattern Detection (`LocatePatterns`) + +Scans a sequence for all forward/reverse pattern occurrences using **fuzzy matching** (mismatches and optionally indels): + +- Accepts raw or indexed sequences for efficient lookup. +- Detects matches with configurable error tolerance (default: ≤4 mismatches). +- Normalizes coordinates and reverse-complements backward-strand matches. +- Deduplicates overlapping hits by retaining the match with fewer errors. + +## Sequence Splitting (`SplitPattern`) + +Divides input sequences into fragments **between matched pattern pairs**, producing annotated output: + +- Each fragment is labeled with: + - `obisplit_frg`: Fragment number (1-based). + - `obisplit_nfrg`: Total fragment count. + - `obisplit_group`: Pattern-pair name (e.g., `"primerA-primerB"`), or `"extremity"` for terminal regions. + - `obisplit_set`: Relevant pattern set (e.g., `"primerA"`), or `"NA"`. + - `obisplit_location`: Genomic span (1-based, inclusive). +- Includes left/right pattern metadata: name, matched substring, and error count. + +## Pipeline Integration + +- **`SplitPatternWorker`**: Wraps splitting logic as a reusable `SeqWorker`, compatible with OBITools4’s streaming infrastructure. +- **`CLISplitPipeline`**: CLI entry point integrating pattern detection and splitting into a parallelizable, configurable pipeline. + +## Configuration & Usage + +- **CSV-based config**: Maps `tag` sequences to `pcr_pool` identifiers (required columns: `tag`, optionally `reverse_tag`). +- **CLI flags**: + - `-C, --config`: Load pattern definitions from CSV. + - `--template`: Output sample config for rapid setup. + - `--pattern-error N`: Max mismatches allowed (default: 4). + - `--allows-indels`: Enable insertion/deletion-aware matching. +- **Error handling**: Validates config structure, pattern compilation, and file access; logs fatal issues. 
+ +## Design Goals + +Optimized for **high-throughput amplicon processing**, `obisplit` bridges pattern detection and fragment extraction with minimal assumptions—ensuring flexibility for diverse molecular tagging schemes. diff --git a/autodoc/docmd/pkg_obitools_obisummary.md b/autodoc/docmd/pkg_obitools_obisummary.md new file mode 100644 index 0000000..e6ce23a --- --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obisummary.md @@ -0,0 +1,64 @@ +# `obisummary` Package: Semantic Description + +The `obisummary` package delivers lightweight, high-performance statistical summarization of biological sequence data processed by OBITools4. It enables rapid profiling of metadata and content-level features across large sequence sets—especially useful post-processing (e.g., after `obiclean` or merging)—while supporting parallel execution for scalability. + +## Core Data Model + +- **`DataSummary` struct**: Central container tracking: + - Global metrics: number of reads, unique variants (distinct sequences), and total symbols. + - Presence flags for special annotations: `merged_sample`, `obiclean_status`/`weight`. + - Categorized annotation metadata: + - Scalar attributes (single-value per sequence). + - Map-like tags (`map_tags`), where each key maps to counts. + - Vector or vector-like attributes (multi-value per sequence). + - Per-sample statistics: variant count, singleton detection, and `obiclean`-related flags (e.g., bad reads). + +## Low-Level Helpers + +- **Map aggregation utilities**: + - `sumUpdateIntMap`: Accumulates integer values across maps. + - `countUpdateIntMap`, `plusOne/PlusUpdateIntMap`: Increment counters for keys (e.g., attribute or sample names). + +- **`Add()` method**: Thread-safe merge of two `DataSummary`s—enables parallel accumulation. + +## Main Processing Logic + +- **`Update()` method**: Processes one `BioSequence`, updating: + - Read count (via `.Count()`) and sequence-level metrics. 
+ - Variant detection via unique sequences; symbol count (total length). + - Sample-aware logic: detects `merged_sample` or per-sample annotations to populate sample-level stats (e.g., singleton identification). + - Annotation classification: routes keys into scalar, map, or vector buckets. + +- **`ISummary()` function**: Parallel summarization engine: + - Distributes work across `nproc` goroutines. + - Aggregates partial summaries via atomic operations (`Add()`). + - Returns a structured map with: + ```json + { + "count": { "variants", "reads", "total_length" }, + "annotations": { + "scalar_attributes", + "map_attributes", + "vector_attributes", + "keys": { scalar: {...}, map: {...}, vector: {...} } + }, + "samples": { + "sample_count", + "sample_stats": { sample_name: { reads, variants, singletons [, obiclean_bad] } } + } + } + ``` + +## CLI Integration (`obisummary` subpackage) + +- **Option registration**: + - `SummaryOptionSet()`: Registers flags for output format (`--json-output`, `--yaml-output`) and map attributes to summarize (`-map `). + - `OptionSet()`: Extends above with input-handling options (e.g., file/iterator sources) from `obiconvert`. + +- **Runtime introspection**: + - `CLIOutFormat()`: Returns `"yaml"` (default) or `"json"`, with YAML only active if JSON is *not* requested. + - `CLIHasMapSummary()` / `CLIMapSummary()`: Check and retrieve requested map attributes. + +- **Design notes**: + - Uses global state (e.g., `__json_output__`, `__map_summary__`) for compatibility with [`go-getoptions`](https://github.com/DavidGamba/go-getoptions). + - Scope strictly limited to CLI configuration—no data processing logic resides here. 
diff --git a/autodoc/docmd/pkg_obitools_obitag.md b/autodoc/docmd/pkg_obitools_obitag.md new file mode 100644 index 0000000..4c7911d --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obitag.md @@ -0,0 +1,82 @@ +# `obitag` — Geometric & Alignment-Based Taxonomic Assignment Module + +The **OBITools4 `obitag`** package enables high-throughput taxonomic assignment of biological sequences using two complementary strategies: +- A **geometric approach** based on landmark-based coordinate mapping and distance minimization. +- An **alignment-aware heuristic** combining k‑mer pre-screening with LCS-based similarity scoring. + +Both modes integrate seamlessly into OBITools4 pipelines, support parallelization via `obiiter`, and enrich output sequences with rich metadata for downstream analysis. + +--- + +## Public Functionalities + +### 1. Reference Database Handling +- **`CLIRefDB()`**: Loads a reference database from file (FASTA/FASTQ/OBI/etc.) into `BioSequenceSlice`. +- **`CLISaveRefetenceDB()`**: Persists the loaded/processed reference DB to disk, with optional compression and parallel I/O. + +### 2. CLI Configuration & Options +- **`TagOptionSet()`**: Defines command-line flags: + - `-R/--reference-db`: *Required* input reference file. + - `--save-db`: Optional path to save processed DB (supports `.gz`, parallel write). + - `-G/--geometric`: Enables *experimental* geometric mode (faster, approximate). +- **`CLIGeometricMode()`**, `CLIRefDBName()`, `CLIRunExact()`: Runtime accessors for internal state. + +### 3. Geometric Taxonomic Assignment +- **`ExtractLandmarkSeqs()`**: Retrieves reference sequences annotated with non-default `LandmarkID`s, ordered by ID. +- **`ExtractTaxonSet()`**: Maps each landmark sequence to its taxonomic node (panics on missing taxa). +- **`MapOnLandmarkSequences()`**: Computes a coordinate vector for any query sequence: + - Measures LCS distances to each landmark → yields point in *landmark space*. 
+- **`FindGeomClosest()`**: Finds reference sequences with minimal Euclidean distance in landmark space; + - Resolves ties using LCS-based sequence identity (higher = better). +- **`GeomIdentify()`**: Assigns taxonomy to a query: + - If best identity >50% → LCA of matching references’ taxa, weighted by geometric distance. + - Else → assigns root taxon (`taxid=1`). + +### 4. Alignment-Based Taxonomic Assignment +- **`MatchDistanceIndex()`**: Maps a distance value to the closest taxon in `distanceIdx`: + - Binary search on sorted keys; falls back to root if no match. +- **`FindClosests()`**: Retrieves top matching references for a query: + - Pre-screening via **4-mer overlap** (`Common4Mer`). + - Refinement using LCS alignment scoring. + - Returns: top matches, edit distance (`maxe`), sequence identity (%), best match ID & indices. +- **`Identify()`**: Full taxonomic classification: + - Uses `FindClosests()`, precomputed reference indices (`OBITagRefIndex`), and LCA over matches. + - Assigns root taxon if no confident match; populates metadata (see below). + +### 5. Pipeline & Worker Integration +- **`GeomIdentifySeqWorker()`** / `IdentifySeqWorker()`: Wraps assignment logic into reusable sequence workers. +- **`CLIGeomAssignTaxonomy()`** / `CLIAssignTaxonomy()`: High-level CLI entry points: + - Filters/validates references, builds indexes (4-mer + taxon). + - Launches parallel batch processing via `obiiter`. + +--- + +## Output Metadata (Added to Assigned Sequences) + +| Attribute | Description | +|-----------|-------------| +| `"scientific_name"` | Taxonomic name of assigned node. | +| `"obitag_rank"` | Rank (e.g., `species`, `genus`). | +| `"obitag_bestid"` | Sequence identity (%) of best match. | +| `"obitat_min_dist"` | Minimal geometric distance (landmark space). | +| `"obitag_match_count"` | Number of matching references used for LCA. | +| `"obitat_coord"` | Landmark coordinates (geometric mode only). 
| +| `"obitag_similarity_method": "geometric"` | or `"alignment"`. | + +--- + +## Design Principles + +- **Dual-mode flexibility**: Choose between speed (`--geometric`) or accuracy (default alignment). +- **LCS-centric robustness**: Avoids full alignments; uses longest common subsequence for noise-tolerant scoring. +- **Index reuse**: Caches taxonomic indexes per reference to avoid recomputation in batch mode. +- **Fail-safe fallbacks**: Missing taxa or low identity → root taxon (`taxid=1`). +- **Scalability**: Parallel workers, batched iteration (`IBatchOver`), and optional compression. + +--- + +## Dependencies + +- `obitools4/obiiter`, `obiseq`, `obitax`, `obialign`, `obikmer` +- Standard I/O via `obiconvert`, `obiformats` + diff --git a/autodoc/docmd/pkg_obitools_obitagpcr.md b/autodoc/docmd/pkg_obitools_obitagpcr.md new file mode 100644 index 0000000..b0f1c0a --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obitagpcr.md @@ -0,0 +1,59 @@ +# `obitagpcr` Package: Semantic Feature Overview + +The `obitagpcr` package extends the OBITools4 ecosystem with CLI-ready, high-performance tools for **tag-based amplicon sequencing data processing**, focusing on consistent read orientation and robust sample demultiplexing using molecular barcodes. + +## Core CLI Configuration + +- **`TagPCROptionSet()`** + Adds a `--reorientate` flag to the CLI parser. When enabled, ensures all reads are stored in a *forward-strand orientation* relative to the expected PCR primers—by reverse-complementing reads originally aligned in the opposite direction. + +- **`OptionSet()`** + Aggregates all required option sets for tag-PCR workflows: + - `obipairing.OptionSet()` — controls paired-end read assembly (e.g., overlap, identity thresholds), + - `obimultiplex.MultiplexOptionSet()` — enables sample demultiplexing via barcode matching, + - `TagPCROptionSet()` — injects the reorientation behavior. 
+ +- **`CLIReorientate(cli)`** + Returns a boolean indicating whether reorientation is enabled, allowing downstream components to conditionally apply strand correction. + +## Sequence Processing Pipeline + +- **Paired-end assembly** + Uses `obipairing.AssemblePESequences()` to merge forward/reverse reads into consensus amplicons, respecting user-defined parameters: + - `minOverlap`, `minIdentity` — alignment stringency, + - gap/penalty parameters (`gapOpen`, `scale`) for accurate overlap resolution. + +- **Barcode extraction & validation** + Applies a compiled NGS filter (`CLINGSFilter`) to extract and validate barcodes from consensus sequences. Only reads with *exactly one* valid barcode (no error flags) proceed to demultiplexing. + +- **Sample assignment & metadata annotation** + Successful matches assign: + - `forward_tag`, `reverse_barcode` — raw barcode sequences, + - `obimultiplex_direction` — strand orientation relative to primer set (e.g., `"F"`, `"R"`), + - `obimultiplex_mismatches` — number of barcode mismatches, + - sample name (`obimultiplex_sample`) and experiment ID. + + Annotations are propagated to *both* reads in the original pair. + +- **Reorientation logic** + When `--reorientate` is active: reads assigned in reverse orientation (`"R"`) are reversed-complemented *before* final output, ensuring all consensus amplicons share a uniform forward orientation—critical for downstream alignment or variant calling. + +- **Error handling & filtering** + Failed demultiplexing (e.g., no match, ambiguous barcode) flags reads with `obimultiplex_error`. By default: + - Unidentified reads are discarded, *or* + - Saved to a dedicated file via `CLIUnidentifiedFileName(cli)`. + +- **Parallelization & scalability** + Leverages goroutines and batched iterators (`obidefault.ParallelWorkers()`) to maximize throughput across CPU cores. 
+ +- **Observability** + Optional statistics tracking (`withStats`) and structured logging (e.g., `"Worker started"`, `"Barcode filter passed"`), aiding debugging and performance profiling. + +## Integration & Use Cases + +Designed for **amplicon/metabarcoding workflows** where: +- PCR amplifies both DNA strands, leading to bidirectional reads; +- Primer positions are fixed and known (enabling orientation-aware assembly); +- Consistent strand direction improves accuracy in alignment, clustering, or taxonomic assignment. + +Built on core OBITools4 modules (`obiseq`, `obiiter`, `obialign`, `obimultiplex`), it integrates cleanly into modular NGS pipelines while preserving modularity and CLI extensibility. diff --git a/autodoc/docmd/pkg_obitools_obitaxonomy.md b/autodoc/docmd/pkg_obitools_obitaxonomy.md new file mode 100644 index 0000000..6da63c0 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obitaxonomy.md @@ -0,0 +1,51 @@ +# `obitaxonomy`: CLI-Oriented Taxonomic Data Utilities for OBItools4 + +The `obitaxonomy` Go package delivers modular, command-line-friendly tools for loading, filtering, navigating, and exporting taxonomic data within the OBItools4 ecosystem. It focuses on enabling reproducible, scriptable workflows for metagenomics and biodiversity informatics by abstracting complex taxonomy operations behind intuitive CLI flags. + +## Public Functionalities + +### Taxonomy Restriction & Filtering +- **`CLITaxonRestrictions()`**: Wraps a taxonomy iterator to apply user-defined clade restrictions via `--restrict-to-taxon` (`-r`). Supports taxon IDs or names (with optional regex), returning a filtered iterator over matching subtrees. +- **`CLIFilterRankRestriction()`**: Restricts the taxonomy iterator to taxa of a specific rank (e.g., `"species"`, `"family"`), controlled by `--rank` (`-R`). Returns a constrained iterator for downstream processing. 
+ +### Subtree Navigation & Iteration +- **`CLISubTaxonomyIterator()`**: Returns an iterator over the subtree rooted at a user-specified taxon ID (via `--dump`/`-D`). If no root is provided, exits with an error—enabling safe CLI-driven subtree extraction. + +### CSV Export +- **`CLICSVTaxaIterator()`**: Transforms a taxonomy iterator into an ordered stream of CSV records. Configurable columns include: + - Scientific name (`--without-scientific-name` to omit), + - Taxonomic rank (omittable via `-R`), + - Parent taxon ID (`--without-parent`/`-W`), + - Full lineage path (via `--path`, `-P`), + - Query source match (`--with-query`). +- **`CLICSVTaxaWriter()`**: Wraps `CLICSVTaxaIterator()`, handling output destination (`-` = stdout, file path otherwise), and integrates with CLI logging. + +### Tree Export +- **`CLINewickWriter()`**: Exports a taxonomy subtree (from `--dump`) as Newick format. Supports: + - Compression (`gzip` via `-z`), + - Leaf labels (scientific name/rank/taxid toggles), + - Root trimming (`--trim-root`), + - Output to file or stdout. + +### Data Acquisition +- **`CLIDownloadNCBITaxdump()`**: Fetches the latest NCBI taxonomy dump (`taxdump.tar.gz`) and saves it as `ncbitaxo_YYYYMMDD.tgz` (or custom name). Designed for one-click taxonomy setup. + +### Utility & Inspection Helpers +- **`CLIRankRestriction()` / `CLIWithScientificName()`**: Expose parsed CLI flags for use in custom processing pipelines. +- **`--rank-list` (`-l`)**: Prints all available ranks in the loaded taxonomy (for introspection). +- **Pattern matching**: `--fixed` (`-F`) disables regex for taxon name queries, enabling literal string matching. + +## Integration & Design Principles + +- Built on `obitax` for core taxonomy operations. +- Fully compatible with OBItools4’s option parsing (`getoptions`) and iterator patterns. +- Designed for composition: integrates seamlessly with `obiconvert` (output formatting) and other CLI modules. 
+- All functions respect `-`, stdout/stderr conventions, logging levels (`--verbose`), and CLI flag parsing. +- No internal state mutation—functions are pure wrappers around iterator transformations. + +## Target Use Cases + +- Filtering metagenomic assignments to a clade of interest (e.g., `--restrict-to-taxon 9606` for *Homo sapiens*). +- Exporting species-level taxa to CSV/JSON for downstream analysis. +- Generating Newick trees from custom taxonomic subsets (e.g., all *Enterobacteriaceae*). +- Bootstrapping local taxonomy caches via `--download-ncbi`. diff --git a/autodoc/docmd/pkg_obitools_obiuniq.md b/autodoc/docmd/pkg_obitools_obiuniq.md new file mode 100644 index 0000000..3aab0d6 --- /dev/null +++ b/autodoc/docmd/pkg_obitools_obiuniq.md @@ -0,0 +1,36 @@ +# `obiuniq` Package: Semantic Feature Overview + +The `obiuniq` package enables scalable, metadata-aware deduplication of biological sequence data (e.g., NGS amplicons or UMI-tagged reads), supporting both CLI and programmatic use. It groups identical sequences while preserving provenance, abundance counts, and user-defined metadata distinctions. + +## Core Functionalities + +### Sequence Dereplication +- **Grouping by user-defined attributes** (`--category-attribute`, `-c`): Sequences are collapsed based on one or more metadata fields (e.g., `sequence`, `umi`, `sample`), enabling stratified deduplication. +- **Singleton filtering** (`--no-singleton`, `-n`): Removes groups with only one member, reducing noise from sequencing errors or low-count artifacts. +- **NA value handling** (`--na-value`, `-N`): Replaces missing classifier tags (e.g., unassigned taxonomy) with a configurable placeholder to ensure consistent grouping. + +### Scalable & Configurable Processing +- **Chunked I/O** (`--chunk-count`, `--in-memory`): Processes large datasets efficiently using configurable disk-backed or in-memory chunking via the `obichunk` framework. 
+- **Sorting strategy** (`--on-disk`, `-d`): Switches between in-memory and external sorting to optimize memory usage for large inputs. +- **Parallelization** (`--parallel-workers`): Uses default worker threads to accelerate sorting and grouping steps. + +### Statistics & Metadata Preservation +- **Merge statistics** (`--merge`, `-m`): When enabled, records original sequence IDs per group (stored in `_StatsOn`) for lineage tracing and QC. +- **Flexible subcategorization** (`OptionSubCategory`): Allows grouping by multiple metadata keys (e.g., `umi + sample`) to support complex experimental designs. +- **Batch processing** (`--batch-size`, `OptionsBatchSize`): Controls chunk size for memory/performance tuning. + +### Programmatic Control +- **CLI state accessors**: Functions like `CLINAValue()`, `CLIKeys()`, and `CLINoSingleton()` expose runtime configuration. +- **Mutable setters**: Enable dynamic tuning (e.g., `SetNAValue()`, `AddStatsOn()`). +- **Integration with OBItools4**: Inherits generic I/O and options (`obiconvert.OptionSet`) for seamless pipeline compatibility. + +## `CLIUnique` Function + +Implements the main dereplication logic as a streaming iterator over deduplicated sequences (`obiiter.IBioSequence`). Each output sequence carries: +- A count of original occurrences (abundance), +- Merged metadata from input entries, +- Optional per-group statistics when `--merge` is active. + +Internally, it orchestrates chunked reading → sorting (in-memory or disk) → grouping → optional filtering — all guided by CLI-configurable parameters. Errors during initialization are logged via `log.Fatal`; runtime issues propagate through the iterator interface. + +Designed for high-performance, reproducible deduplication in UMI-aware or multiplexed NGS workflows. 
diff --git a/autodoc/docmd/pkg_obiutils.md b/autodoc/docmd/pkg_obiutils.md new file mode 100644 index 0000000..aca303c --- /dev/null +++ b/autodoc/docmd/pkg_obiutils.md @@ -0,0 +1,61 @@ +# `obiutils` — Semantic Feature Overview + +The **`obiutils`** package is a collection of low- and mid-level utilities for numerical computation, string manipulation, file I/O, concurrency control, data conversion, and format detection—specifically designed for bioinformatics pipelines in the OBITools 4 ecosystem. All public APIs are **type-safe**, **well-documented**, and optimized for performance or correctness depending on use case. + +## Core Functional Categories + +### 🔢 Numerical Utilities +- **`Abs[T constraints.Signed](x T) T`**: Generic absolute value for signed integers and floats (via `golang.org/x/exp/constraints`). +- **`Min/Max(...)`**: Unified functions accepting scalars, slices, or maps—uses reflection for heterogeneous inputs; returns errors on empty/unsupported types. +- **`MinMaxSlice[T constraints.Ordered]([]T) (min, max T)`**: Efficient min/max for ordered slices; panics on empty input. +- **`MinMultiset[T]`**: Lazy-delete min-priority multiset with O(log n) insertion, amortized O(1) minimum access. + +### 📦 Data Structures +- **`Set[E comparable]`**: Generic set using `map[E]struct{}` for O(1) membership; supports union, intersection, add/contains/members. +- **`Vector[T]`, `Matrix[T][][]T`**: Row-major 2D structures with methods: + - `.Column(i)`, `.Rows(indices...)`, `.Dim()` (safe for nil/jagged matrices). + - `Make2DArray[T]`, `Make2DNumericArray[T](rows, cols int, zeroed bool)` for allocation. + +### 🧠 Type Conversion & Validation +- **`InterfaceToString(i interface{}) string`**, + `CastableToInt(...)`, + `InterfaceToBool(...)` / `Int` / `Float64`: Safe conversions with typed errors (`NotAnInteger`, etc.). +- **`MapToMapInterface(...)`, `InterfaceToIntMap(...)` / `StringMap`: Converts generic maps to concrete types via reflection. 
+- **`InterfaceToStringSlice(...)`**: Normalizes `[]interface{}` or string slices to `[]string`. + +### 📄 File & Stream I/O +- **`ReadLines(path string) ([]string, error)`**: Buffered line-by-line file reading. +- **`Wfile` abstraction** (`OpenWritingFile`, `CompressStream`) with transparent gzip (via `pgzip`), buffering, and append support. +- **`Ropen/Wopen(...)`**: Unified opener for files/stdin/HTTP/pipes, auto-detecting gzip/xz/zstd/bzip2 via magic bytes. +- **`DownloadFile(url, path string)`**: Simple HTTP download with progress bar (no retries/timeouts). +- **`TarFileReader(r io.Reader, path string)`**: Extracts a single file from TAR by exact name match. + +### 🔤 String & ASCII Processing +- **`InPlaceToLower([]byte) []byte`**: Zero-copy uppercase→lowercase conversion for ASCII using bitwise OR (`| 32`). +- **`UnsafeStringFromBytes([]byte) string`, `UnsafeBytes(string) []byte`**: Zero-copy conversions (⚠️ unsafe; no bounds checks). +- **`AsciiSet[256]bool`**: Predefined sets (`Space`, `Digit`, `Alpha`) + operations (union, intersect) and helpers: + - `.FirstWord(...)`, `TrimLeft(s string)` (via method), `RightSplitInTwo(...)`. + +### 📏 Memory & Path Utilities +- **`ParseMemSize(s string) (int, error)`**: Parses `"128K"`, `"5MB"` → bytes. +- **`FormatMemSize(n int) string`**: Formats byte counts as `"1.5K"`, `"2M"` (powers of 1024). +- **`RemoveAllExt(path string)`, `Basename(path string)`**: Strip *all* extensions from paths (e.g., `"file.tar.gz"` → `"file"`). + +### 📡 Format Detection & MIME Handling +- **`HasBOM([]byte) bool, BOMType`**: Detects UTF-8/16/32 byte order marks. +- **`DropLastLine([]byte) []byte`**: Trims final newline-delimited line (for truncated files). +- **`RegisterOBIMimeType(...)`**: Extends MIME detection for bioformats (FASTA/FASTQ, CSV, ecoPCR2, GenBank) via regex/magic headers. + +### 🔄 Concurrency & Synchronization +- **`AtomicCounter(start int)`**: Thread-safe counter with `Inc()`, `Dec()`, `Value()` (mutex-protected). 
+- **`RegisterAPipe/UnregisterPipe()`, `WaitForLastPipe()`**: Lightweight pipeline sync via `sync.WaitGroup` (logs active goroutines). + +### 📊 Ranking & Ordering +- **`IntOrder(data []int) []int`, `ReverseIntOrder(...)`: Returns index permutation for ascending/descending sort (original slice unchanged). +- **`Order[T sort.Interface](data T) []int`: Generic stable index-based sorting. + +### 🧪 Testing & Reliability +- All functions include **unit tests** (table-driven, `reflect.DeepEqual`, subtests). +- Error handling is explicit and typed; logging via Logrus for debugging. +- No external dependencies beyond `golang.org/x/exp/constraints` (for generics) and optional libraries (`progressbar`, `pgzip`). +- Designed for portability across Unix/Windows (uses standard library paths). diff --git a/autodoc/examples/obiconvert/output.json b/autodoc/examples/obiconvert/output.json new file mode 100644 index 0000000..87f7387 --- /dev/null +++ b/autodoc/examples/obiconvert/output.json @@ -0,0 +1,23 @@ +[ + { + "annotations": { + "definition": "Test DNA sequence for FASTA conversion" + }, + "id": "seq001", + "sequence": "atcgatcgatcgatcgatcgatcgatcgatcgatcgatcg" + }, + { + "annotations": { + "definition": "Another test sequence with different nucleotide content" + }, + "id": "seq002", + "sequence": "gctagctagctagctagctagctagctagctagctagct" + }, + { + "annotations": { + "definition": "Third sequence for testing output format" + }, + "id": "seq003", + "sequence": "ttaaccggttaaccggttaaccggttaaccggttaaccg" + } +] diff --git a/autodoc/examples/obicount/out_default.txt b/autodoc/examples/obicount/out_default.txt new file mode 100644 index 0000000..a26c72e --- /dev/null +++ b/autodoc/examples/obicount/out_default.txt @@ -0,0 +1,7 @@ +time="2026-04-02T19:33:11+02:00" level=info msg="Number of workers set 16" +time="2026-04-02T19:33:11+02:00" level=info msg="Found 1 files to process" +time="2026-04-02T19:33:11+02:00" level=info msg="input.fasta mime type: text/fasta" +entities,n 
+variants,5 +reads,5 +symbols,435 diff --git a/autodoc/examples/obicount/out_fastq_reads.txt b/autodoc/examples/obicount/out_fastq_reads.txt new file mode 100644 index 0000000..8ed40ce --- /dev/null +++ b/autodoc/examples/obicount/out_fastq_reads.txt @@ -0,0 +1,4 @@ +time="2026-04-02T19:33:38+02:00" level=info msg="Number of workers set 16" +time="2026-04-02T19:33:38+02:00" level=info msg="Found 1 files to process" +entities,n +reads,4 diff --git a/autodoc/examples/obicount/out_symbols.txt b/autodoc/examples/obicount/out_symbols.txt new file mode 100644 index 0000000..3ff2c76 --- /dev/null +++ b/autodoc/examples/obicount/out_symbols.txt @@ -0,0 +1,5 @@ +time="2026-04-02T19:33:29+02:00" level=info msg="Number of workers set 16" +time="2026-04-02T19:33:29+02:00" level=info msg="Found 1 files to process" +time="2026-04-02T19:33:29+02:00" level=info msg="input.fasta mime type: text/fasta" +entities,n +symbols,435 diff --git a/autodoc/examples/obicount/out_variants.txt b/autodoc/examples/obicount/out_variants.txt new file mode 100644 index 0000000..0158a61 --- /dev/null +++ b/autodoc/examples/obicount/out_variants.txt @@ -0,0 +1,5 @@ +time="2026-04-02T19:33:20+02:00" level=info msg="Number of workers set 16" +time="2026-04-02T19:33:20+02:00" level=info msg="Found 1 files to process" +time="2026-04-02T19:33:20+02:00" level=info msg="input.fasta mime type: text/fasta" +entities,n +variants,5 diff --git a/autodoc/examples/obicsv/output6.csv.gz b/autodoc/examples/obicsv/output6.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..b6154237e7ecdae9cbff4302976e1a698fb651c0 GIT binary patch literal 85 zcmb2|=3rpp%t`!_9 literal 0 HcmV?d00001 diff --git a/autodoc/examples/obiscript/annotate.lua b/autodoc/examples/obiscript/annotate.lua new file mode 100644 index 0000000..82b64c4 --- /dev/null +++ b/autodoc/examples/obiscript/annotate.lua @@ -0,0 +1,9 @@ +-- Adds a 'sample' attribute by extracting the prefix before the first underscore +function 
worker(sequence) + local id = sequence:id() + local sample = string.match(id, "^(.-)_") + if sample then + sequence:attribute("sample", sample) + end + return sequence +end diff --git a/autodoc/examples/obiscript/enrich.lua b/autodoc/examples/obiscript/enrich.lua new file mode 100644 index 0000000..44806a2 --- /dev/null +++ b/autodoc/examples/obiscript/enrich.lua @@ -0,0 +1,5 @@ +-- Marks each sequence as processed by adding a 'processed' attribute +function worker(sequence) + sequence:attribute("processed", "true") + return sequence +end diff --git a/autodoc/examples/obiscript/enriched.json b/autodoc/examples/obiscript/enriched.json new file mode 100644 index 0000000..4e8667a --- /dev/null +++ b/autodoc/examples/obiscript/enriched.json @@ -0,0 +1,38 @@ +[ + { + "annotations": { + "definition": "long sequence passes min-length 100 filter", + "processed": "true" + }, + "id": "seq001", + "qualities": "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII", + "sequence": "atcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcgatcg" + }, + { + "annotations": { + "definition": "short sequence fails min-length 100 filter", + "processed": "true" + }, + "id": "seq002", + "qualities": "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII", + "sequence": "gctagctagctagctagctagctagctagctagctagctagctagctagctagctagctagctagctagctagctagcta" + }, + { + "annotations": { + "definition": "long sequence passes min-length 100 filter", + "processed": "true" + }, + "id": "seq003", + "qualities": "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII", + "sequence": "ttaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaattaa" + }, + { + "annotations": { + "definition": "short sequence fails min-length 100 filter", + 
"processed": "true" + }, + "id": "seq004", + "qualities": "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII", + "sequence": "ccggccggccggccggccggccggccggccggccggccggccggccggccgg" + } +] diff --git a/autodoc/examples/obiscript/my_script.lua b/autodoc/examples/obiscript/my_script.lua new file mode 100644 index 0000000..485458b --- /dev/null +++ b/autodoc/examples/obiscript/my_script.lua @@ -0,0 +1,17 @@ +function begin() + obicontext.item("compteur", 0) +end + +function worker(sequence) + samples = sequence:attribute("merged_sample") + samples["tutu"] = 4 + sequence:attribute("merged_sample", samples) + sequence:attribute("toto", 44444) + nb = obicontext.inc("compteur") + sequence:id("seq_" .. nb) + return sequence +end + +function finish() + print("compteur = " .. obicontext.item("compteur")) +end diff --git a/autodoc/examples/obiscript/process_pairs.lua b/autodoc/examples/obiscript/process_pairs.lua new file mode 100644 index 0000000..80b8eb5 --- /dev/null +++ b/autodoc/examples/obiscript/process_pairs.lua @@ -0,0 +1,4 @@ +-- Simple pass-through script: returns each sequence unchanged +function worker(sequence) + return sequence +end diff --git a/autodoc/examples/obisummary/out_json.json b/autodoc/examples/obisummary/out_json.json new file mode 100644 index 0000000..c465008 --- /dev/null +++ b/autodoc/examples/obisummary/out_json.json @@ -0,0 +1,17 @@ +{ + "annotations": { + "keys": { + "scalar": { + "count": 5 + } + }, + "map_attributes": 0, + "scalar_attributes": 1, + "vector_attributes": 0 + }, + "count": { + "reads": 21, + "total_length": 100, + "variants": 5 + } +} diff --git a/autodoc/prompt_doc.md b/autodoc/prompt_doc.md new file mode 100644 index 0000000..ad3f22a --- /dev/null +++ b/autodoc/prompt_doc.md @@ -0,0 +1,60 @@ +# Meta-tâche : documenter obi{xxx} + +Produis la documentation complète de la commande `obi{xxx}` en trois étapes séquentielles. 
+ +**RÈGLE ABSOLUE DE SÉQUENTIALITÉ :** +- Ne lis JAMAIS le fichier d'une étape avant que l'étape précédente soit entièrement terminée. +- "Terminée" signifie : le Write final de l'étape a été émis et confirmé. +- Ne lis JAMAIS les trois fichiers en parallèle ou en avance. +- Entre deux étapes, ne produis aucun texte de transition — passe directement à la lecture. + +--- + +## ÉTAPE 1 + +Lis ce fichier : + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/prompt_v2.md"} + +``` + +Applique intégralement le prompt que tu viens de lire pour la commande `obi{xxx}`. +Exécute tous ses états dans l'ordre jusqu'au `Write` final. + +**STOP.** Le `Write` final de cette étape a-t-il été émis ? Si oui, procède à l'ÉTAPE 2. Sinon, termine l'ÉTAPE 1. + +--- + +## ÉTAPE 2 + +Lis ce fichier : + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/prompt_examples.md"} + +``` + +Applique intégralement le prompt que tu viens de lire pour la commande `obi{xxx}`. +Exécute tous ses états dans l'ordre jusqu'au `Write` final. + +**STOP.** Le `Write` final de cette étape a-t-il été émis ? Si oui, procède à l'ÉTAPE 3. Sinon, termine l'ÉTAPE 2. + +--- + +## ÉTAPE 3 + +Lis ce fichier : + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/prompt_hugo.md"} + +``` + +Applique intégralement le prompt que tu viens de lire pour la commande `obi{xxx}`. +Exécute tous ses états dans l'ordre jusqu'au `Write` final. + +**STOP.** Quand le `Write` final est émis, la tâche est terminée. N'émets aucun texte après. diff --git a/autodoc/prompt_examples.md b/autodoc/prompt_examples.md new file mode 100644 index 0000000..e982c62 --- /dev/null +++ b/autodoc/prompt_examples.md @@ -0,0 +1,407 @@ +# Task + +Given `autodoc/cmd/obi{xxx}.md`, produce: +1. `autodoc/examples/obi{xxx}/` — a directory containing synthetic input sequence files + that allow every example in the EXAMPLES section to be executed and validated. +2. 
An updated `autodoc/cmd/obi{xxx}.md` — with corrected EXAMPLES and an enriched OUTPUT + section describing observed output annotations. + +--- + +## TOOL CALL FORMAT — enforce before every call + +A tool call is exactly: + + + {"param": "value"} + + +Rules (no exceptions): +- `<` is immediately followed by `f` — zero spaces, zero characters in between. +- Parameters are a **single JSON object** — no XML tags, no ``, no ``. +- No outer wrapper — never use ``, ``, or any other enclosing tag. +- Tool name is lowercase with double underscores. + +--- + +## HALLUCINATION GUARD + +Every sequence written in STATE 2 must be biologically valid for the command being +tested. Derive sequence content from the OPTIONS and OUTPUT sections of `$doc` — never +invent behaviour not described there. + +**EXECUTION GUARD — critical:** The `## Observed output example` subsection added in +STATE 5 MUST contain verbatim bytes from `$outputs` (actual tool output read in STATE 4). +It MUST NOT be invented or approximated. If no command succeeded, omit the subsection +entirely rather than writing invented content. + +--- + +## DOCUMENT PRESERVATION — critical + +The output of STATE 5 is `$doc` with **surgical edits only**. The rules are: + +- Copy the ENTIRE content of `$doc` verbatim into the new file. +- Apply ONLY the three modifications described in STATE 5 (EXAMPLES update, + prose corrections, OUTPUT subsection addition). +- Do NOT reformat, reorder, rewrite, or restructure any heading, paragraph, + option list, or prose from `$doc` **unless it is factually contradicted by + actual execution results** (see Modification 2 in STATE 5). +- Do NOT add new top-level sections (no ENVIRONMENT VARIABLES, no duplicate OUTPUT, etc.). +- Do NOT change section title casing, Markdown heading levels, or list syntax. +- If in doubt, leave the section exactly as it appears in `$doc`. 
+ +--- + +## FASTQ FORMAT — mandatory structure + +A valid FASTQ record is **exactly 4 lines** in this order: + +``` +@ + ← MUST be non-empty (≥ 10 characters, A/T/G/C only) ++ + ← MUST be the exact same length as the sequence line +``` + +Common mistakes that are **forbidden**: +- Writing `@header\n+\nquality` with the sequence line missing. +- Writing a quality string shorter or longer than the sequence. +- Mixing `>` (FASTA) and `@` (FASTQ) headers in the same file. +- Writing `~`-separated fields (e.g. `@seq002~description~here`) — use a space. + +--- + +## OUTPUT FORMAT GUARD + +OBITools4 determines the output format from the **data content and explicit flags**, +**not from the output filename extension**. A file named `out.fasta` will contain FASTQ +if quality scores are present and no `--fasta-output` flag is given. + +Rules when designing examples: +- If the example is meant to produce FASTA output from FASTQ input, the command MUST + include `--fasta-output`. +- If the example is meant to produce FASTQ output from FASTA input, the command MUST + include `--fastq-output`. +- Never assume an output format from the filename alone. +- Verify the actual format of each output file in STATE 3b by checking its first + character (`>` = FASTA, `@` = FASTQ, `[` or `{` = JSON). + +--- + +## OPTION VALIDATION GUARD + +Before writing any example command in STATE 2, explicitly cross-check each option +against the OPTIONS section of `$doc`: + +- Every flag used must appear in the OPTIONS section with the claimed semantics. +- Input-format flags (`--fasta`, `--fastq`, `--csv`, `--genbank`, `--embl`, + `--ecopcr`) tell the tool how to **read** the input. They do NOT affect the + output format. +- Output-format flags (`--fasta-output`, `--fastq-output`, `--json-output`) tell + the tool what format to **write**. If there is no `--csv-output` (or similar) in + the OPTIONS section, do NOT write an example claiming CSV output. 
+- If an option needed for a working example is absent from `$doc`, mark that example + as SKIP rather than inventing a flag. + +--- + +## ANNOTATION RULES — CRITICAL + +When creating FASTA/FASTQ files with annotations: +- Use **only** valid annotation attribute names: `taxid`, `scientific_name`, `rank`, `definition`, `sample`, `run_id`, `instrument` +- For taxonomy data: use `taxid` (NCBI Taxonomy ID) and `scientific_name` — never invent taxids +- Examples of valid taxonomy annotations: + - `>seq001 {"taxid":2}` — Bacteria (valid NCBI taxid) + - `>seq002 {"taxid":2157,"scientific_name":"Archaea"}` — Archaea (valid NCBI taxid) + - `>seq003 {"taxid":2759,"scientific_name":"Eukaryota"}` — Eukaryota (valid NCBI taxid) +- NEVER use invented taxids +- **Map attributes** (JSON maps) must have names ending with `_merged` (e.g., `taxid_merged`, `sample_merged`) + +--- + +## CSV FILES FOR JOINS + +When creating CSV files for `obijoin`: +- Do NOT include the ID column in the CSV (the join key is specified separately via `--by`) +- The CSV format is auto-detected; do NOT use `--csv` flag +- Example CSV structure for taxid join: + ``` + taxid,scientific_name,phylum + 2,Bacteria,Proteobacteria + 2157,Archaea,Euryarchaeota + 2759,Eukaryota,Arthropoda + ``` +- Example command: `obijoin --join-with taxonomy.csv --by taxid sequences.fasta` + +--- + +## PIPELINE + +Execute the five states below in order. Do not skip states. Do not merge states. + +--- + +### STATE 1 — Read the documentation file and fetch pipeline command docs + +**Input:** nothing. +**Action:** + +Step 1a — read the autodoc file: +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md"} + +``` + +Step 1b — scan the EXAMPLES section of the file just read for any `obi*` commands +other than `obi{xxx}` itself that appear in pipeline examples (e.g. `obigrep`, `obiuniq`, +`obiclean`). 
For each such command found, emit a WebFetch call to retrieve its online +documentation (in the same parallel message as Step 1a if possible, otherwise +immediately after): +``` + +{"url": "https://obitools4.metabarcoding.org/obitools/obi/"} + +``` +If the page returns a 404 or error, store an empty string for that command. + +**Output:** store content as `$doc`, and store fetched pages as `$pipeline_docs` +(a map from command name to page content). +**Stop.** Do not interpret or summarise. Proceed to STATE 2. + +--- + +### STATE 2 — Analyse examples and design input files + +**Input:** `$doc`. +**Action (no tool calls):** + +1. Extract every example command from the EXAMPLES section of `$doc`. + - Identify every distinct input filename referenced (e.g. `sequences.fasta`, + `reads_R1.fastq`, …). + - Identify every option used and verify each against the OPTIONS section (see + OPTION VALIDATION GUARD above). + - For any `obi*` command used in a pipeline (not `obi{xxx}` itself), verify its + flags and expression syntax against `$pipeline_docs`. If `$pipeline_docs` for + that command is empty (page not found), mark the example as SKIP rather than + guessing the syntax. + - **Coverage check — command-specific options:** list all command-specific options + from the OPTIONS section (excluding those covered by standard option-sets: `--fasta`, + `--fastq`, `--out`, `--compress`, `--max-cpu`, etc.). Verify that every such option + appears in at least one non-skipped example. If any option is not covered, **add an + additional example** that exercises it before proceeding. + - **Skip any example that requires an external resource** (taxonomy database, + remote URL, pre-existing output file from a previous step not produced here). + Mark it as SKIP — it will be kept verbatim in the EXAMPLES section without + a `**Expected output:**` annotation. + - **`--paired-with` examples:** `--paired-with` requires `--out` (standard output + cannot be used). 
The command produces TWO output files named `<stem>_R1.ext`
+     and `<stem>_R2.ext` where `<stem>` is the stem of the value given to `--out`
+     and `.ext` is the format extension. For example:
+     `obi{xxx} --paired-with reverse.fastq --out out_paired.fastq forward.fastq`
+     produces `out_paired_R1.fastq` and `out_paired_R2.fastq`.
+     Do NOT use `>` redirection for paired-with examples — use `--out` only.
+     In STATE 4, read both `_R1` and `_R2` output files.
+
+2. For each distinct input filename, design synthetic sequence content that:
+   - Is **minimal** (≤ 20 sequences, each ≤ 300 bp).
+   - Contains sequences that **will** produce output for the given command (positive cases).
+   - Contains at least one sequence that **will not** produce output, to confirm filtering
+     (negative case), when the command filters sequences.
+   - Exercises every option combination present in the non-skipped examples.
+   - Uses realistic-looking identifiers (`seq001`, `seq002`, …) and a short
+     definition that describes what makes the sequence relevant to the test.
+
+3. **File format rules (strictly enforced):**
+
+   **FASTA:** one `>id description` header line, then the sequence on one or more
+   lines (60 bp per line). Every sequence must be non-empty (≥ 10 bp, A/T/G/C only).
+
+   **FASTQ:** exactly 4 lines per record — see FASTQ FORMAT section above.
+   Before finalising the FASTQ content, mentally verify each record:
+   - Line 1 starts with `@`, has an identifier, optionally a space and description.
+   - Line 2 is the nucleotide sequence (non-empty, ≥ 10 characters).
+   - Line 3 is exactly `+` (nothing else required).
+   - Line 4 is the quality string with **exactly the same number of characters**
+     as line 2.
+   If any record fails this check, fix it before proceeding.
+
+4. 
Rewrite every non-skipped example command into two forms: + - `$cmds_doc`: the bare command as it will appear in the documentation — references + only filenames present in `autodoc/examples/obi{xxx}/`, output redirected to a + descriptive filename (e.g. `out_default.fasta`). **No `cd` prefix.** + - `$cmds_run`: the same command prefixed with the `cd` so it can be executed: + `cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx} &&` + +**Output:** store file designs as `$files`, `$cmds_doc`, and `$cmds_run`. +**Stop.** Proceed to STATE 3. + +--- + +### STATE 3 — Write input files, validate them, and run examples + +**Input:** `$files`, `$cmds_doc`, `$cmds_run`. + +**Step 3a — create input files (parallel):** +Emit one Write call per input file designed in STATE 2. + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/FILENAME", "content": "..."} + +``` + +**Stop.** Wait for all writes to complete. Then proceed to Step 3b. 
+ +**Step 3b — validate input files:** +Before running any example, emit one Bash call that checks every written input file: + +``` + +{"command": "cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx} && python3 -c \"\nimport sys\nfor fname in $(echo FILENAMES):\n lines = open(fname).readlines()\n if fname.endswith('.fastq'):\n assert len(lines) % 4 == 0, f'{fname}: line count not multiple of 4'\n for i in range(0, len(lines), 4):\n hdr, seq, plus, qual = lines[i:i+4]\n assert hdr.startswith('@'), f'{fname} record {i//4+1}: header must start with @'\n seq = seq.rstrip(); qual = qual.rstrip()\n assert len(seq) >= 10, f'{fname} record {i//4+1}: sequence too short ({len(seq)})'\n assert len(seq) == len(qual), f'{fname} record {i//4+1}: seq len {len(seq)} != qual len {len(qual)}'\n elif fname.endswith('.fasta') or fname.endswith('.fa'):\n assert lines[0].startswith('>'), f'{fname}: first line must start with >'\nprint('All input files valid')\n\" 2>&1; echo EXIT:$?"} + +``` + +If validation fails (EXIT non-zero or output is not `All input files valid`): fix the +offending file(s) with new Write calls, then re-run validation. Do NOT proceed to +Step 3c until validation passes. + +**Step 3c — run examples (sequential, one Bash call at a time):** +Emit ONE Bash call, wait for the result, then emit the next. Do NOT batch them. 
+ +``` + +{"command": "cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx} && COMMAND 2>&1; echo EXIT:$?"} + +``` + +After each successful run (EXIT:0), immediately verify the output file was actually +created and is non-empty with a second Bash call: + +``` + +{"command": "ls -la /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/OUTPUT_FILE && head -c 100 /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/OUTPUT_FILE"} + +``` + +Also verify the output format matches expectation using the first character rule +(see OUTPUT FORMAT GUARD): `>` = FASTA, `@` = FASTQ, `[`/`{` = JSON. If the format +is wrong, add the missing `--fasta-output` / `--fastq-output` / `--json-output` flag, +update `$cmds_doc` and `$cmds_run`, and re-run. + +For each command, record in `$runs`: +- The `$cmds_doc` form (bare command for documentation). +- Exit code. +- The output filename(s). +- The confirmed output format (FASTA / FASTQ / JSON). +- The full stdout/stderr text. + +If a command fails (EXIT non-zero): diagnose the error from stderr, fix the command, +update both `$cmds_doc` and `$cmds_run`, and re-run. +Do NOT proceed to STATE 4 until all non-skipped commands have EXIT:0 and verified +non-empty output files. + +**Output:** store per-command results as `$runs`. +**Stop.** Proceed to STATE 4. + +--- + +### STATE 4 — Read output files + +**Input:** `$runs` (output file paths from STATE 3). +**Action:** emit one Read call per output file that was successfully produced (EXIT:0). + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/OUTPUT_FILE"} + +``` + +Emit all reads in a single parallel message. + +**Output:** store contents as `$outputs`. +**Stop.** Proceed to STATE 5. + +--- + +### STATE 5 — Update the documentation file + +**Input:** `$doc`, `$runs`, `$outputs`, `$cmds_doc`. 
+ +Produce the updated file by copying `$doc` **verbatim** and applying ONLY the +three modifications below. Re-read the DOCUMENT PRESERVATION rules at the top before +writing. + +#### Modification 1 — EXAMPLES section + +For each non-skipped example: +- Replace the original command with the rewritten `$cmds_doc` form. +- Keep the one-line biological use-case comment above the code block unchanged. +- The `**Expected output:**` annotation goes on its own line **after** the closing + triple-backtick of the code block, never inside it: + + ``` + ```bash + obi{xxx} [options] input_file > out_name.fasta + ``` + + **Expected output:** N sequences written to `out_name.fasta`. + ``` + + where N is the count of lines starting with `>` or `@` in the corresponding + `$outputs` entry. + +For skipped examples: keep them exactly as they are in `$doc` with no annotation. + +#### Modification 2 — Prose corrections (DESCRIPTION, OPTIONS, NOTES, …) + +After completing all runs in STATE 3, compare `$runs` and `$outputs` against the +prose in `$doc` outside the EXAMPLES section. For each **factual contradiction** +found — where the documentation claims a behaviour that actual execution disproves — +apply a minimal correction: + +- Fix only the specific sentence or phrase that is wrong. Do not rewrite the + surrounding paragraph. +- Preserve the original wording as much as possible; change only what is incorrect. +- Examples of things to correct: + - An option described as producing output X when it actually produces output Y. + - A default value stated incorrectly. + - An attribute name that differs from what appears in actual output. + - A claim about which sequences are selected/discarded that contradicts observed results. + - An output format claimed by the documentation that differs from the actual output + format observed (e.g. claiming CSV output when the tool produces FASTA). 
+- After each corrected passage, add an inline HTML comment documenting the fix: + `` +- Do NOT "improve" text that is merely incomplete or imprecise — only fix outright + contradictions with observed behaviour. + +#### Modification 3 — OUTPUT section + +Find the existing `# OUTPUT` section in `$doc`. At the very end of that section +(before the next `---` or `#` heading), append a single new subsection: + +```markdown +## Observed output example + +``` + +``` +``` + +Rules: +- The excerpt is copied byte-for-byte from `$outputs`. No editing, no truncation + within a sequence record. +- Do NOT duplicate the OUTPUT section. There must be exactly one `# OUTPUT` heading + in the resulting file. +- If no output was successfully produced, omit this subsection entirely. + +#### Final write + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md", "content": "..."} + +``` + +**Stop. Do not emit any text after the Write call.** diff --git a/autodoc/prompt_full.md b/autodoc/prompt_full.md new file mode 100644 index 0000000..dd3c580 --- /dev/null +++ b/autodoc/prompt_full.md @@ -0,0 +1,791 @@ +# Task + +Produce complete documentation for the `obi{xxx}` CLI command: +1. `autodoc/cmd/obi{xxx}.md` — the markdown documentation file +2. `autodoc/examples/obi{xxx}/` — synthetic input sequence files for testing examples +3. `obitools4-doc/content/docs/commands//obi{xxx}/_index.md` — Hugo documentation page + +Execute the three phases below **in order**. Do not skip phases. Do not merge phases. + +--- + +## TOOL CALL FORMAT — enforce before every call + +A tool call is exactly: + + + {"param": "value"} + + +Rules: +- `<` is immediately followed by `f` — zero spaces, zero characters in between. +- Parameters are a **single JSON object** — no XML tags, no ``, no ``. +- No outer wrapper — never use ``, ``, or any other enclosing tag. 
+- Tool name lowercase with double underscores — never ALL_CAPS, never single underscore between server and tool name. + +CORRECT: `` +WRONG: `< function=mcp__treesitter__treesitter_get_dependencies >` ← spaces +WRONG: `` ← wrong separator + +--- + +## HALLUCINATION GUARD — enforce before writing anything + +OBITools4 is a **complete rewrite**. Training data about OBITools v1/v2/v3 is wrong for this version. + +Before writing any sentence, apply this check: + +> "Can I point to the exact line in $help or $docs that justifies this claim?" +> If NO → do not write it. + +This applies to: option names, option flags, default values, file formats, behaviours, algorithms, output fields. +Omit rather than guess. A shorter correct page is better than a longer hallucinated one. + +--- + +## PHASE 1 — Generate initial documentation file + +(Equivalent to prompt_v2.md) + +### STATE 1 — Gather raw data (parallel) + +**Input:** nothing. +**Action:** emit all of the following tool calls in a single message. + +Call 1 — dependencies of the main entry point: +``` + +{"language": "go", "file_path": "cmd/obitools/obi{xxx}/main.go"} + +``` + +Call 2 — dependencies of every file in the command package (one call per `.go` file in `pkg/obitools/obi{xxx}/`): +``` + +{"language": "go", "file_path": "pkg/obitools/obi{xxx}/options.go"} + + +{"language": "go", "file_path": "pkg/obitools/obi{xxx}/obi{xxx}.go"} + +``` + +Call 3 — symbol outline of the command package: +``` + +{"repo": "git.metabarcoding.org/obitools/obitools4/obitools4", "file_paths": ["pkg/obitools/obi{xxx}/options.go", "pkg/obitools/obi{xxx}/obi{xxx}.go"]} + +``` + +Call 4 — CLI help text: +``` + +{"command": "cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4 && obi{xxx} --help 2>&1"} + +``` + +**Output:** store results as `$deps`, `$outline`, `$help`. +**Stop.** Do not interpret, summarise, or write anything. Proceed to STATE 2. + +--- + +### STATE 2 — Resolve documentation files + +**Input:** `$deps`, `$outline`. 
+**Action (no tool calls):** + +1. Collect every import path that starts with `git.metabarcoding.org/obitools/obitools4/obitools4/pkg/`. +2. Remove these infrastructure packages: + - `pkg/obidefault`, `pkg/obiiter`, `pkg/obiformats`, `pkg/obioptions`, `pkg/obiutils`, `pkg/obiparams` + - `pkg/obiseq` — keep only if `$outline` shows non-trivial sequence manipulation (custom methods or transformations beyond simple access). +3. Always add: `pkg/obitools/obi{xxx}` and `pkg/obitools/obiconvert`. +4. Map each remaining package path to its doc file: + - take all path segments from `pkg/` onward (inclusive), replace `/` with `_` → `autodoc/docmd/.md` + - Example: `git.metabarcoding.org/.../pkg/obitools/obicsv` → `autodoc/docmd/pkg_obitools_obicsv.md` + - Example: `git.metabarcoding.org/.../pkg/obiseq` → `autodoc/docmd/pkg_obiseq.md` + +**Output:** store file list as `$docfiles`. +**Stop.** Proceed to STATE 3. + +--- + +### STATE 3 — Read documentation files (parallel) + +**Input:** `$docfiles`. +**Action:** emit one `Read` call per file in `$docfiles`, all in a single parallel message. + +Do NOT read any `.go` source file. Do NOT produce any summary or analysis. + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/DOCFILE"} + +``` + +**Output:** store all file contents as `$docs`. +**Stop.** Proceed to STATE 4. + +--- + +### STATE 4 — Write the documentation file + +**Input:** `$help`, `$docs`, `$outline`. +**Action:** fill the template below, then emit exactly one `Write` call. + +**Source discipline:** every piece of information in the template MUST come from `$help`, `$docs`, or `$outline`. +If the source does not contain the information, write `_(not available)_` — never invent. + +#### Output template + +```markdown +# NAME + +obi{xxx} — [FILL: one-line description. 
Source: first line of $help] + +--- + +# SYNOPSIS + +[FILL: verbatim USAGE block from $help, inside a code block] + +--- + +# DESCRIPTION + +[FILL: 2–4 paragraphs explaining what the command does, why a biologist would use it, +and what it produces. Source: $help description section + $docs. +No jargon. No implementation details (goroutines, channels, GC, arena). +No options that belong in the OPTIONS section.] + +--- + +# INPUT + +[FILL: accepted input formats and how to provide them. Source: $help + $docs/obiconvert.] + +--- + +# OUTPUT + +[FILL: output format and what fields/attributes are added or changed. Source: $help + $docs. +If the default output is JSON, state it clearly. If YAML, state it clearly. Do not assume.] + +--- + +# OPTIONS + +[FILL: one subsection per thematic group found in $help. +For each flag: +- Flag name(s): long form + short form if it exists. Source: $help exactly. +- Default: state it. Source: $help exactly. If absent from $help: write "none". +- Meaning: explain the biological or practical purpose, not just the mechanical action. +- Do NOT include any flag not present in $help.] + +--- + +# EXAMPLES + +[FILL: at least four copy-pasteable examples. +Each example: +1. A one-line comment explaining the biological use case. +2. The command inside a code block. +3. MUST include output redirection (`> out.fasta`) or `-o` flag so the user can reproduce the output. +4. Use consistent file names across examples (same input file for different options). +5. COVERAGE RULE: every command-specific option documented in the OPTIONS section MUST + appear in at least one example. If needed, add extra examples to achieve full coverage. +Source for flags and options: $help only.] + +--- + +# SEE ALSO + +[FILL: related obi commands mentioned in $docs or $help. If none: omit section.] + +--- + +# NOTES + +[FILL: caveats, performance notes, known limitations. Source: $docs only. +If none: omit section.] 
+``` + +Emit the Write call: +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md", "content": "..."} + +``` + +**Stop.** Proceed to PHASE 2. + +--- + +## PHASE 2 — Test examples and enrich documentation + +(Equivalent to prompt_examples.md) + +### DOCUMENT PRESERVATION — critical + +The output of STATE 5 is `$doc` with **surgical edits only**. The rules are: + +- Copy the ENTIRE content of `$doc` verbatim into the new file. +- Apply ONLY the three modifications described in STATE 5 (EXAMPLES update, prose corrections, OUTPUT subsection addition). +- Do NOT reformat, reorder, rewrite, or restructure any heading, paragraph, option list, or prose from `$doc` **unless it is factually contradicted by actual execution results**. +- Do NOT add new top-level sections (no ENVIRONMENT VARIABLES, no duplicate OUTPUT, etc.). +- Do NOT change section title casing, Markdown heading levels, or list syntax. +- If in doubt, leave the section exactly as it appears in `$doc`. + +--- + +### Prerequisites — FASTQ FORMAT + +A valid FASTQ record is **exactly 4 lines** in this order: + +``` +@ + ← MUST be non-empty (≥ 10 characters, A/T/G/C only) ++ + ← MUST be the exact same length as the sequence line +``` + +Common mistakes **forbidden**: +- Writing `@header\n+\nquality` with the sequence line missing. +- Writing a quality string shorter or longer than the sequence. +- Mixing `>` (FASTA) and `@` (FASTQ) headers in the same file. +- Writing `~`-separated fields (e.g. `@seq002~description`) — use a space. +- Writing a quality string containing characters outside printable ASCII (33–126). + +--- + +### OUTPUT FORMAT GUARD + +OBITools4 determines the output format from **data content and explicit flags**, NOT from filename extension. + +- If the example is meant to produce FASTA output from FASTQ input, the command MUST include `--fasta-output`. 
+- If the example is meant to produce FASTQ output from FASTA input, the command MUST include `--fastq-output`. +- Never assume an output format from the filename alone. +- Verify the actual format of each output file by checking its first character: `>` = FASTA, `@` = FASTQ, `[`/`{` = JSON. +- If the format is wrong, add the missing flag, update `$cmds_doc` and `$cmds_run`, and re-run. + +--- + +### OPTION VALIDATION GUARD + +Before writing any example command in STATE 2, explicitly cross-check each option against the OPTIONS section of `$doc`: + +- Every flag used must appear in the OPTIONS section with the claimed semantics. +- Input-format flags (`--fasta`, `--fastq`, `--csv`, `--genbank`, `--embl`, `--ecopcr`) tell the tool how to **read** input — they do NOT affect output format. +- Output-format flags (`--fasta-output`, `--fastq-output`, `--json-output`) control **write** format. +- If an option needed for a working example is absent from `$doc`, mark that example as SKIP rather than inventing a flag. 
+ +--- + +### ANNOTATION RULES — CRITICAL + +When creating FASTA/FASTQ files with annotations: +- Use **only** valid annotation attribute names: `taxid`, `scientific_name`, `rank`, `definition`, `sample`, `run_id`, `instrument` +- For taxonomy data: use `taxid` (NCBI Taxonomy ID) and `scientific_name` — never invent taxids +- Examples of valid taxonomy annotations: + - `>seq001 {"taxid":2}` — Bacteria (valid NCBI taxid) + - `>seq002 {"taxid":2157,"scientific_name":"Archaea"}` — Archaea (valid NCBI taxid) + - `>seq003 {"taxid":2759,"scientific_name":"Eukaryota"}` — Eukaryota (valid NCBI taxid) +- NEVER use invented taxids +- **Map attributes** (JSON maps) must have names ending with `_merged` (e.g., `taxid_merged`, `sample_merged`) + +--- + +### CSV FILES FOR JOINS + +When creating CSV files for `obijoin`: +- Do NOT include the ID column in the CSV (the join key is specified separately via `--by`) +- The CSV format is auto-detected; do NOT use `--csv` flag +- Example CSV structure for taxid join: + ``` + taxid,scientific_name,phylum + 2,Bacteria,Proteobacteria + 2157,Archaea,Euryarchaeota + 2759,Eukaryota,Arthropoda + ``` +- Example command: `obijoin --join-with taxonomy.csv --by taxid sequences.fasta` + +--- + +### STATE 1 — Read the documentation file + +**Input:** nothing. +**Action:** emit a single Read call. + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md"} + +``` + +**Output:** store content as `$doc`. +**Stop.** Do not interpret or summarise. Proceed to STATE 2. + +--- + +### STATE 2 — Analyse examples and design input files + +**Input:** `$doc`. +**Action (no tool calls):** + +1. Extract every example command from the EXAMPLES section of `$doc`. + - Identify every distinct input filename referenced. + - Identify every option used and verify each against the OPTIONS section (OPTION VALIDATION GUARD). 
+   - **Skip any example that requires an external resource** (taxonomy database, remote URL, pre-existing output file). Mark it SKIP — it will be kept verbatim in the final doc without a `**Expected output:**` annotation.
+   - **`--paired-with` examples:** `--paired-with` requires `--out` (standard output cannot be used). The command produces TWO output files `<stem>_R1.ext` and `<stem>_R2.ext`, where `<stem>` is the stem of the value given to `--out`. Do NOT use `>` redirection for these. In STATE 4, read both `_R1` and `_R2` files.
+
+2. **Coverage check — command-specific options:**
+   From the OPTIONS section of `$doc`, list all command-specific options (excluding those covered by standard option-sets: input, output, common — see Phase 3 STATE 2 item 8 for the full list).
+   Verify that every such option appears in at least one non-skipped example.
+   If any option is not covered, **add an additional example** that exercises it before proceeding.
+
+3. For each distinct input filename, design synthetic sequence content that:
+   - Is **minimal** (≤ 20 sequences, each ≤ 300 bp).
+   - Contains sequences that **will** produce output (positive cases) AND at least one that **will not** produce output (negative case), when the command filters sequences.
+   - Exercises every option combination present in the non-skipped examples.
+   - Uses realistic identifiers (`seq001`, `seq002`, …).
+
+4. **File format rules (strictly enforced):**
+   - **FASTA:** `>id description` header, then sequence on one or more lines (60 bp per line), ≥ 10 bp, A/T/G/C only.
+   - **FASTQ:** exactly 4 lines per record. Before finalising, mentally verify each record:
+     - Line 1 starts with `@`, has an identifier, optionally a space and description.
+     - Line 2 is the nucleotide sequence (≥ 10 characters, A/T/G/C only).
+     - Line 3 is exactly `+`.
+     - Line 4 has **exactly the same number of characters** as line 2.
+     If any record fails this check, fix it before proceeding.
+
+5. 
Rewrite every non-skipped example command into two forms: + - `$cmds_doc`: the bare command as it will appear in documentation — filenames only, **no `cd` prefix**. + - `$cmds_run`: the same command prefixed with `cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx} &&` + +**Output:** store file designs as `$files`, `$cmds_doc`, `$cmds_run`. +**Stop.** Proceed to STATE 3. + +--- + +### STATE 3 — Write input files, validate, and run examples + +**Step 3a — create input files (parallel):** +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/FILENAME", "content": "..."} + +``` + +**Step 3b — validate input files:** +``` + +{"command": "cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx} && python3 -c \"\nimport sys\nfor fname in $(echo FILENAMES):\n lines = open(fname).readlines()\n if fname.endswith('.fastq'):\n assert len(lines) % 4 == 0, f'{fname}: line count not multiple of 4'\n for i in range(0, len(lines), 4):\n hdr, seq, plus, qual = lines[i:i+4]\n assert hdr.startswith('@'), f'{fname} record {i//4+1}: header must start with @'\n seq = seq.rstrip(); qual = qual.rstrip()\n assert len(seq) >= 10, f'{fname} record {i//4+1}: sequence too short ({len(seq)})'\n assert len(seq) == len(qual), f'{fname} record {i//4+1}: seq len {len(seq)} != qual len {len(qual)}'\n elif fname.endswith('.fasta') or fname.endswith('.fa'):\n assert lines[0].startswith('>'), f'{fname}: first line must start with >'\n seq_len = sum(len(l.rstrip()) for l in lines[1:] if not l.startswith('>'))\n assert seq_len >= 10, f'{fname}: total sequence length too short ({seq_len})'\nprint('All input files valid')\n\" 2>&1; echo EXIT:$?"} + +``` + +If validation fails (EXIT non-zero): fix the offending file(s) and re-run. Do NOT proceed until validation passes. 
+ +**Step 3c — run examples (sequential, one Bash call at a time):** + +For each non-skipped example, emit the run command, wait for the result, then immediately verify. + +Run: +``` + +{"command": "cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx} && COMMAND 2>&1; echo EXIT:$?"} + +``` + +After each EXIT:0, verify the output file exists and is non-empty: +``` + +{"command": "ls -la /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/OUTPUT_FILE && head -c 200 /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/OUTPUT_FILE"} + +``` + +Check output format from the first character: `>` = FASTA, `@` = FASTQ, `[`/`{` = JSON. +If the format does not match expectation, add the missing `--fasta-output` / `--fastq-output` / `--json-output` flag, update `$cmds_doc` and `$cmds_run`, and re-run. + +If a command fails (EXIT non-zero): diagnose, fix, update `$cmds_doc` and `$cmds_run`, and re-run. +Do NOT proceed to STATE 4 until all non-skipped commands have EXIT:0 and verified non-empty output files. + +**Output:** store per-command results as `$runs`. +**Stop.** Proceed to STATE 4. + +--- + +### STATE 4 — Read output files + +**Input:** `$runs` (output file paths from STATE 3). +**Action:** emit one Read call per output file successfully produced (EXIT:0), all in a single parallel message. + +Do NOT re-run commands — read only files already generated in STATE 3. + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/OUTPUT_FILE"} + +``` + +**Output:** store contents as `$outputs`. +**Stop.** Proceed to STATE 5. + +--- + +### STATE 5 — Update the documentation file + +**Input:** `$doc`, `$runs`, `$outputs`, `$cmds_doc`. + +Re-read DOCUMENT PRESERVATION at the top before writing. Apply ONLY these modifications: + +#### Modification 1 — EXAMPLES section + +For each non-skipped example: +- Replace the original command with the `$cmds_doc` form (no `cd` prefix). 
+- Keep the one-line biological use-case comment unchanged.
+- Add `**Expected output:**` on its own line **after** the closing triple-backtick of the code block:
+  ```
+  **Expected output:** N sequences written to `out_name.fasta`.
+  ```
+  where N = number of lines starting with `>` or `@` in the corresponding `$outputs` entry.
+
+For skipped examples: keep them exactly as they are in `$doc`, no annotation added.
+
+#### Modification 2 — Prose corrections (if any factual contradiction)
+
+Fix only specific sentences that are contradicted by actual execution results. Examples of things to correct:
+- An attribute name that differs from actual output.
+- An output format claimed that differs from the actual format observed.
+- A default value stated incorrectly.
+- A claim about which sequences are selected/discarded that contradicts observed results.
+
+After each corrected passage, add: `<!-- corrected: brief note of what actual execution showed -->`
+Do NOT "improve" text that is merely incomplete — only fix outright contradictions.
+
+#### Modification 3 — OUTPUT section
+
+Find the `# OUTPUT` section and append at its very end (before the next `---` or `#`):
+
+```markdown
+## Observed output example
+
+```
+<excerpt of the first complete records from one representative file in $outputs>
+```
+```
+
+Rules:
+- The excerpt is copied byte-for-byte from `$outputs`. No editing, no truncation within a record.
+- Do NOT duplicate the OUTPUT section. There must be exactly one `# OUTPUT` heading.
+- If no output was successfully produced, omit this subsection entirely.
+
+```
+
+{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md", "content": "..."}
+
+```
+
+**Stop.** Proceed to PHASE 3.
+
+---
+
+## PHASE 3 — Generate Hugo documentation
+
+(Equivalent to prompt_hugo.md)
+
+### HUGO SHORTCODE REFERENCE
+
+Use only the shortcodes listed below. Never invent others. 
+ +| Shortcode | Syntax | Effect | +|-----------|--------|--------| +| Command link | `{{< obi obi{xxx} >}}` | Renders command name as internal link | +| Format name | `{{% fasta %}}` `{{% fastq %}}` `{{% csv %}}` `{{% json %}}` `{{% yaml %}}` | Renders format name (use in prose) | +| Suite name | `{{% obitools4 %}}` | Renders "OBITools4" as styled text | +| Embed data file | `{{< code "FILENAME" FORMAT true >}}` | Embeds file content; FORMAT = `fasta`, `fastq`, `txt`, `csv`, `json`, `yaml` | +| Standard option set | `{{< option-sets/input >}}` | Renders shared input-format options | +| Standard option set | `{{< option-sets/output >}}` | Renders shared output-format options | +| Standard option set | `{{< option-sets/common >}}` | Renders shared performance/logging options | +| Standard option set | `{{< option-sets/selection >}}` | Renders shared sequence-selection options (obigrep only) | +| Single shared option | `{{< cmd-options/paired-with >}}` | Renders `--paired-with` option | +| Custom option block | `{{< cmd-option name="NAME" short="S" param="PARAM" >}}` text `{{< /cmd-option >}}` | Renders a command-specific option; `short` and `param` are optional | +| Workflow diagram | `{{< mermaid class="workflow" >}}` … `{{< /mermaid >}}` | Renders Mermaid flowchart | + +--- + +### SECTION STRUCTURE OF A HUGO COMMAND PAGE + +```markdown +--- +archetype: "command" +title: "obi{xxx}" +date: YYYY-MM-DD +command: "obi{xxx}" +category: +url: "/obitools/obi{xxx}" +weight: +--- + +# `obi{xxx}`: + +> [!WARNING] Preliminary AI-generated documentation +> This page was automatically generated by an AI assistant and has **not yet been +> reviewed or validated** by the {{% obitools4 %}} development team. It may contain +> inaccuracies or incomplete information. Use with caution and refer to the command's +> `--help` output for authoritative option descriptions. 
+ +## Description + +}} and {{% format %}} shortcodes> + +}} shortcodes and paired command+output blocks> + +## Synopsis + +```bash +obi{xxx} [--option1] [--option2|-s PARAM] ... [] +``` + +## Options + +#### {{< obi obi{xxx} >}} specific options + +- {{< cmd-option name="NAME" short="S" param="PARAM" >}} + Description of option. + {{< /cmd-option >}} + +#### Taxonomic options ← include only if command uses taxonomy + +- {{< cmd-options/taxonomy/taxonomy >}} + +{{< option-sets/input >}} + +{{< option-sets/output >}} ← omit if command has no output (e.g. obicount) + +{{< option-sets/common >}} + +## Examples + +... + +```bash +obi{xxx} --help +``` +``` + +**YAML front matter fields:** +- `archetype`: always `"command"`. +- `title` and `command`: the command name. +- `date`: today's date in `YYYY-MM-DD` format. +- `category`: the subdirectory name under `commands/`. +- `url`: always `/obitools/obi{xxx}`. +- `weight`: copy from the existing `_index.md` if it exists; otherwise use `50`. + +--- + +### STATE 1 — Read source material (parallel) + +**Input:** nothing. +**Action:** emit all of the following calls in a single parallel message. + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md"} + + +{"command": "ls /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/ 2>/dev/null || echo NO_EXAMPLES"} + + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands/basics/obi{xxx}/_index.md"} + + +{"command": "find /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands -type d -name 'obi{xxx}'"} + +``` + +**Output:** store as `$doc`, `$examples_list`, `$existing_hugo`, `$category_path`. +**Stop.** Do not interpret. Proceed to STATE 2. + +--- + +### STATE 2 — Determine category and plan content + +**Input:** `$doc`, `$examples_list`, `$existing_hugo`, `$category_path`. + +1. 
**Category:** + - If `$category_path` found a directory, extract the category name (segment between `commands/` and `obi{xxx}`). + - If `$existing_hugo` contains `category:`, use that value. + - Otherwise, default to `basics`. + +2. **Weight:** + - If `$existing_hugo` contains `weight:`, reuse that value. + - Otherwise, use `50`. + +3. **Example files to copy:** input files AND output files from the working examples. Skip compressed files (`.gz`). + +4. **Naming convention for output files:** Use simple names like `out.json`, `out.yaml`, `out.fasta`, `out.fastq` — NOT `out_json.json` or similar. + +5. **Command-output file consistency:** Each example command MUST produce the file shown below it. Verify that the flag in the command creates the displayed file. + +6. **Plan replacements:** + - `obi{xxx}` → `{{< obi obi{xxx} >}}` + - Format names in prose → `{{% fasta %}}`, etc. + - Input filenames → `{{< code "FILENAME" FORMAT true >}}` + +7. **Workflow diagram consistency:** The Mermaid diagram MUST use the exact same files as the first working example. + +8. **Options section plan — standard option-sets coverage:** + The following options are covered by standard option-sets and must NOT be re-documented: + - `{{< option-sets/input >}}`: `--fasta`, `--fastq`, `--embl`, `--genbank`, `--ecopcr`, `--csv`, `--input-OBI-header`, `--input-json-header`, `--u-to-t`, `--solexa`, `--skip-empty`, `--no-order`. + - `{{< option-sets/output >}}`: `--fasta-output`, `--fastq-output`, `--json-output`, `--output-OBI-header`/`-O`, `--output-json-header`, `--out`/`-o`, `--compress`/`-Z`. + - `{{< option-sets/common >}}`: `--max-cpu`, `--batch-size`, `--batch-size-max`, `--batch-mem`, `--no-progressbar`, `--debug`, `--silent-warning`, `--pprof`, `--pprof-goroutine`, `--pprof-mutex`, `--version`, `--help`. + - `--paired-with` → `{{< cmd-options/paired-with >}}`. 
+ - Taxonomy options (`--taxonomy`, `--fail-on-taxonomy`, `--raw-taxid`, `--update-taxid`, `--with-leaves`) → grouped under `#### Taxonomic options` with `{{< cmd-options/taxonomy/taxonomy >}}` for `--taxonomy`; document the rest with `{{< cmd-option >}}` blocks. + - All remaining command-specific options → `{{< cmd-option >}}` blocks under `#### {{< obi obi{xxx} >}} specific options`. + +9. **Examples:** keep only examples whose input files exist in `$examples_list`. Add final `obi{xxx} --help`. + +10. **Remove duplicates:** If the same files appear in both Description and Examples sections, keep only the first occurrence. + +**Output:** store as `$plan`. +**Stop.** Proceed to STATE 3. + +--- + +### STATE 3 — Read example input files (parallel) + +Emit one Read call per file to be used in the Hugo page (both input and output files). Do not read compressed files (`.gz`). + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/FILENAME"} + +``` + +**Output:** store file contents as `$input_files`. +**Stop.** Proceed to STATE 4. + +--- + +### STATE 4 — Write Hugo files (parallel) + +**Step 4a — write `_index.md`** following SECTION STRUCTURE and CONTENT RULES below. + +**CRITICAL:** The Synopsis section MUST use the **full verbatim** synopsis from `$doc`, not a simplified version. 
+ +**Step 4b — copy data files (parallel with 4a):** +``` + +{"command": "cp /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/*.fasta /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/ 2>/dev/null || true"} + + +{"command": "cp /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/*.fastq /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/ 2>/dev/null || true"} + + +{"command": "cp /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/*.csv /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/ 2>/dev/null || true"} + + +{"command": "cp /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/*.json /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/ 2>/dev/null || true"} + + +{"command": "cp /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/*.yaml /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/ 2>/dev/null || true"} + +``` + +Skip compressed files (`.gz`). + +**Step 4c — delete unused files from Hugo directory:** +``` + +{"command": "ls -la /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/"} + +``` +Compare with `$examples_list` and remove any files not present in the working examples. + +--- + +## CONTENT RULES (apply throughout Phase 3 STATE 4) + +### Workflow diagram + +Place in the Description section, after introductory prose and before the first `{{< code >}}` block. + +``` +{{< mermaid class="workflow" >}} +graph TD + A@{ shape: doc, label: "input_file.fastq" } + C[obi{xxx}] + D@{ shape: doc, label: "output_file.fasta" } + A --> C:::obitools + C --> D + classDef obitools fill:#99d57c +{{< /mermaid >}} +``` + +Rules: +- One `@{ shape: doc, label: "FILENAME" }` node per input file; use filenames from the first example. 
+- One output node for the output file of the same example. +- Apply `:::obitools` on the **last arrow pointing to the command node**, not on the node definition line. +- `classDef obitools fill:#99d57c` must always be the last line inside the block. +- If the command produces no file output (prints to stdout only), use a terminal node `D([stdout])`. + +### Description section + +Narrative prose teaching the reader how the command works, one concept at a time. These are NOT the same examples as in the Examples section — simpler, focused on a single behaviour, chosen to clarify specific options or edge cases. + +- Write flowing paragraphs, not bullet lists of options. +- Explain **why** a biologist would use the command and **what** it does. +- Introduce data files with `{{< code "FILENAME" FORMAT true >}}` before the first command that uses them. +- Show example commands and their output in **paired** fenced blocks — no `**Expected output:**` label: + ````markdown + ```bash + obi{xxx} [options] input_file + ``` + ``` + + ``` + ```` +- Replace tool name in prose with `{{< obi obi{xxx} >}}`. +- Replace format names in prose with `{{% fasta %}}`, `{{% fastq %}}`, etc. +- **Do NOT reuse** examples from the Examples section verbatim. Description examples are simpler and pedagogical. + +### Synopsis section + +Use the synopsis from `$doc` verbatim. Wrap in a `bash` fenced code block. + +### Options section + +- Only document options **not** covered by `{{< option-sets/... >}}` (see STATE 2 item 8). +- Group under `#### {{< obi obi{xxx} >}} specific options`, then `#### Taxonomic options` if applicable, then the three `{{< option-sets/... >}}`. + +### Examples section + +Practical, real-world recipes. Each example addresses a distinct use case not already shown in Description. + +- **Never duplicate** an example from the Description section. +- Every example that produces sequence or annotation file output uses this pattern: + 1. 
Short intro paragraph (2–4 sentences) with a Markdown hyperlink to the input file, e.g. `The file [input.fasta](input.fasta) contains …`. + 2. `{{< code "input_file" FORMAT true >}}` + 3. `bash` fenced block with `-o out_name.ext` (never `>` redirection for non-paired examples). + 4. `{{< code "out_name.ext" FORMAT true >}}` +- **`--paired-with` examples:** use `--out .fastq` (not `>`), show both output files: + ````markdown + ```bash + obi{xxx} --paired-with reverse.fastq --out out_paired.fastq forward.fastq + ``` + {{< code "out_paired_R1.fastq" fastq true >}} + {{< code "out_paired_R2.fastq" fastq true >}} + ```` + Both `_R1` and `_R2` files must be copied to the Hugo command directory (Step 4b). +- **CSV output:** pipe through `csvlook`, no file redirection, no `{{< code >}}`: + ````markdown + ```bash + obi{xxx} [options] input_file | csvlook + ``` + ``` + | col1 | col2 | + | ---- | ---- | + | val1 | val2 | + ``` + ```` +- Last example always: `` ```bash\nobi{xxx} --help\n``` `` (no output block). +- Never inline file content as raw fenced blocks — always use `{{< code >}}`. +- Output files must be copied to the Hugo command directory alongside input files (Step 4b). diff --git a/autodoc/prompt_hugo.md b/autodoc/prompt_hugo.md new file mode 100644 index 0000000..2f71d5b --- /dev/null +++ b/autodoc/prompt_hugo.md @@ -0,0 +1,414 @@ +# Task + +Convert `autodoc/cmd/obi{xxx}.md` and `autodoc/examples/obi{xxx}/` into a Hugo +documentation page at +`/Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/`. + +The Hugo site uses the **Book** theme with custom shortcodes specific to OBITools4. +Every rule below is derived from reading existing pages — do not invent shortcodes or +patterns not listed here. + +--- + +## TOOL CALL FORMAT — enforce before every call + +A tool call is exactly: + + + {"param": "value"} + + +Rules: +- `<` immediately followed by `f` — zero spaces. +- Parameters are a **single JSON object** — no XML wrapper tags. 
+- No outer `` or `` wrapper. +- Tool name lowercase with double underscores. + +--- + +## HUGO SHORTCODE REFERENCE + +Use only the shortcodes listed below. Never invent others. + +| Shortcode | Syntax | Effect | +|-----------|--------|--------| +| Command link | `{{< obi obi{xxx} >}}` | Renders command name as internal link | +| Format name | `{{% fasta %}}` `{{% fastq %}}` `{{% csv %}}` `{{% json %}}` `{{% yaml %}}` | Renders format name (use in prose) | +| Suite name | `{{% obitools4 %}}` | Renders "OBITools4" as styled text | +| Embed data file | `{{< code "FILENAME" FORMAT true >}}` | Embeds file content in page; FORMAT = `fasta`, `fastq`, `txt`, `csv`, `json`, `yaml` | +| Standard option set | `{{< option-sets/input >}}` | Renders the shared input-format options block | +| Standard option set | `{{< option-sets/output >}}` | Renders the shared output-format options block | +| Standard option set | `{{< option-sets/common >}}` | Renders the shared performance/logging options block | +| Standard option set | `{{< option-sets/selection >}}` | Renders the shared sequence-selection options block (obigrep only) | +| Single shared option | `{{< cmd-options/paired-with >}}` | Renders the `--paired-with` option description | +| Custom option block | `{{< cmd-option name="NAME" short="S" param="PARAM" >}}` text `{{< /cmd-option >}}` | Renders a command-specific option; `short` and `param` are optional | +| Workflow diagram | `{{< mermaid class="workflow" >}}` … `{{< /mermaid >}}` | Renders a Mermaid flowchart showing command inputs and outputs | + +--- + +## SECTION STRUCTURE OF A HUGO COMMAND PAGE + +```markdown +--- ← YAML front matter (see below) +... +--- + +# `obi{xxx}`: + +> [!WARNING] Preliminary AI-generated documentation +> This page was automatically generated by an AI assistant and has **not yet been +> reviewed or validated** by the {{% obitools4 %}} development team. It may contain +> inaccuracies or incomplete information. 
Use with caution and refer to the command's +> `--help` output for authoritative option descriptions. + +## Description + +}} and {{% format %}} shortcodes> + + + +}} shortcodes> + + +## Synopsis + +```bash +obi{xxx} [--option1] [--option2|-s PARAM] ... [] +``` + +## Options + +#### {{< obi obi{xxx} >}} specific options + +- {{< cmd-option name="NAME" short="S" param="PARAM" >}} + Description of option. + {{< /cmd-option >}} + +#### Taxonomic options ← include only if command uses taxonomy + +- {{< cmd-options/taxonomy/taxonomy >}} + +{{< option-sets/input >}} + +{{< option-sets/output >}} ← omit if command has no output (e.g. obicount) + +{{< option-sets/common >}} + +## Examples + +```bash +obi{xxx} --help +``` +``` + +--- + +## YAML FRONT MATTER TEMPLATE + +```yaml +--- +archetype: "command" +title: "obi{xxx}" +date: +command: "obi{xxx}" +category: +url: "/obitools/obi{xxx}" +weight: +--- +``` + +Fields: +- `archetype`: always `"command"`. +- `title`: the command name, e.g. `"obigrep"`. +- `date`: today's date in `YYYY-MM-DD` format. +- `command`: same as title. +- `category`: the subdirectory name under `commands/` (see STATE 1). +- `url`: always `/obitools/obi{xxx}`. +- `weight`: copy the value from the **existing** `_index.md` if the page already exists; + otherwise use `50`. + +--- + +## PIPELINE + +Execute the five states below in order. Do not skip states. Do not merge states. + +--- + +### STATE 1 — Read source material (parallel) + +**Input:** nothing. +**Action:** emit all of the following calls in a single parallel message. 
+ +Call 1 — read the autodoc file: +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md"} + +``` + +Call 2 — list example files: +``` + +{"command": "ls /Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/ 2>/dev/null || echo NO_EXAMPLES"} + +``` + +Call 3 — read the existing Hugo page (may not exist yet): +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands/basics/obi{xxx}/_index.md"} + +``` + +Call 4 — list the Hugo commands directory to find the right category: +``` + +{"command": "find /Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands -type d -name 'obi{xxx}'"} + +``` + +**Output:** store results as `$doc`, `$examples_list`, `$existing_hugo`, `$category_path`. +**Stop.** Do not interpret. Proceed to STATE 2. + +--- + +### STATE 2 — Determine category and plan content (no tool calls) + +**Input:** `$doc`, `$examples_list`, `$existing_hugo`, `$category_path`. + +1. **Category:** + - If `$category_path` found a directory, extract the category name from the path + (segment between `commands/` and `obi{xxx}`). + - If `$existing_hugo` exists and contains `category:`, use that value. + - Otherwise, default to `basics`. + +2. **Weight:** + - If `$existing_hugo` contains `weight:`, reuse that value. + - Otherwise, use `50`. + +3. **Example files to copy:** + - From `$examples_list`, keep only files that are referenced in the EXAMPLES section + of `$doc` **as input files** (not output files that start with `out_`). + - Identify the format of each file from its extension: + `.fasta` / `.fa` → `fasta`, `.fastq` / `.fq` → `fastq`, + `.txt` → `txt`, `.csv` → `csv`, `.gz` → skip (do not embed compressed files). + +4. **Description section plan:** + - Extract the DESCRIPTION content from `$doc`. + - Identify every occurrence of `obi{xxx}` and plan to replace with `{{< obi obi{xxx} >}}`. 
+ - Identify format names (`FASTA`, `FASTQ`, `JSON`, `CSV`) and plan to replace with + `{{% fasta %}}`, `{{% fastq %}}`, etc. in flowing prose (not in code blocks). + - Identify every input filename used in examples and plan to show with + `{{< code "FILENAME" FORMAT true >}}` just before the first command that uses it. + +5. **Options section plan:** + - List options that are command-specific (not covered by standard option-sets). + - The standard option-sets cover: + - `{{< option-sets/input >}}`: all `--fasta`, `--fastq`, `--embl`, `--genbank`, + `--ecopcr`, `--csv`, `--input-OBI-header`, `--input-json-header`, `--u-to-t`, + `--solexa`, `--skip-empty`, `--no-order` flags. + - `{{< option-sets/output >}}`: all `--fasta-output`, `--fastq-output`, + `--json-output`, `--output-OBI-header`, `--output-json-header`, `--out`/`-o`, + `--compress`/`-Z` flags. + - `{{< option-sets/common >}}`: all `--max-cpu`, `--batch-size`, `--batch-size-max`, + `--batch-mem`, `--no-progressbar`, `--debug`, `--verbose`, `--silent-warning`, + `--pprof`, `--pprof-goroutine`, `--pprof-mutex`, `--version`, `--help` flags. + - Do NOT re-document options already covered by a standard option-set. + - `--paired-with` → use `{{< cmd-options/paired-with >}}`. + - Taxonomy options (`--taxonomy`, `--restrict-to-taxon`, `--ignore-taxon`, etc.) + → grouped under `#### Taxonomic options` with + `{{< cmd-options/taxonomy/taxonomy >}}` for `--taxonomy`; + document the rest with `{{< cmd-option >}}` blocks. + - All remaining command-specific options → `{{< cmd-option >}}` blocks. + +6. **Examples section plan:** + - Keep only examples whose input files exist in `$examples_list` + (skip examples requiring external resources like taxonomy databases or URLs). + - For each kept example, identify the corresponding output file in `$examples_list` + (typically `out_.fasta`, `out_.fastq`, etc.). + - Plan to read every identified output file in STATE 3. 
+ - Always add a final example: `` ```bash\nobi{xxx} --help\n``` `` + +**Output:** store the plan as `$plan`. +**Stop.** Proceed to STATE 3. + +--- + +### STATE 3 — Read example input files (parallel) + +**Input:** `$plan` (list of input files to embed and output files to show in examples). +**Action:** emit one Read call per file to be used in the Hugo page — both input files +(to embed with `{{< code >}}`) and output files (to show as example results). + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/examples/obi{xxx}/FILENAME"} + +``` + +Do **not** read compressed files (`.gz`). + +**Output:** store file contents as `$input_files`. +**Stop.** Proceed to STATE 4. + +--- + +### STATE 4 — Write Hugo files (parallel) + +**Input:** `$doc`, `$plan`, `$input_files`. + +**Step 4a — write the Hugo `_index.md`:** + +Compose the Hugo page following the SECTION STRUCTURE and YAML FRONT MATTER templates +above, applying the plan from STATE 2. Then emit: + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/_index.md", + "content": "..."} + +``` + +**Step 4b — copy data files (parallel with 4a):** + +For each file in `$input_files` and `$output_files`, emit a Write call to place the +file in the Hugo command directory: + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4-doc/content/docs/commands//obi{xxx}/FILENAME", + "content": ""} + +``` + +Emit the `_index.md` write and all data file writes in a **single parallel message**. + +**Stop.** Do not emit any text after the Write calls. + +--- + +## CONTENT RULES (apply throughout STATE 4) + +### Workflow diagram + +Place the diagram in the Description section, after the introductory prose and before +the first `{{< code >}}` block. It represents the main use case of the command. 
+ +``` +{{< mermaid class="workflow" >}} +graph TD + A@{ shape: doc, label: "input_file.fastq" } + C[obi{xxx}] + D@{ shape: doc, label: "output_file.fasta" } + A --> C:::obitools + C --> D + classDef obitools fill:#99d57c +{{< /mermaid >}} +``` + +Rules: +- One `@{ shape: doc, label: "FILENAME" }` node per input file; use the actual + filenames from the main example (first example in the Examples section). +- One `@{ shape: doc, label: "FILENAME" }` node for the output file, using the + actual output filename from the same example. +- The command node uses `[obi{xxx}]` (square brackets = rounded rectangle). +- Apply `:::obitools` on the **last arrow pointing to the command node**, not on + the node definition line itself. +- `classDef obitools fill:#99d57c` must always be the last line inside the block. +- If the command produces no file output (e.g. prints to stdout only), use a + terminal node `D([stdout])` instead of a doc node. + +### Description section + +The Description section serves an **explanatory** purpose: it teaches the reader how the +command works by walking through a few illustrative cases, one option or concept at a time. +These are NOT the same examples as in the Examples section — they are simpler, focused on +a single behaviour, and chosen to clarify specific options or edge cases. + +- Write narrative prose, not a bullet list of options. +- Explain **why** a biologist would use the command and **what** it does to sequences. +- Introduce data files with `{{< code "FILENAME" FORMAT true >}}` before the first + command that uses them. +- Show example commands and their output in **paired** fenced blocks: + ````markdown + ```bash + obi{xxx} [options] input_file + ``` + ``` + + ``` + ```` +- Do **not** use `**Expected output:**` labels — output goes directly in the second code block. +- Replace tool name occurrences in prose with `{{< obi obi{xxx} >}}`. +- Replace format names in prose with `{{% fasta %}}`, `{{% fastq %}}`, etc. 
+- **Do NOT reuse** examples from the Examples section verbatim. The Description examples + are simpler, pedagogical, focused on one concept; the Examples section examples are + richer, more realistic cookbook recipes. + +### Synopsis section + +- Use the synopsis from `$doc` verbatim, or reconstruct from the OPTIONS section if + the synopsis in `$doc` is incomplete. +- Wrap in a `bash` fenced code block. + +### Options section + +- Only document options **not** covered by `{{< option-sets/... >}}`. +- Use `{{< cmd-option >}}` blocks for each command-specific option. +- Group under `#### {{< obi obi{xxx} >}} specific options`, then + `#### Taxonomic options` if applicable, then the three `{{< option-sets/... >}}`. + +### Examples section + +The Examples section serves a **cookbook** purpose: it shows practical, real-world +recipes that a biologist might want to run directly. Each example should address a +distinct use case that goes beyond the introductory illustrations already shown in the +Description section. Examples here may combine multiple options, use realistic file +names, and demonstrate more complex pipelines. + +**Never duplicate** an example that already appears in the Description section — choose +different scenarios, different option combinations, or more complete workflows. + +- When a command produces **CSV output**, pipe it through `csvlook` for readable + display. Do not redirect to a file in that case — show the result inline: + ````markdown + ```bash + obi{xxx} [options] input_file | csvlook + ``` + ``` + | col1 | col2 | + | ---- | ---- | + | val1 | val2 | + ``` + ```` + The output block contains the verbatim `csvlook` rendering (table with `|` borders). + No `{{< code >}}` shortcode is needed since there is no output file to download. + +- For **`--paired-with`** examples: the command uses `--out .fastq` (never `>` + redirection) and produces two files `_R1.fastq` and `_R2.fastq`. 
+ Show both output files with two consecutive `{{< code >}}` shortcodes: + ````markdown + ```bash + obi{xxx} --paired-with reverse.fastq --out out_paired.fastq forward.fastq + ``` + {{< code "out_paired_R1.fastq" fastq true >}} + {{< code "out_paired_R2.fastq" fastq true >}} + ```` + Both `_R1` and `_R2` files must be copied to the Hugo command directory (Step 4b). + +- Every example that produces **sequence or annotation file output** (non paired) uses the following pattern: + 1. A short introductory paragraph (2–4 sentences) that explains the biological + motivation for the example and includes a Markdown hyperlink to the input file, + e.g. `The file [input.fastq](input.fastq) contains …`. + 2. `{{< code "input_file" FORMAT true >}}` — shows the input file content and makes + it downloadable. + 3. A `bash` fenced block with the command writing to an output file + (use `-o out_name.fasta`, never `>` redirection for non-paired examples). + 4. Immediately after: `{{< code "out_name.fasta" FORMAT true >}}` — so the result is + rendered AND downloadable. +- The output files must be copied to the Hugo command directory alongside input files + (Step 4b), so the shortcode can find them. +- If no output file exists for an example, omit the `{{< code >}}` line entirely. +- Last example always: `obi{xxx} --help` (no output). +- Never inline file content as raw fenced blocks — always use `{{< code >}}`. diff --git a/autodoc/prompt_v2.md b/autodoc/prompt_v2.md new file mode 100644 index 0000000..b12e8bd --- /dev/null +++ b/autodoc/prompt_v2.md @@ -0,0 +1,230 @@ +# Task + +Produce `autodoc/cmd/obi{xxx}.md` — the documentation file for the `obi{xxx}` CLI command. + +--- + +## TOOL CALL FORMAT — enforce before every call + +A tool call is exactly: + + + {"param": "value"} + + +Rules (no exceptions): +- `<` is immediately followed by `f` — zero spaces, zero characters in between. +- Parameters are a **single JSON object** — no XML tags, no ``, no ``, no ``. 
+- No outer wrapper — never use ``, ``, or any other enclosing tag. +- Tool name is lowercase with double underscores — never ALL_CAPS, never single underscore between server and tool name. + +CORRECT: `` +WRONG: `< function=mcp__jcodemunch__get_file_outline >` ← spaces → parse error +WRONG: `` ← wrong separator +WRONG: `` ← wrong casing + +--- + +## HALLUCINATION GUARD — enforce before writing anything + +OBITools4 is a **complete rewrite**. Training data about OBITools v1/v2/v3 is wrong for this version. + +Before writing any sentence, apply this check: + +> "Can I point to the exact line in $help or $docs that justifies this claim?" +> If NO → do not write it. + +This applies to: option names, option flags, default values, file formats, behaviours, algorithms, output fields. +Omit rather than guess. A shorter correct page is better than a longer hallucinated one. + +--- + +## PIPELINE + +Execute the four states below in order. Do not skip states. Do not merge states. + +--- + +### STATE 1 — Gather raw data (parallel) + +**Input:** nothing. +**Action:** emit all of the following tool calls in a single message (parallel execution). 
+ +Call 1 — dependencies of the main entry point: +``` + +{"language": "go", "file_path": "cmd/obitools/obi{xxx}/main.go"} + +``` + +Call 2 — dependencies of every file in the command package (one call per file found by glob `pkg/obitools/obi{xxx}/*.go`): +``` + +{"language": "go", "file_path": "pkg/obitools/obi{xxx}/FILE.go"} + +``` + +Call 3 — symbol outline of the command package (single batch call): +``` + +{"repo": "git.metabarcoding.org/obitools/obitools4/obitools4", "file_paths": ["pkg/obitools/obi{xxx}/options.go", "pkg/obitools/obi{xxx}/obi{xxx}.go"]} + +``` + +Call 4 — CLI help text: +``` + +{"command": "cd /Users/coissac/Sync/travail/__MOI__/GO/obitools4 && obi{xxx} --help 2>&1"} + +``` + +Call 5 — list of already-documented OBITools4 commands (to inform SEE ALSO): +``` + +{"url": "https://obitools4.metabarcoding.org/obitools/"} + +``` + +**Output:** store results as `$deps`, `$outline`, `$help`, `$web_doc`. +`$web_doc` contains the index of documented commands — use it only to determine which +`obi*` commands have existing documentation pages, so that SEE ALSO only links to pages +that actually exist. Do NOT use it as a source for option names, flags, or defaults. +**Stop.** Do not interpret, summarise, or write anything. Proceed to STATE 2. + +--- + +### STATE 2 — Resolve documentation files + +**Input:** `$deps` (import paths from STATE 1). +**Action (no tool calls):** + +1. Collect every import path that starts with `git.metabarcoding.org/obitools/obitools4/obitools4/pkg/`. +2. Remove these infrastructure packages (already covered by the convert doc): + - `pkg/obidefault`, `pkg/obiiter`, `pkg/obiformats`, `pkg/obioptions`, `pkg/obiutils`, `pkg/obiparams` + - `pkg/obiseq` — keep only if `$outline` shows non-trivial sequence manipulation. +3. Always add: `pkg/obitools/obi{xxx}` and `pkg/obitools/obiconvert`. +4. 
Map each remaining package path to its doc file: + - take all path segments from `pkg/` onward (inclusive), replace `/` with `_` → `autodoc/docmd/.md` + - examples: `git.metabarcoding.org/.../pkg/obialign` → `autodoc/docmd/pkg_obialign.md` + - examples: `git.metabarcoding.org/.../pkg/obitools/obiuniq` → `autodoc/docmd/pkg_obitools_obiuniq.md` + +**Output:** store file list as `$docfiles`. +**Stop.** Proceed to STATE 3. + +--- + +### STATE 3 — Read documentation files (parallel) + +**Input:** `$docfiles`. +**Action:** emit one `Read` call per file in `$docfiles`, all in a single parallel message. + +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/DOCFILE"} + +``` + +Do NOT read any `.go` source file. +Do NOT produce any summary or analysis. + +**Output:** store all file contents as `$docs`. +**Stop.** Proceed to STATE 4. + +--- + +### STATE 4 — Write the documentation file + +**Input:** `$help`, `$docs`, `$outline`, `$web_doc`. +**Action:** fill the template below, then emit exactly one `Write` call. Then stop. + +**Source discipline:** every piece of information in the template MUST come from the labelled source. +If the source does not contain the information, write `_(not available)_` — never invent. +`$web_doc` (index of documented commands) is used exclusively to filter the SEE ALSO +section — only list commands that appear as documented pages in `$web_doc`. +It is NOT a source for option names, descriptions, or behaviour. + +--- + +#### Output template + +```markdown +# NAME + +obi{xxx} — [FILL: one-line description. Source: first line of $help] + +--- + +# SYNOPSIS + +[FILL: verbatim USAGE block from $help, inside a code block] + +--- + +# DESCRIPTION + +[FILL: 2–4 paragraphs explaining what the command does, why a biologist would use it, +and what it produces. Source: $help description section + $docs. +No jargon. No implementation details (goroutines, channels, GC, arena). +No options that belong in the OPTIONS section.] 
+ +--- + +# INPUT + +[FILL: accepted input formats and how to provide them. Source: $help + $docs/obiconvert.] + +--- + +# OUTPUT + +[FILL: output format and what fields/attributes are added or changed. Source: $help + $docs.] + +--- + +# OPTIONS + +[FILL: one subsection per thematic group found in $help. +For each flag: +- Flag name(s): long form + short form if it exists. Source: $help exactly. +- Default: state it. Source: $help exactly. If absent from $help: write "none". +- Meaning: explain the biological or practical purpose, not just the mechanical action. +- Do NOT include any flag not present in $help.] + +--- + +# EXAMPLES + +[FILL: at least four copy-pasteable examples. +Each example: +1. A one-line comment explaining the biological use case. +2. The command inside a code block. +3. COVERAGE RULE: every command-specific option documented in the OPTIONS section MUST + appear in at least one example. Add extra examples if needed to achieve full coverage. +Source for flags and options: $help only.] + +--- + +# SEE ALSO + +[FILL: related obi commands mentioned in $docs or $help, AND present in $web_doc +(i.e. commands that have an existing documentation page online). +If none qualify: omit section.] + +--- + +# NOTES + +[FILL: caveats, performance notes, known limitations. Source: $docs only. +If none: omit section.] +``` + +--- + +Emit the Write call: +``` + +{"file_path": "/Users/coissac/Sync/travail/__MOI__/GO/obitools4/autodoc/cmd/obi{xxx}.md", "content": "..."} + +``` + +**Stop. 
Do not emit any text after the Write call.** diff --git a/entities.json b/entities.json new file mode 100644 index 0000000..5ffdaa5 --- /dev/null +++ b/entities.json @@ -0,0 +1,10 @@ +{ + "people": [ + "Software", + "Agreement", + "Module" + ], + "projects": [ + "Code" + ] +} \ No newline at end of file diff --git a/obitests/obitools/.DS_Store b/obitests/obitools/.DS_Store index a6c2f8046b46e9e5e9f7751d4934e8001b77a04a..afb1c50b75c4de016d6aece20034baf33d3fcd62 100644 GIT binary patch literal 8196 zcmeHMTWl0n7(U;$&>1?=0g5cR6BY_V$O5HB+Hx`5n*ss_wx!!rSY~%dIxwACc6PTw zX>3e<;bwf&c=t&okp}}xe33*&^ik0SV|*ZLygcB8FDeiIGiR1SOP?eL;+*81|J?p_ z&i~K%o!NhuF^0CHzM8QJV@#y%Nu`>KyCiSi%M+3i^i+}r=`*G=ljSMf?7)QCVMpkJ z&;y|dLJx!<2t9B=c!1v7UJ@sH@AJ{H3_TEf;D6}>?|z6;_GBWEV_f=I2Niw@K++2V z{6uw%1ALuWAQOQcgA_L?Fkw&_t<<(Vc0n_3BkwDixNV4EmStCpv$y{W!_c_6LKDv=Rgxq|7~?rtORl1wW# zE*ZNy>)1Q;hBw}ub*$o$?WUEPS8;^o6S3xwZm~uZ<)SS!8lUz z#wHA<`dueqX!jROD@xHdb{VvmSu#3kMK@*AmY!nmI711@kG>$oCAJdSS3nbF+r6oA!a;lIfZm%jmYXA!8=ul618uGH?FF zQB|F6YO0tq4;RhCQKe24{XJp}Re3y;AksRt`;vvKy2l%7&l^bh^k{K&i-HL3Q9%9kQNBGWFJL;-rNoC#uG`O(xRmh)=eCHstI{ie)?!O!cMan*;#g;eabGgui5wP7j}dF31BKDOhYwlF&~Q&$D?RM zGg{DwF6_iEBr$+Nm^cU*M{o=yIF4s=0;g~qXYc}E#p`$jZ{i)ihx53A_wf;~;B$O| zuW%hd;79y~8~6)<iqEBqvyhYynz|ij63{JE;56z7&S-xsb^TyU&faC}+ zgZ6eR`1rqrH9r1*e2l~HRnACxL7A=Nso6usF^-5Hf0Zgedy%|YB;qJjcyp<|OeD%E zQ+YEUT`3Yt6~S-HtD}t)QBkSj&3Lp)5*6^qI(dUcq*N+-Q/ - .md +Fully document OBITools (version 4, written in Go) in English, using a 4‑phase incremental pipeline. + +You **MUST** use the available MCP servers: + +- `cclsp` – exact definitions, references, diagnostics +- `jcodemunch` – code indexing, symbol extraction +- `treesitter` – AST and CLI parsing +- `context7` – external documentation + +All tool calls must follow the exact API described in the MCP server documentation. If a required tool is unavailable, you **MUST** log the error and stop execution. 
+ +### Tool call format (CRITICAL) + +Tool calls **MUST** use this exact XML format — no spaces inside the angle brackets: + +``` + +{"param": "value"} + +``` + +**FORBIDDEN** — these variants will cause parse errors and must NEVER be used: +- `< function=tool_name >` (spaces around the tag name) +- `< function = tool_name>` (spaces around `=`) +- `` (space before `=`) + +The opening tag is `` with **zero spaces** inside `<` and `>`. + +--- + +# Global rules + +** You are not allowed to read twice the same file in a row. ** + +## Language + +- All generated documentation **MUST** be in English. +- If an existing documentation file is in French: + 1. Translate it to English + 2. Save the original as `.fr.md` **before** overwriting + 3. Write the new English version + +--- + +## Execution mode (STRICT) + +You are operating in **STRICT TOOL MODE**: + +- If a file must be written, you **MUST** use the `Shell` tool. +- You **MUST NOT** read entire directory listings into memory. +- You **MUST** work with **one item at a time** using a simple text file as a task queue. + +### Reading files before writing + +- **Before writing to an existing documentation file**, you must first read it using the `Read` tool. +- **When documenting a single Go source file**, you only need to read that one file (plus up to 4-5 helper files if needed for context). +- Do NOT read the entire codebase - only what is necessary to document the current file. + +--- + +### Rules + +- Always write the **full** file (no partial updates). +- Paths are relative to the project root; directories are created implicitly. +- Content must be valid UTF‑8; use `\n` line endings. +- Do **not** wrap content in backticks. + +--- + +## Progress tracking: task queue files + +We use **line‑oriented task files** to avoid loading large lists into memory. Each phase has its own task file: + +- `docs/todo/phase1.txt` – list of Go files (one per line) to document. 
+- `docs/todo/phase1bis.txt` – same list, but after phase1 is done. +- `docs/todo/phase2.txt` – list of packages. +- `docs/todo/phase3.txt` – list of tools. + +**How it works:** + +1. At the start of a phase, if the task file does not exist, it is created by scanning the codebase once (Phase 0 or Phase X init). +2. **Each run of the LLM processes only the first line of the task file.** +3. After processing the item (success or permanent failure), the line is removed from the task file. + - On success, the line is deleted (no extra sentinel file needed). + - On transient failure (retry < 3), we keep the line but increment a retry counter stored in a separate file. + - On permanent failure (retry ≥ 3), we move the line to a `failed.txt` file and log the error. +4. The LLM then exits (or continues if the task file is still non‑empty, but it must never load more than one line). + +This way, the LLM’s context never holds more than a single task at a time. + +### Retry mechanism + +For each item (e.g., `internal/align/align.go`), we maintain a retry counter in: + +- `docs/retry/phase1/internal/align/align.go.count` + +If the file does not exist, retries = 0. +Each time processing fails, we increment the counter (write the new number). +If after increment the counter < 3, we keep the line in the task file. +If counter reaches 3, we **remove the line from the task file**, add it to `docs/failed/phase1/internal/align/align.go.failed` (just a marker), and log the error. + +--- + +## Documentation quality requirements (CRITICAL) + +Documentation MUST NOT be superficial. For each documented element (file, function, struct, package): + +### You MUST explain: + +- what it does +- why it exists (context, problem solved) +- how it is used +- assumptions and preconditions +- possible edge cases + +### Forbidden patterns + +- Vague phrases like “This function handles…”, “Utility for…”, “Helper function…”. +- Generic descriptions that could apply to any project. 
+ +### Required content per element type + +- Functions: + - Purpose + - Parameter meaning + - Return values + - Notable behaviour (panic conditions, side effects, concurrency) +- Structs: + - Role in the system + - Meaning of key fields +- Files: + - Role within the package + - Interactions with other files + +### Anti‑generic rule + +If the description could apply to any project, it is INVALID. You MUST include domain‑specific context (bioinformatics, sequence processing, etc.) and concrete behaviour. + +### Quality validation + +Before marking an item as done (i.e., removing its line from the task file), you MUST perform a self‑validation: + +- Check that all required sections are present. +- Verify that no forbidden patterns remain. + +If validation fails, increment the retry counter and keep the item pending. + + +--- + +# Directory structure + +``` +docs/ + todo/ # task queues + phase1.txt + phase1bis.txt + phase2.txt + phase3.txt + retry/ # retry counters + phase1/ # mirrors file structure + internal/align/align.go.count + phase1bis/ phase2/ - .md phase3/ - .md - ``` - -# Instructions générales -- Avant de commencer, vérifiez que le répertoire `docs/phase1`, `docs/phase2`, `docs/phase3` existe (créez-les si nécessaire). -- Pour chaque phase, respectez le format Markdown demandé. -- N’hésitez pas à utiliser les outils MCP pour obtenir des informations précises sur les symboles (signatures, commentaires, types). -- Si un outil MCP nécessite des coordonnées (ligne, caractère), vous pouvez extraire ces informations en lisant le fichier source. -- Soyez méthodique : traitez un fichier/package/outil à la fois et sauvegardez immédiatement le résultat. - ---- - -## Phase 1 : Documentation par fichier Go (sauf `main.go`) - -**Objectif** : Pour chaque fichier `.go` qui n’est pas un `main.go` d’outil, générer un document Markdown décrivant son rôle, ses structures, ses fonctions principales et ses dépendances. - -**Étapes** : -1. 
Listez tous les fichiers `.go` du projet, en excluant ceux nommés `main.go` (ils seront traités en phase 3). Utilisez par exemple `find . -name "*.go" -not -name "main.go"`. -2. Pour chaque fichier : - a. Lisez son contenu (vous pouvez le faire via shell ou en utilisant l’outil `read_file` si disponible). - b. Utilisez les outils MCP pour extraire les informations suivantes : - - Pour chaque fonction publique (commençant par une majuscule) : appelez `find_definition` avec la ligne approximative où se trouve la fonction. Récupérez la signature et les commentaires associés. - - Pour les types (structs, interfaces) : faites de même. - c. Générez un document Markdown avec les sections : - ```markdown - # Fichier : `chemin/vers/fichier.go` - ## Rôle - (une phrase ou deux) - - ## Structures - - `NomStruct` : description, champs principaux - - ## Fonctions principales - - `NomFonction(paramètres) (retours)` : description - - ## Dépendances notables - - packages importés (externes ou internes) significatifs - ``` - d. Sauvegardez dans `docs/phase1//.md` (où `` est le nom du répertoire parent contenant le fichier, et `` le nom sans extension). - -**Exemple d’utilisation d’outil MCP** : -``` -# Pour obtenir la définition de la fonction `Align` dans `align/align.go` -# On suppose qu’elle se trouve approximativement ligne 120, colonne 1. -find_definition(file="align/align.go", line=120, character=1) + failed/ # permanent failure markers + phase1/ + internal/align/align.go.failed + phase1bis/ + phase2/ + phase3/ + phase1/ # actual documentation + /.go.md + phase2/ + .md + phase3/ + .md + error.log ``` -**Conseil** : Vous pouvez d’abord lister tous les symboles exportés d’un fichier en utilisant `go list` ou en analysant le code. Mais les outils MCP peuvent aussi vous aider. +--- + +# Phase 0: Initialization + +1. Ensure required directories exist: `docs/todo`, `docs/retry`, `docs/failed`, `docs/phase1`, `docs/phase2`, `docs/phase3`. +2. 
**If `docs/todo/phase1.txt` does not exist**: + - Use `find pkg -name "*.go" ! -name "*_test.go" ! -path "*/cmd/*"` to list all Go files (excluding tests and main.go). + - Write the list (one relative path per line, e.g., `internal/align/align.go`) to `docs/todo/phase1.txt`. +3. Do the same for phase2 and phase3 later when those phases start. +4. **No other state is stored.** --- -## Phase 2 : Agrégation par package +# Phase 1: File documentation -**Objectif** : Pour chaque package (répertoire contenant au moins un fichier `.go` sauf `main.go`), créer un document Markdown qui synthétise l’ensemble des fichiers du package. +**Processing rule:** +- Read the **first line** of `docs/todo/phase1.txt` (using `head -n 1`). +- If the file is empty, Phase 1 is complete → proceed to Phase 1bis initialization. +- Otherwise, process that single file. -**Étapes** : -1. Identifiez tous les packages (répertoires) qui contiennent des fichiers `.go` traités en phase 1. -2. Pour chaque package : - a. Lisez tous les fichiers `.md` de `docs/phase1//`. - b. Utilisez les outils MCP pour obtenir une vue d’ensemble des symboles exportés du package (par exemple, en interrogeant `find_references` sur un symbole clé ou en parcourant les définitions). - c. Générez un document Markdown avec : - ```markdown - # Package : `` +**Processing a file:** - ## Présentation - Description générale du package, son rôle dans OBITools (traitement de séquences, alignement, etc.) +1. Let `relpath` be the line content (e.g., `internal/align/align.go`). +2. Check if a permanent failure marker exists at `docs/failed/phase1/${relpath}.failed`. If yes, remove the line from the task file and skip (line will be deleted). +3. If the documentation file `docs/phase1/${relpath}.go.md` exists go directly to its validation (step 6). +4. Otherwise, generate documentation for that file (using MCP tools as before). +5. Write the documentation to `docs/phase1/${relpath}.go.md`. +6. Validate quality. +7. 
If validation succeeds: + - Remove the line from the task file. + - Remove any retry counter file for this item. + - (No sentinel needed; the removal from todo indicates completion.) +8. If validation fails: + - Increment retry counter: + - If `docs/retry/phase1/${relpath}.count` does not exist, set to 1. + - Else read it, add 1, write back. + - If new counter >= 3: + - Remove line from task file. + - Create `docs/failed/phase1/${relpath}.failed`. + - Log error. + - If new counter < 3: + - Keep the line in the task file (do nothing, it stays as first line for next run). +9. **Exit** (or stop if this was a single run). The next invocation will read the first line again (same if retry, or next if removed). - ## Structure interne - - Liste des fichiers principaux et leurs responsabilités (liens vers docs phase1) - - Flux de données ou interactions entre fichiers - - ## API publique - - **Types** : liste des types exportés avec brève description - - **Fonctions** : liste des fonctions exportées avec signature et rôle - - **Constantes/Variables** si pertinentes - - ## Exemple d’utilisation - (si possible, un extrait de code illustrant comment ce package est utilisé ailleurs) - ``` - d. Sauvegardez dans `docs/phase2/.md`. - -**Conseil** : Pour l’API publique, vous pouvez invoquer `find_definition` sur chaque symbole exporté (en vous basant sur les fichiers sources) pour obtenir les signatures exactes et les commentaires. +**Important:** +- Do **not** read more than one line. +- Do **not** attempt to process multiple items in one run. +- The LLM should finish after handling one item. --- -## Phase 3 : Documentation des outils (exécutables) +# Phase 1bis: Review and harmonization -**Objectif** : Pour chaque outil (répertoire contenant un `main.go`), générer une documentation utilisateur complète, en utilisant les informations des packages documentés en phase 2. 
+When Phase 1 is complete (i.e., `docs/todo/phase1.txt` empty), we initialize `docs/todo/phase1bis.txt` with the same list of files (the ones that succeeded). +But note: we need to know which files were successfully documented. Since we removed lines from `phase1.txt` on success, we need a record. The simplest is to reuse the same list but we can generate it by listing the existing `.go.md` files in `docs/phase1/` (since every successful file has a `.go.md`). +Thus, Phase 1bis initialization: -**Étapes** : -1. Trouvez tous les `main.go` du projet. Pour chacun, identifiez le nom de l’outil (généralement le nom du répertoire parent). -2. Pour chaque outil : - a. Lisez le `main.go` pour comprendre la logique globale et les options de ligne de commande (flags, cobra, etc.). - b. Identifiez tous les packages Go importés par l’outil (en analysant les imports du fichier). - c. Récupérez les documents `docs/phase2/.md` correspondants. - d. Utilisez les outils MCP pour explorer les fonctions appelées dans le `main.go` et comprendre leur rôle. - e. Générez un document Markdown : - ```markdown - # Outil : `` +- If `docs/todo/phase1bis.txt` does not exist, create it by listing all `.go.md` files under `docs/phase1/`, stripping the `docs/phase1/` prefix and the `.go.md` suffix, and writing the relative path (same format as phase1). - ## Résumé - Une ligne de description. - - ## Description - Explication détaillée de ce que fait l’outil, dans quel contexte l’utiliser, comment il traite les données (séquences, fichiers, etc.). - - ## Options - | Option | Type | Défaut | Description | - |--------|------|--------|-------------| - | `--flag` | string | "" | description extraite du code | - - ## Exemples - ```bash - # Exemple 1 : utilisation simple - outil -i input.fasta -o output.fasta - - # Exemple 2 : avec options avancées - outil --flag value ... - ``` - - ## Voir aussi - - [Package ``](../phase2/.md) - - [Package ``](../phase2/.md) - - Autres outils connexes - ``` - f. 
Sauvegardez dans `docs/phase3/.md`. - -**Conseil** : Pour extraire les options, examinez le code qui utilise `flag` ou `cobra`. Vous pouvez aussi utiliser `go doc` sur le package principal, mais les outils MCP vous permettront de suivre les références aux symboles. +Then processing is identical to Phase 1, but using `docs/todo/phase1bis.txt` and output is overwriting the same `.go.md` files (with improvements). Retry counters go in `docs/retry/phase1bis/`. --- -# Validation et finalisation -- Après avoir généré tous les documents, vérifiez que la documentation phase3 inclut bien des liens fonctionnels vers les documents phase2. -- Si certains packages ne sont pas documentés (parce que leurs fichiers n’ont pas été traités en phase1), repérez les oublis et corrigez. -- Vous pouvez générer un index global dans `docs/README.md` listant tous les packages et outils avec leurs descriptions. +# Phase 2: Package documentation + +When Phase 1bis is complete (`docs/todo/phase1bis.txt` empty), initialize `docs/todo/phase2.txt`: + +- List all packages: unique directories under `pkg/` that contain at least one `.go` file and are not tools. +- Write each package identifier (e.g., `align`, `internal/align`) as a line. + +Processing: read first line, generate `docs/phase2/.md`, validate, remove line on success, retry logic in `docs/retry/phase2/`. --- -# Consignes d’exécution -- Travaillez de manière séquentielle : terminez une phase avant de passer à la suivante. -- Pour chaque fichier/package/outil, utilisez les outils MCP de manière parcimonieuse mais exhaustive. -- Si un outil MCP échoue (par exemple, coordonnées incorrectes), ajustez en relisant le fichier source pour trouver la bonne ligne/colonne. -- Enregistrez les résultats immédiatement après génération pour éviter les pertes. +# Phase 3: Tool documentation -Maintenant, commencez par la phase 1. Listez tous les fichiers `.go` (hors main.go) et générez la documentation pour le premier fichier. 
+When Phase 2 complete, initialize `docs/todo/phase3.txt`: +- List all directories under `cmd/` that contain a `main.go`. Write each tool name as a line. +Processing: read first line, generate `docs/phase3/.md`, validate, remove line on success, retry logic in `docs/retry/phase3/`. + +--- + +# Finalization + +When all task files are empty and no pending phases, generate `docs/README.md` by: + +- Listing all package docs (files in `docs/phase2/`) and linking. +- Listing all tool docs (files in `docs/phase3/`) and linking. + +Write using `Shell`. + +--- + +# Execution flow summary + +1. **Phase 0**: Create directories and initial `todo/phase1.txt` if missing. Exit. +2. **Phase 1**: + - If `todo/phase1.txt` exists and non‑empty → process first line. + - Else → move to Phase 1bis initialization. +3. **Phase 1bis**: + - If `todo/phase1bis.txt` does not exist → create from successful phase1 docs. + - If non‑empty → process first line. + - Else → move to Phase 2 initialization. +4. **Phase 2**: similar. +5. **Phase 3**: similar. +6. **Finalization**: generate README. + +The LLM should be invoked repeatedly (e.g., by a scheduler) until all phases are done. Each invocation processes exactly one item. + +--- + +# Important reminders + +- Always call `Shell` to write files; never output content in plain text. +- Validate quality before removing a line from the task file. +- Log all failures to `docs/error.log` in JSON lines format. +- If any MCP tool fails, treat as failure and increment retry counter. +- Never read more than one line from a task file in a single run. 
diff --git a/scripts/find_setattribute.go b/scripts/find_setattribute.go new file mode 100644 index 0000000..c3eca52 --- /dev/null +++ b/scripts/find_setattribute.go @@ -0,0 +1,222 @@ +//go:build ignore + +package main + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "sort" + "strconv" + "strings" +) + +type Reference struct { + File string `json:"file"` + Line int `json:"line"` + Column int `json:"column"` + Key string `json:"key"` + Function string `json:"function"` + Context string `json:"context"` +} + +type Result struct { + Method string `json:"method"` + Signature string `json:"signature"` + Definition string `json:"definition"` + References []Reference `json:"references"` + Total int `json:"total"` +} + +var basePath = "/Users/coissac/Sync/travail/__MOI__/GO/obitools4" + +func main() { + cmd := exec.Command("rg", "-n", `\.SetAttribute\(`, basePath+"/pkg", "--type", "go") + output, err := cmd.Output() + if err != nil { + fmt.Fprintf(os.Stderr, "Error running rg: %v\n", err) + os.Exit(1) + } + + lines := strings.Split(string(output), "\n") + lineRe := regexp.MustCompile(`^(.+?):(\d+):\s*(.+)$`) + keyRe := regexp.MustCompile(`SetAttribute\("([^"]+)"`) + templateKeyRe := regexp.MustCompile(`SetAttribute\("([^"]+)[^"]*"\s*,`) + + var refs []Reference + seen := make(map[string]bool) + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + matches := lineRe.FindStringSubmatch(line) + if matches == nil { + continue + } + + file := matches[1] + lineNum, _ := strconv.Atoi(matches[2]) + context := strings.TrimSpace(matches[3]) + + // Skip definition + if strings.Contains(file, "obiseq/attributes.go") && lineNum == 132 { + continue + } + + // Extract key + var key string + if keyMatches := keyRe.FindStringSubmatch(context); keyMatches != nil { + key = keyMatches[1] + } else if tmplMatches := templateKeyRe.FindStringSubmatch(context); tmplMatches != nil { + key = tmplMatches[1] + } else 
{ + continue + } + + // Get function name using treesitter + funcName := getFunctionNameTreesitter(file, lineNum) + + uniqueKey := fmt.Sprintf("%s:%d", file, lineNum) + if seen[uniqueKey] { + continue + } + seen[uniqueKey] = true + + refs = append(refs, Reference{ + File: filepath.Base(file), + Line: lineNum, + Column: 0, + Key: key, + Function: funcName, + Context: context, + }) + } + + sort.Slice(refs, func(i, j int) bool { + if refs[i].File != refs[j].File { + return refs[i].File < refs[j].File + } + return refs[i].Line < refs[j].Line + }) + + result := Result{ + Method: "SetAttribute", + Signature: "func (s *BioSequence) SetAttribute(key string, value interface{})", + Definition: basePath + "/pkg/obiseq/attributes.go:132", + References: refs, + Total: len(refs), + } + + outputJSON, err := json.MarshalIndent(result, "", " ") + if err != nil { + fmt.Fprintf(os.Stderr, "Error marshaling JSON: %v\n", err) + os.Exit(1) + } + + fmt.Println(string(outputJSON)) +} + +// getFunctionNameTreesitter uses the treesitter_cursor_walk tool to get the containing function +func getFunctionNameTreesitter(file string, targetLine int) string { + // Convert to 0-based for treesitter + row := targetLine - 1 + + // Use treesitter cursor walk to get ancestors + cmd := exec.Command("bash", "-c", + fmt.Sprintf(`kilo treesitter_cursor_walk --file_path %q --row %d --column 0 --max_depth 10 2>/dev/null`, file, row)) + + output, err := cmd.Output() + if err != nil { + return findContainingFunction(file, targetLine) + } + + // Parse the JSON output to find function_declaration or method_declaration + var result map[string]interface{} + if err := json.Unmarshal(output, &result); err != nil { + return findContainingFunction(file, targetLine) + } + + // Check ancestors for function declaration + if ancestors, ok := result["ancestors"].([]interface{}); ok { + for _, a := range ancestors { + if anc, ok := a.(map[string]interface{}); ok { + nodeType, _ := anc["type"].(string) + if nodeType == 
"function_declaration" || nodeType == "method_declaration" { + // Try to get the function name from children + if children, ok := anc["children"].([]interface{}); ok { + for _, c := range children { + if child, ok := c.(map[string]interface{}); ok { + childType, _ := child["type"].(string) + if childType == "identifier" { + if text, ok := child["text"].(string); ok { + return text + } + } + if childType == "field_identifier" { + if text, ok := child["text"].(string); ok { + return text + } + } + } + } + } + } + if nodeType == "func_literal" { + return "closure" + } + } + } + } + + return findContainingFunction(file, targetLine) +} + +func findContainingFunction(file string, targetLine int) string { + data, err := os.ReadFile(file) + if err != nil { + return "" + } + lines := strings.Split(string(data), "\n") + + for i := targetLine - 1; i >= 0 && i >= targetLine-200; i-- { + if i >= len(lines) { + continue + } + line := strings.TrimSpace(lines[i]) + + if line == "}" && i > 0 { + for j := i - 1; j >= 0 && j >= i-50; j-- { + if j >= len(lines) { + continue + } + funcLine := strings.TrimSpace(lines[j]) + if strings.HasPrefix(funcLine, "func ") { + if match := regexp.MustCompile(`func\s+\([^)]+\)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(funcLine); match != nil { + return match[1] + } + if match := regexp.MustCompile(`func\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(funcLine); match != nil { + return match[1] + } + } + } + continue + } + + if strings.HasPrefix(line, "func ") { + if match := regexp.MustCompile(`func\s+\([^)]+\)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(line); match != nil { + return match[1] + } + if match := regexp.MustCompile(`func\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(line); match != nil { + return match[1] + } + } + } + + return "" +} diff --git a/scripts/find_setattribute.sh b/scripts/find_setattribute.sh new file mode 100755 index 0000000..9e27da4 --- /dev/null +++ b/scripts/find_setattribute.sh @@ 
-0,0 +1,36 @@ +#!/bin/bash + +basePath="/Users/coissac/Sync/travail/__MOI__/GO/obitools4" +OUTPUT_FILE="${1:-/dev/stdout}" + +# Get all SetAttribute calls +rg -n '\.SetAttribute\(' "$basePath/pkg" --type go | while read -r line; do + file="${line%%:*}" + line_num="${line%:*}" + line_num="${line_num##*:}" + context="${line##*: }" + + # Extract key (only literal strings) + key=$(echo "$context" | sed -n 's/.*SetAttribute("\([^"]*\)".*/\1/p') + [ -z "$key" ] && continue + + # Get function name using treesitter + func=$(kilo treesitter_cursor_walk \ + --file_path "$file" \ + --row "$((line_num - 1))" \ + --column 0 \ + --max_depth 10 2>/dev/null | + jq -r '.ancestors[] | select(.type == "function_declaration" or .type == "method_declaration") | .children[] | select(.type == "identifier" or .type == "field_identifier") | .text' 2>/dev/null) + + # Fallback to func_literal for closures + if [ -z "$func" ]; then + func=$(kilo treesitter_cursor_walk \ + --file_path "$file" \ + --row "$((line_num - 1))" \ + --column 0 \ + --max_depth 10 2>/dev/null | + jq -r '.ancestors[] | select(.type == "func_literal") | "closure"' 2>/dev/null) + fi + + echo "$(basename "$file")|$line_num|$key|${func:-unknown}|$context" +done | sort -t'|' -k1,1 -k2,2n diff --git a/setattribute_refs.json b/setattribute_refs.json new file mode 100644 index 0000000..e047d59 --- /dev/null +++ b/setattribute_refs.json @@ -0,0 +1,308 @@ +{ + "obiannotate": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "(git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter.IBioSequence).NumberSequences$1": [ + "seq_number" + ] + }, + "obiclean": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + 
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obicleandb": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCount": [ + "count" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obicomplement": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obiconsensus": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCount": [ + "count" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus.BuildConsensus": [ + "obiconsensus_kmer_max_occur", + "obiconsensus_filtered_graph_size", + "obiconsensus_full_graph_size", + "obiconsensus_consensus", + "obiconsensus_weight", + "obiconsensus_seq_length", + "obiconsensus_kmer_size" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus.MinionClusterDenoise": [ + "obiconsensus_consensus" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus.MinionDenoise$1": [ + "obiconsensus_consensus", + "obiconsensus_weight" + ] + }, + "obiconvert": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obicount": { + 
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obicsv": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obidemerge": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obidistribute": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obigrep": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obijoin": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obikmermatch": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obikmersimcount": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obilandmark": { + 
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCoordinate": [ + "landmark_coord" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetOBITagGeomRefIndex": [ + "obitag_geomref_index" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilandmark.CLISelectLandmarkSequences": [ + "landmark_id" + ] + }, + "obimatrix": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obimicrosat": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obimultiplex": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obipairing": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence)._revcmpMutation": [ + "pairing_mismatches" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign.BuildQualityConsensus": [ + "pairing_mismatches" + ] + }, + "obipcr": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + 
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obireffamidx": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetOBITagRefIndex": [ + "obitag_ref_index" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obirefidx.IndexFamilyDB": [ + "reffamidx_id" + ] + }, + "obirefidx": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetOBITagRefIndex": [ + "obitag_ref_index" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obiscript": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obisplit": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obisummary": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obitag": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetPath": [ + "taxonomic_path" + ], + 
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + }, + "obitagpcr": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary.NGSLibrary).ExtractMultiBarcode": [ + "obimultiplex_error", + "obimultiplex_amplicon_rank" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence)._revcmpMutation": [ + "pairing_mismatches" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence)._subseqMutation": [ + "pairing_mismatches" + ], + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign.BuildQualityConsensus": [ + "pairing_mismatches" + ] + }, + "obitaxonomy": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetPath": [ + "taxonomic_path" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ], + "(git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter.IBioSequence).NumberSequences$1": [ + "seq_number" + ] + }, + "obiuniq": { + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCount": [ + "count" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [ + "definition" + ], + "(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [ + "taxid" + ] + } +} diff --git a/x b/x new file mode 100644 index 0000000..08209fc --- /dev/null +++ b/x @@ -0,0 +1,19 @@ +```markdown +# DNA Scoring and Matching Utilities in `obialign` + +This module provides low-level utilities for computing nucleotide alignment scores 
using probabilistic and bit-encoded representations. + +- **Bit Encoding**: Nucleotides are encoded in 4-bit groups (e.g., `A=0b0001`, `C=0b0010`, etc.), enabling efficient bitwise comparison. +- **`_MatchRatio(a, b)`**: Computes a normalized match ratio between two encoded bytes based on shared bits: + `ratio = common_bits / (bits_in_a × bits_in_b)`. +- **`_FourBitsCount`**: Precomputed lookup table for Hamming weight (popcount) of 4-bit values. +- **Log-space Arithmetic**: Helper functions (`_Logaddexp`, `_Logdiffexp`, `_Log1mexp`) ensure numerical stability in probabilistic computations. +- **Phred-scaled Quality Integration**: + `_MatchScoreRatio(QF, QR)` derives log-odds match/mismatch scores from Phred quality values (`QF`, `QR`), modeling sequencing error probabilities. +- **Precomputed Matrices**: + - `_NucPartMatch[i][j]`: Match ratios for all nucleotide pairs (from 4-bit codes). + - `_NucScorePartMatchMatch/Mismatch[i][j]`: Integer-scaled match/mismatch scores (×10) for quality pairs `(i, j)` in `[0..99]`. +- **Thread-Safe Initialization**: `_InitDNAScoreMatrix()` ensures one-time, synchronized initialization of all scoring tables via a mutex. + +Designed for high-performance alignment kernels where speed and numerical robustness are critical. +```