diff --git a/doc/book/annexes.qmd b/doc/book/annexes.qmd index 80d7e27..7eb0467 100644 --- a/doc/book/annexes.qmd +++ b/doc/book/annexes.qmd @@ -1,82 +1,107 @@ # Annexes -### Sequence attributes +## Sequence attributes -#### Reserved sequence attributes +**ali_dir (`string`)** -##### `ali_dir` + - Set by the *obipairing* tool + - The attribute can contain 2 string values `left` or `right`. -###### Type : `string` + The alignment generated by *obipairing* is a 3'-end gap free algorithm. + Two cases can occur when aligning the forward and reverse reads. If the + barcode is long enough, both the reads overlap only on their 3' ends. In + such case, the alignment direction `ali_dir` is set to *left*. If the + barcode is shorter than the read length, the paired reads overlap by + their 5' ends, and the complete barcode is sequenced by both the reads. + In that later case, `ali_dir` is set to *right*. -The attribute can contain 2 string values `"left"` or `"right".` +**ali_length (`int`)** -###### Set by the *obipairing* tool + - Set by the *obipairing* tool -The alignment generated by *obipairing* is a 3'-end gap free algorithm. -Two cases can occur when aligning the forward and reverse reads. If the -barcode is long enough, both the reads overlap only on their 3' ends. In -such case, the alignment direction `ali_dir` is set to *left*. If the -barcode is shorter than the read length, the paired reads overlap by -their 5' ends, and the complete barcode is sequenced by both the reads. -In that later case, `ali_dir` is set to *right*. 
+ Length of the aligned parts when merging forward and reverse reads -##### `ali_length` -###### Set by the *obipairing* tool +**count (`int`)** -Length of the aligned parts when merging forward and reverse reads + - Set by the *obiuniq* tool + - Getter : method `Count()` + - Setter : method `SetCount(int)` -##### `count` : the number of sequence occurrences + The `count` attribute indicates how-many strictly identical reads + have been merged in a single record. It contains an integer value. If it + is absent this means that the sequence record represents a single + occurrence of the sequence. -###### Set by the *obiuniq* tool + The `Count()` method allows to access to the count attribute as an + integer value. If the `count` attribute is not defined for the given + sequence, the value *1* is returned -The `count` attribute indicates how-many strictly identical sequences -have been merged in a single record. It contains an integer value. If it -is absent this means that the sequence record represents a single -occurrence of the sequence. +**merged_* (`map[string]int`)** -###### Getter : method `Count()` + - Set by the *obiuniq* tool -The `Count()` method allows to access to the count attribute as an -integer value. If the `count` attribute is not defined for the given -sequence, the value *1* is returned + The `-m` option of the *obiuniq* tools allows for keeping track of the + distribution of the values stored in given attribute of interest. Often + this option is used to summarise distribution of a sequence variant + accross samples when *obiuniq* is run after running *obimultiplex*. The + actual name of the attribute depends on the name of the monitored + attribute. If `-m` option is used with the attribute *sample*, then this + attribute names *merged_sample*. -##### `merged_*` +**mode (`string`)** -###### Type : `map[string]int` + - Set by the *obipairing* tool + - The attribute can contain 2 string values `join` or `alignment`. 
-###### Set by the *obiuniq* tool -The `-m` option of the *obiuniq* tools allows for keeping track of the -distribution of the values stored in given attribute of interest. Often -this option is used to summarise distribution of a sequence variant -accross samples when *obiuniq* is run after running *obimultiplex*. The -actual name of the attribute depends on the name of the monitored -attribute. If `-m` option is used with the attribute *sample*, then this -attribute names *merged_sample*. +**obitag_ref_index (`map[string]string`)** -##### `mode` + - Set by the *obirefidx* tool. -###### Set by the *obipairing* tool + It resumes to which taxonomic annotation a match to that sequence must + lead according to the number of differences existing between the query + sequence and the reference sequence having that tag. -**`obitag_ref_index`** +```json + {"0":"9606@Homo sapiens@species", + "2":"207598@Homininae@subfamily", + "3":"9604@Hominidae@family", + "8":"314295@Hominoidea@superfamily", + "10":"9526@Catarrhini@parvorder", + "12":"1437010@Boreoeutheria@clade", + "16":"9347@Eutheria@clade", + "17":"40674@Mammalia@class", + "22":"117571@Euteleostomi@clade", + "25":"7776@Gnathostomata@clade", + "29":"33213@Bilateria@clade", + "30":"6072@Eumetazoa@clade"} +``` -###### Set by the *obirefidx* tool. +**pairing_mismatches (`map[string]string`)** -It resumes to which taxonomic annotation a match to that sequence must -lead according to the number of differences existing between the query -sequence and the reference sequence having that tag. 
+ - Set by the *obipairing* tool -###### Getter : method `Count()` +**seq_a_single (`int`)** -##### `pairing_mismatches` + - Set by the *obipairing* tool -###### Set by the *obipairing* tool +**seq_ab_match (`int`)** -##### `score` + - Set by the *obipairing* tool -###### Set by the *obipairing* tool +**seq_b_single (`int`)** -##### `score_norm` + - Set by the *obipairing* tool -###### Set by the *obipairing* tool +**score (`int`)** + + - Set by the *obipairing* tool + +**score_norm (`float`)** + + - Set by the *obipairing* tool + - The value ranges between 0 and 1. + + Score of the alignment between forward and reverse reads expressed as a fraction of identity. + diff --git a/doc/book/comm_sampling.qmd b/doc/book/comm_sampling.qmd index 0a3ba45..a8ca2a6 100644 --- a/doc/book/comm_sampling.qmd +++ b/doc/book/comm_sampling.qmd @@ -10,13 +10,39 @@ Sequences can be selected on several of their caracteristics, their length, their id, their sequence. Options allow for specifying the condition if selection. +**Selection based on the sequence** + + +Sequence records can be selected according if they match or not with a pattern. The simplest pattern is as short sequence (*e.g* `AACCTT`). But the usage of regular patterns allows for looking for more complex pattern. As example, `A[TG]C+G` matches a `A`, followed by a `T` or a `G`, then one or several `C` and endly a `G`. + +{{< include ../lib/options/selection/_sequence.qmd >}} + +*Examples:* + +: Selects only the sequence records that contain an *EcoRI* restriction site. + +```bash +obigrep -s 'GAATTC' seq1.fasta > seq2.fasta +``` + +: Selects only the sequence records that contain a stretch of at least 10 ``A``. + +```bash +obigrep -s 'A{10,}' seq1.fasta > seq2.fasta +``` + +: Selects only the sequence records that do not contain ambiguous nucleotides. 
+ +```bash +obigrep -s '^[ACGT]+$' seq1.fasta > seq2.fasta +``` {{< include ../lib/options/selection/_min-count.qmd >}} {{< include ../lib/options/selection/_max-count.qmd >}} -Example +*Examples* : Selecting sequence records representing at least five reads in the dataset. diff --git a/doc/book/expressions.qmd b/doc/book/expressions.qmd index 71261c4..4a1a883 100644 --- a/doc/book/expressions.qmd +++ b/doc/book/expressions.qmd @@ -11,26 +11,64 @@ Several OBITools (*e.g.* obigrep, obiannotate) allow the user to specify some si ### Instrospection functions {.unnumbered} -- `len(x)`is a generic function allowing to retreive the size of a object. It returns +**`len(x)`** + +: It is a generic function allowing to retreive the size of a object. It returns the length of a sequences, the number of element in a map like `annotations`, the number of elements in an array. The reurned value is an `int`. ### Cast functions {.unnumbered} -- `int(x)` converts if possible the `x` value to an integer value. The function +**`int(x)`** + +: Converts if possible the `x` value to an integer value. The function returns an `int`. -- `numeric(x)` converts if possible the `x` value to a float value. The function + +**`numeric(x)`** + +: Converts if possible the `x` value to a float value. The function returns a `float`. -- `bool(x)` converts if possible the `x` value to a boolean value. The function + +**`bool(x)`** + +: Converts if possible the `x` value to a boolean value. The function returns a `bool`. ### String related functions {.unnumbered} -- `printf(format,...)` allows to combine several values to build a string. `format` follows the +**`printf(format,...)`** + +: Allows to combine several values to build a string. `format` follows the classical C `printf` syntax. The function returns a `string`. -- `subspc(x)` substitutes every space in the `x` string by the underscore (`_`) character. 
The function + +**`subspc(x)`** + +: substitutes every space in the `x` string by the underscore (`_`) character. The function returns a `string`. +### Condition function {.unnumbered} + +**`ifelse(condition,val1,val2)`** + +: The `condition` value has to be a `bool` value. If it is `true` the function returns `val1`, + otherwise, it is returning `val2`. + +### Sequence analysis related function + +**`composition(sequence)`** + +: The nucleotide composition of the sequence is returned as as map indexed by `a`, `c`, `g`, or `t` and + each value is the number of occurrences of that nucleotide. A fifth key `others` accounts for + all others symboles. + +**`gcskew(sequence)`** + +: Computes the excess of g compare to c of the sequence, known as the GC skew. + + $$ + Skew_{GC}=\frac{G-C}{G+C} + $$ + ## Accessing to the sequence annotations The `annotations` variable is a map object containing all the annotations associated to the currently processed sequence. Index of the map are the attribute names. 
It exists to possibillities to retreive @@ -53,4 +91,7 @@ Special attributes of the sequence are accessible only by dedicated methods of t - The sequence identifier : `Id()` - THe sequence definition : `Definition()` +```go +sequence.Id() +``` diff --git a/doc/build/_book/OBITools-V4.epub b/doc/build/_book/OBITools-V4.epub index 934b2b4..8bffd15 100644 Binary files a/doc/build/_book/OBITools-V4.epub and b/doc/build/_book/OBITools-V4.epub differ diff --git a/doc/build/_book/annexes.html b/doc/build/_book/annexes.html index bbdf381..1185fc3 100644 --- a/doc/build/_book/annexes.html +++ b/doc/build/_book/annexes.html @@ -20,6 +20,69 @@ ul.task-list li input[type="checkbox"] { margin: 0 0.8em 0.2em -1.6em; vertical-align: middle; } +pre > code.sourceCode { white-space: pre; position: relative; } +pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } +pre > code.sourceCode > span:empty { height: 1.2em; } +.sourceCode { overflow: visible; } +code.sourceCode > span { color: inherit; text-decoration: inherit; } +div.sourceCode { margin: 1em 0; } +pre.sourceCode { margin: 0; } +@media screen { +div.sourceCode { overflow: auto; } +} +@media print { +pre > code.sourceCode { white-space: pre-wrap; } +pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } +} +pre.numberSource code + { counter-reset: source-line 0; } +pre.numberSource code > span + { position: relative; left: -4em; counter-increment: source-line; } +pre.numberSource code > span > a:first-child::before + { content: counter(source-line); + position: relative; left: -1em; text-align: right; vertical-align: baseline; + border: none; display: inline-block; + -webkit-touch-callout: none; -webkit-user-select: none; + -khtml-user-select: none; -moz-user-select: none; + -ms-user-select: none; user-select: none; + padding: 0 4px; width: 4em; + color: #aaaaaa; + } +pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } +div.sourceCode + { } +@media 
screen { +pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } +} +code span.al { color: #ff0000; font-weight: bold; } /* Alert */ +code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */ +code span.at { color: #7d9029; } /* Attribute */ +code span.bn { color: #40a070; } /* BaseN */ +code span.bu { color: #008000; } /* BuiltIn */ +code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */ +code span.ch { color: #4070a0; } /* Char */ +code span.cn { color: #880000; } /* Constant */ +code span.co { color: #60a0b0; font-style: italic; } /* Comment */ +code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */ +code span.do { color: #ba2121; font-style: italic; } /* Documentation */ +code span.dt { color: #902000; } /* DataType */ +code span.dv { color: #40a070; } /* DecVal */ +code span.er { color: #ff0000; font-weight: bold; } /* Error */ +code span.ex { } /* Extension */ +code span.fl { color: #40a070; } /* Float */ +code span.fu { color: #06287e; } /* Function */ +code span.im { color: #008000; font-weight: bold; } /* Import */ +code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */ +code span.kw { color: #007020; font-weight: bold; } /* Keyword */ +code span.op { color: #666666; } /* Operator */ +code span.ot { color: #007020; } /* Other */ +code span.pp { color: #bc7a00; } /* Preprocessor */ +code span.sc { color: #4070a0; } /* SpecialChar */ +code span.ss { color: #bb6688; } /* SpecialString */ +code span.st { color: #4070a0; } /* String */ +code span.va { color: #19177c; } /* Variable */ +code span.vs { color: #4070a0; } /* VerbatimString */ +code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */ @@ -215,7 +278,7 @@ ul.task-list li input[type="checkbox"] {

Table of contents

@@ -239,84 +302,82 @@ ul.task-list li input[type="checkbox"] { -
-

A.0.1 Sequence attributes

-
-

A.0.1.1 Reserved sequence attributes

-
-
A.0.1.1.1 ali_dir
-
-
A.0.1.1.1.1 Type : string
-

The attribute can contain 2 string values "left" or "right".

-
-
-
A.0.1.1.1.2 Set by the obipairing tool
+
+

A.1 Sequence attributes

+

ali_dir (string)

+
    +
  • Set by the obipairing tool
  • +
  • The attribute can contain 2 string values left or right.
  • +

The alignment generated by obipairing is a 3’-end gap free algorithm. Two cases can occur when aligning the forward and reverse reads. If the barcode is long enough, both the reads overlap only on their 3’ ends. In such a case, the alignment direction ali_dir is set to left. If the barcode is shorter than the read length, the paired reads overlap by their 5’ ends, and the complete barcode is sequenced by both the reads. In that latter case, ali_dir is set to right.

-
-
-
-
A.0.1.1.2 ali_length
-
-
A.0.1.1.2.1 Set by the obipairing tool
+

ali_length (int)

+
    +
  • Set by the obipairing tool
  • +

Length of the aligned parts when merging forward and reverse reads

-
-
-
-
A.0.1.1.3 count : the number of sequence occurrences
-
-
A.0.1.1.3.1 Set by the obiuniq tool
-

The count attribute indicates how-many strictly identical sequences have been merged in a single record. It contains an integer value. If it is absent this means that the sequence record represents a single occurrence of the sequence.

-
-
-
A.0.1.1.3.2 Getter : method Count()
+

count (int)

+
    +
  • Set by the obiuniq tool
  • +
  • Getter : method Count()
  • +
  • Setter : method SetCount(int)
  • +
+

The count attribute indicates how many strictly identical reads have been merged in a single record. It contains an integer value. If it is absent, this means that the sequence record represents a single occurrence of the sequence.

The Count() method gives access to the count attribute as an integer value. If the count attribute is not defined for the given sequence, the value 1 is returned.

-
-
-
-
A.0.1.1.4 merged_*
-
-
A.0.1.1.4.1 Type : map[string]int
-
-
-
A.0.1.1.4.2 Set by the obiuniq tool
+

merged_* (map[string]int)

+
    +
  • Set by the obiuniq tool
  • +

The -m option of the obiuniq tool allows for keeping track of the distribution of the values stored in a given attribute of interest. Often this option is used to summarise the distribution of a sequence variant across samples when obiuniq is run after running obimultiplex. The actual name of the attribute depends on the name of the monitored attribute. If the -m option is used with the attribute sample, then this attribute is named merged_sample.

-
-
-
-
A.0.1.1.5 mode
-
-
A.0.1.1.5.1 Set by the obipairing tool
-

obitag_ref_index

-
-
-
A.0.1.1.5.2 Set by the obirefidx tool.
+

mode (string)

+
    +
  • Set by the obipairing tool
  • +
  • The attribute can contain 2 string values join or alignment.
  • +
+

obitag_ref_index (map[string]string)

+
    +
  • Set by the obirefidx tool.
  • +

It summarises which taxonomic annotation a match to that sequence must lead to, according to the number of differences existing between the query sequence and the reference sequence having that tag.

-
-
-
A.0.1.1.5.3 Getter : method Count()
-
-
-
-
A.0.1.1.6 pairing_mismatches
-
-
A.0.1.1.6.1 Set by the obipairing tool
-
-
-
-
A.0.1.1.7 score
-
-
A.0.1.1.7.1 Set by the obipairing tool
-
-
-
-
A.0.1.1.8 score_norm
-
-
A.0.1.1.8.1 Set by the obipairing tool
+
   {"0":"9606@Homo sapiens@species",
+    "2":"207598@Homininae@subfamily",
+    "3":"9604@Hominidae@family",
+    "8":"314295@Hominoidea@superfamily",
+    "10":"9526@Catarrhini@parvorder",
+    "12":"1437010@Boreoeutheria@clade",
+    "16":"9347@Eutheria@clade",
+    "17":"40674@Mammalia@class",
+    "22":"117571@Euteleostomi@clade",
+    "25":"7776@Gnathostomata@clade",
+    "29":"33213@Bilateria@clade",
+    "30":"6072@Eumetazoa@clade"}
+

pairing_mismatches (map[string]string)

+
    +
  • Set by the obipairing tool
  • +
+

seq_a_single (int)

+
    +
  • Set by the obipairing tool
  • +
+

seq_ab_match (int)

+
    +
  • Set by the obipairing tool
  • +
+

seq_b_single (int)

+
    +
  • Set by the obipairing tool
  • +
+

score (int)

+
    +
  • Set by the obipairing tool
  • +
+

score_norm (float)

+
    +
  • Set by the obipairing tool
  • +
  • The value ranges between 0 and 1.
  • +
+

Score of the alignment between forward and reverse reads expressed as a fraction of identity.

-
-
-
diff --git a/doc/build/_book/comm_sampling.html b/doc/build/_book/comm_sampling.html index a7e2001..5eaf17b 100644 --- a/doc/build/_book/comm_sampling.html +++ b/doc/build/_book/comm_sampling.html @@ -314,6 +314,23 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni

12.1.1.1 Selecting sequences based on their caracteristics

Sequences can be selected on several of their characteristics: their length, their id, their sequence. Options allow for specifying the condition of selection.

+

Selection based on the sequence

+

Sequence records can be selected according to whether or not they match a pattern. The simplest pattern is a short sequence (e.g. AACCTT), but the usage of regular patterns allows for looking for more complex patterns. For example, A[TG]C+G matches an A, followed by a T or a G, then one or several C and finally a G.

+
+
--sequence|-s PATTERN
+
+

Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive. A complete description of the regular pattern grammar is available here.

+
+
Examples:
+
+

Selects only the sequence records that contain an EcoRI restriction site.

+
+
+
obigrep -s 'GAATTC' seq1.fasta > seq2.fasta
+

: Selects only the sequence records that contain a stretch of at least 10 A.

+
obigrep -s 'A{10,}' seq1.fasta > seq2.fasta
+

: Selects only the sequence records that do not contain ambiguous nucleotides.

+
obigrep -s '^[ACGT]+$' seq1.fasta > seq2.fasta
--min-count | -c COUNT
@@ -323,12 +340,12 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni

only sequences representing no more than COUNT reads will be selected. That option relies on the count attribute. If the count attribute is not defined for a sequence record, it is assumed equal to \(1\).

-
Example
+
Examples

Selecting sequence records representing at least five reads in the dataset.

-
obigrep -c 5 data_SPER01.fasta > data_norare_SPER01.fasta
+
obigrep -c 5 data_SPER01.fasta > data_norare_SPER01.fasta
diff --git a/doc/build/_book/expressions.html b/doc/build/_book/expressions.html index 7ccf909..8bc2ad5 100644 --- a/doc/build/_book/expressions.html +++ b/doc/build/_book/expressions.html @@ -124,6 +124,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni } } + @@ -284,6 +285,8 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni
  • Instrospection functions
  • Cast functions
  • String related functions
  • +
  • Condition function
  • +
  • 7.2.1 Sequence analysis related function
  • 7.3 Accessing to the sequence annotations
  • @@ -321,24 +324,67 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni

    7.2 Function defined in the language

    Instrospection functions

    - +
    +
    len(x)
    +
    +

    It is a generic function allowing to retrieve the size of an object. It returns the length of a sequence, the number of elements in a map like annotations, or the number of elements in an array. The returned value is an int.

    +
    +

    Cast functions

    - +
    +
    int(x)
    +
    +

    Converts if possible the x value to an integer value. The function returns an int.

    +
    +
    numeric(x)
    +
    +

    Converts if possible the x value to a float value. The function returns a float.

    +
    +
    bool(x)
    +
    +

    Converts if possible the x value to a boolean value. The function returns a bool.

    +
    +
    +
    +

    Condition function

    +
    +
    ifelse(condition,val1,val2)
    +
    +

    The condition value has to be a bool value. If it is true the function returns val1, otherwise it returns val2.

    +
    +
    +
    +
    @@ -352,6 +398,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni
  • The sequence identifier : Id()
  • The sequence definition : Definition()
  • +
    sequence.Id()
    diff --git a/doc/build/_man/man1/obigrep.man b/doc/build/_man/man1/obigrep.man index 4c8e9a1..a27dc67 100644 --- a/doc/build/_man/man1/obigrep.man +++ b/doc/build/_man/man1/obigrep.man @@ -174,14 +174,22 @@ selected. That option rely on the \f[V]count\f[R] attribute. If the \f[V]count\f[R] attribute is not defined for a sequence record, it is assumed equal to 1. -.PP +.TP \f[B]--max-length\f[R] | \f[B]-L\f[R] \f[I]LENGTH\f[R] -.PP +Keeps sequence records whose sequence length is equal or shorter than +\f[I]LENGTH\f[R]. +.TP \f[B]--min-length\f[R] | \f[B]-l\f[R] \f[I]LENGTH\f[R] +Keeps sequence records whose sequence length is equal or longer than +\f[I]LENGTH\f[R]. .PP \f[B]--predicate\f[R]|\f[B]-p\f[R] \f[I]EXPRESSION\f[R] -.PP +.TP \f[B]--sequence\f[R]|\f[B]-s\f[R] \f[I]PATTERN\f[R] +Regular expression pattern to be tested against the sequence itself. +The pattern is case insensitive. +A complete description of the regular pattern grammar is available +here (https://yourbasic.org/golang/regexp-cheat-sheet/#cheat-sheet). .PP \f[B]--inverse-match\f[R] | \f[B]-v\f[R] .PP diff --git a/doc/lib/options/selection/_max-length.qmd b/doc/lib/options/selection/_max-length.qmd new file mode 100644 index 0000000..8264c5b --- /dev/null +++ b/doc/lib/options/selection/_max-length.qmd @@ -0,0 +1,3 @@ +**\--max-length** | **-L** _LENGTH_ + +: Keeps sequence records whose sequence length is equal or shorter than _LENGTH_. diff --git a/doc/lib/options/selection/_min-length.qmd b/doc/lib/options/selection/_min-length.qmd new file mode 100644 index 0000000..e64defc --- /dev/null +++ b/doc/lib/options/selection/_min-length.qmd @@ -0,0 +1,3 @@ +**\--min-length** | **-l** _LENGTH_ + +: Keeps sequence records whose sequence length is equal or longer than _LENGTH_. 
diff --git a/doc/lib/options/selection/_sequence.qmd b/doc/lib/options/selection/_sequence.qmd new file mode 100644 index 0000000..fa5c5ca --- /dev/null +++ b/doc/lib/options/selection/_sequence.qmd @@ -0,0 +1,7 @@ +**\--sequence**|**-s** _PATTERN_ + +: Regular expression pattern to be tested against the + sequence itself. The pattern is case insensitive. A + complete description of the regular pattern grammar + is available [here](https://yourbasic.org/golang/regexp-cheat-sheet/#cheat-sheet). + \ No newline at end of file diff --git a/doc/man/obigrep.qmd b/doc/man/obigrep.qmd index 05b4e6d..14af0f5 100644 --- a/doc/man/obigrep.qmd +++ b/doc/man/obigrep.qmd @@ -99,13 +99,13 @@ The OBITools are centered around the [FASTA] (https://en.wikipedia.org/wiki/FAST {{< include ../lib/options/selection/_min-count.qmd >}} -**\--max-length** | **-L** _LENGTH_ +{{< include ../lib/options/selection/_max-length.qmd >}} -**\--min-length** | **-l** _LENGTH_ +{{< include ../lib/options/selection/_min-length.qmd >}} **\--predicate**|**-p** _EXPRESSION_ -**\--sequence**|**-s** _PATTERN_ +{{< include ../lib/options/selection/_sequence.qmd >}} **\--inverse-match** | **-v** diff --git a/pkg/goutils/goutils.go b/pkg/goutils/goutils.go index 4015f76..136668d 100644 --- a/pkg/goutils/goutils.go +++ b/pkg/goutils/goutils.go @@ -13,6 +13,7 @@ import ( "github.com/barkimedes/go-deepcopy" ) + // InterfaceToInt converts a interface{} to an integer value if possible. // If not a "NotAnInteger" error is returned via the err // return value and val is set to 0. 
@@ -302,15 +303,6 @@ func ReadLines(path string) (lines []string, err error) { return } -func Contains[T comparable](arr []T, x T) bool { - for _, v := range arr { - if v == x { - return true - } - } - return false -} - func AtomicCounter(initial ...int) func() int { counterMutex := sync.Mutex{} counter := 0 diff --git a/pkg/goutils/slices.go b/pkg/goutils/slices.go new file mode 100644 index 0000000..738abe5 --- /dev/null +++ b/pkg/goutils/slices.go @@ -0,0 +1,24 @@ +package goutils + + +func Contains[T comparable](arr []T, x T) bool { + for _, v := range arr { + if v == x { + return true + } + } + return false +} + +func LookFor[T comparable](arr []T, x T) int { + for i, v := range arr { + if v == x { + return i + } + } + return -1 +} + +func RemoveIndex[T comparable](s []T, index int) []T { + return append(s[:index], s[index+1:]...) +} diff --git a/pkg/obiapat/pcr.go b/pkg/obiapat/pcr.go index e446997..c87f879 100644 --- a/pkg/obiapat/pcr.go +++ b/pkg/obiapat/pcr.go @@ -13,7 +13,6 @@ type _Options struct { circular bool forwardError int reverseError int - bufferSize int batchSize int parallelWorkers int forward ApatPattern @@ -66,12 +65,6 @@ func (options Options) Circular() bool { return options.pointer.circular } -// BufferSize returns the size of the channel -// buffer specified by the options -func (options Options) BufferSize() int { - return options.pointer.bufferSize -} - // BatchSize returns the size of the // sequence batch used by the PCR algorithm func (options Options) BatchSize() int { @@ -95,7 +88,6 @@ func MakeOptions(setters []WithOption) Options { circular: false, parallelWorkers: 4, batchSize: 100, - bufferSize: 100, forward: NilApatPattern, cfwd: NilApatPattern, reverse: NilApatPattern, @@ -188,16 +180,6 @@ func OptionCircular(circular bool) WithOption { return f } -// OptionBufferSize sets the requested channel -// buffer size. 
-func OptionBufferSize(size int) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.bufferSize = size - }) - - return f -} - // OptionParallelWorkers sets how many search // jobs will be run in parallel. func OptionParallelWorkers(nworkers int) WithOption { diff --git a/pkg/obichunk/chunk_on_disk.go b/pkg/obichunk/chunk_on_disk.go index 772eba7..fdfef49 100644 --- a/pkg/obichunk/chunk_on_disk.go +++ b/pkg/obichunk/chunk_on_disk.go @@ -36,20 +36,14 @@ func find(root, ext string) []string { } func ISequenceChunkOnDisk(iterator obiiter.IBioSequence, - classifier *obiseq.BioSequenceClassifier, - sizes ...int) (obiiter.IBioSequence, error) { + classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) { dir, err := tempDir() if err != nil { return obiiter.NilIBioSequence, err } - bufferSize := iterator.BufferSize() - if len(sizes) > 0 { - bufferSize = sizes[0] - } - - newIter := obiiter.MakeIBioSequence(bufferSize) + newIter := obiiter.MakeIBioSequence() newIter.Add(1) diff --git a/pkg/obichunk/chunks.go b/pkg/obichunk/chunks.go index 13b3404..f88e779 100644 --- a/pkg/obichunk/chunks.go +++ b/pkg/obichunk/chunks.go @@ -10,16 +10,9 @@ import ( ) func ISequenceChunk(iterator obiiter.IBioSequence, - classifier *obiseq.BioSequenceClassifier, - sizes ...int) (obiiter.IBioSequence, error) { + classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) { - bufferSize := iterator.BufferSize() - - if len(sizes) > 0 { - bufferSize = sizes[0] - } - - newIter := obiiter.MakeIBioSequence(bufferSize) + newIter := obiiter.MakeIBioSequence() newIter.Add(1) diff --git a/pkg/obichunk/options.go b/pkg/obichunk/options.go index e49a614..d6ffa03 100644 --- a/pkg/obichunk/options.go +++ b/pkg/obichunk/options.go @@ -6,7 +6,6 @@ type __options__ struct { navalue string cacheOnDisk bool batchCount int - bufferSize int batchSize int parallelWorkers int noSingleton bool @@ -25,7 +24,6 @@ func MakeOptions(setters []WithOption) Options { navalue: "NA", 
cacheOnDisk: false, batchCount: 100, - bufferSize: 2, batchSize: 5000, parallelWorkers: 4, noSingleton: false, @@ -65,10 +63,6 @@ func (opt Options) BatchCount() int { return opt.pointer.batchCount } -func (opt Options) BufferSize() int { - return opt.pointer.bufferSize -} - func (opt Options) BatchSize() int { return opt.pointer.batchSize } @@ -148,14 +142,6 @@ func OptionsBatchSize(size int) WithOption { return f } -func OptionsBufferSize(size int) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.bufferSize = size - }) - - return f -} - func OptionsNoSingleton() WithOption { f := WithOption(func(opt Options) { opt.pointer.noSingleton = true diff --git a/pkg/obichunk/subchunks.go b/pkg/obichunk/subchunks.go index b55942c..7250946 100644 --- a/pkg/obichunk/subchunks.go +++ b/pkg/obichunk/subchunks.go @@ -58,20 +58,13 @@ func (by _By) Sort(seqs []sSS) { func ISequenceSubChunk(iterator obiiter.IBioSequence, classifier *obiseq.BioSequenceClassifier, - sizes ...int) (obiiter.IBioSequence, error) { + nworkers int) (obiiter.IBioSequence, error) { - bufferSize := iterator.BufferSize() - nworkers := 4 - - if len(sizes) > 0 { - nworkers = sizes[0] + if nworkers <=0 { + nworkers = 4 } - if len(sizes) > 1 { - bufferSize = sizes[1] - } - - newIter := obiiter.MakeIBioSequence(bufferSize) + newIter := obiiter.MakeIBioSequence() newIter.Add(nworkers) diff --git a/pkg/obichunk/unique.go b/pkg/obichunk/unique.go index ec2d202..aa3631b 100644 --- a/pkg/obichunk/unique.go +++ b/pkg/obichunk/unique.go @@ -19,7 +19,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence, opts := MakeOptions(options) nworkers := opts.ParallelWorkers() - iUnique := obiiter.MakeIBioSequence(opts.BufferSize()) + iUnique := obiiter.MakeIBioSequence() iterator = iterator.Speed("Splitting data set") @@ -28,8 +28,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence, if opts.SortOnDisk() { nworkers = 1 iterator, err = ISequenceChunkOnDisk(iterator, - obiseq.HashClassifier(opts.BatchCount()), 
- 0) + obiseq.HashClassifier(opts.BatchCount())) if err != nil { return obiiter.NilIBioSequence, err @@ -37,8 +36,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence, } else { iterator, err = ISequenceChunk(iterator, - obiseq.HashClassifier(opts.BatchCount()), - opts.BufferSize()) + obiseq.HashClassifier(opts.BatchCount())) if err != nil { return obiiter.NilIBioSequence, err @@ -78,12 +76,11 @@ func IUniqueSequence(iterator obiiter.IBioSequence, icat-- input, err = ISequenceSubChunk(input, classifier, - 1, - opts.BufferSize()) + 1) var next obiiter.IBioSequence if icat >= 0 { - next = obiiter.MakeIBioSequence(opts.BufferSize()) + next = obiiter.MakeIBioSequence() iUnique.Add(1) @@ -130,7 +127,6 @@ func IUniqueSequence(iterator obiiter.IBioSequence, iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(), opts.StatsOn(), - opts.BufferSize(), ) return iMerged, nil diff --git a/pkg/obiformats/csv_writer.go b/pkg/obiformats/csv_writer.go new file mode 100644 index 0000000..b4b590a --- /dev/null +++ b/pkg/obiformats/csv_writer.go @@ -0,0 +1,248 @@ +package obiformats + +import ( + "bytes" + "encoding/csv" + "fmt" + "io" + "os" + "sync" + "time" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + log "github.com/sirupsen/logrus" +) + +func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string { + keys := opt.CSVKeys() + record := make([]string, 0, len(keys)+4) + + if opt.CSVId() { + record = append(record, sequence.Id()) + } + + if opt.CSVCount() { + record = append(record, fmt.Sprint(sequence.Count())) + } + + if opt.CSVTaxon() { + taxid := sequence.Taxid() + sn, ok := sequence.GetAttribute("scientific_name") + + if !ok { + if taxid == 1 { + sn = "root" + } else { + sn = opt.CSVNAValue() + } + } + + record = append(record, fmt.Sprint(taxid), fmt.Sprint(sn)) + } + + if opt.CSVDefinition() { + record = append(record, 
sequence.Definition()) + } + + for _, key := range opt.CSVKeys() { + value, ok := sequence.GetAttribute(key) + if !ok { + value = opt.CSVNAValue() + } + + svalue, _ := goutils.InterfaceToString(value) + record = append(record, svalue) + } + + if opt.CSVSequence() { + record = append(record, string(sequence.Sequence())) + } + + if opt.CSVQuality() { + if sequence.HasQualities() { + l := sequence.Len() + q := sequence.Qualities() + ascii := make([]byte, l) + quality_shift := opt.QualityShift() + for j := 0; j < l; j++ { + ascii[j] = uint8(q[j]) + uint8(quality_shift) + } + record = append(record, string(ascii)) + } else { + record = append(record, opt.CSVNAValue()) + } + } + + return record +} + +func CSVHeader(opt Options) []string { + keys := opt.CSVKeys() + record := make([]string, 0, len(keys)+4) + + if opt.CSVId() { + record = append(record, "id") + } + + if opt.CSVCount() { + record = append(record, "count") + } + + if opt.CSVTaxon() { + record = append(record, "taxid", "scientific_name") + } + + if opt.CSVDefinition() { + record = append(record, "definition") + } + + record = append(record, opt.CSVKeys()...) 
+ + if opt.CSVSequence() { + record = append(record, "sequence") + } + + if opt.CSVQuality() { + record = append(record, "quality") + } + + return record +} + +func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte { + buff := new(bytes.Buffer) + csv := csv.NewWriter(buff) + + if batch.Order() == 0 { + csv.Write(CSVHeader(opt)) + } + for _, s := range batch.Slice() { + csv.Write(CSVRecord(s, opt)) + } + + csv.Flush() + + return buff.Bytes() +} + +func WriteCSV(iterator obiiter.IBioSequence, + file io.WriteCloser, + options ...WithOption) (obiiter.IBioSequence, error) { + opt := MakeOptions(options) + + file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) + + newIter := obiiter.MakeIBioSequence() + + nwriters := opt.ParallelWorkers() + + obiiter.RegisterAPipe() + chunkchan := make(chan FileChunck) + + newIter.Add(nwriters) + var waitWriter sync.WaitGroup + + go func() { + newIter.WaitAndClose() + for len(chunkchan) > 0 { + time.Sleep(time.Millisecond) + } + close(chunkchan) + waitWriter.Wait() + }() + + ff := func(iterator obiiter.IBioSequence) { + for iterator.Next() { + + batch := iterator.Get() + + chunkchan <- FileChunck{ + FormatCVSBatch(batch, opt), + batch.Order(), + } + newIter.Push(batch) + } + newIter.Done() + } + + log.Debugln("Start of the CSV file writing") + go ff(iterator) + for i := 0; i < nwriters-1; i++ { + go ff(iterator.Split()) + } + + next_to_send := 0 + received := make(map[int]FileChunck, 100) + + waitWriter.Add(1) + go func() { + for chunk := range chunkchan { + if chunk.order == next_to_send { + file.Write(chunk.text) + next_to_send++ + chunk, ok := received[next_to_send] + for ok { + file.Write(chunk.text) + delete(received, next_to_send) + next_to_send++ + chunk, ok = received[next_to_send] + } + } else { + received[chunk.order] = chunk + } + + } + + file.Close() + + log.Debugln("End of the CSV file writing") + obiiter.UnregisterPipe() + waitWriter.Done() + + }() + + return newIter, nil +} + +func 
WriteCSVToStdout(iterator obiiter.IBioSequence, + options ...WithOption) (obiiter.IBioSequence, error) { + options = append(options, OptionDontCloseFile()) + return WriteCSV(iterator, os.Stdout, options...) +} + +func WriteCSVToFile(iterator obiiter.IBioSequence, + filename string, + options ...WithOption) (obiiter.IBioSequence, error) { + + opt := MakeOptions(options) + flags := os.O_WRONLY | os.O_CREATE + + if opt.AppendFile() { + flags |= os.O_APPEND + } + file, err := os.OpenFile(filename, flags, 0660) + + if err != nil { + log.Fatalf("open file error: %v", err) + return obiiter.NilIBioSequence, err + } + + options = append(options, OptionCloseFile()) + + iterator, err = WriteCSV(iterator, file, options...) + + if opt.HaveToSavePaired() { + var revfile *os.File + + revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660) + if err != nil { + log.Fatalf("open file error: %v", err) + return obiiter.NilIBioSequence, err + } + iterator, err = WriteCSV(iterator.PairedWith(), revfile, options...) + } + + return iterator, err +} diff --git a/pkg/obiformats/ecopcr_read.go b/pkg/obiformats/ecopcr_read.go index 20d7fb3..c3da13e 100644 --- a/pkg/obiformats/ecopcr_read.go +++ b/pkg/obiformats/ecopcr_read.go @@ -166,7 +166,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence { opt := MakeOptions(options) - newIter := obiiter.MakeIBioSequence(opt.BufferSize()) + newIter := obiiter.MakeIBioSequence() newIter.Add(1) go func() { diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index b72ff1b..13fc176 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -244,9 +244,9 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) { // ?//? 
func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence { opt := MakeOptions(options) - entry_channel := make(chan _FileChunk, opt.BufferSize()) + entry_channel := make(chan _FileChunk) - newIter := obiiter.MakeIBioSequence(opt.BufferSize()) + newIter := obiiter.MakeIBioSequence() nworkers := opt.ParallelWorkers() newIter.Add(nworkers) diff --git a/pkg/obiformats/fastseq_header.go b/pkg/obiformats/fastseq_header.go index fca2896..29da3c3 100644 --- a/pkg/obiformats/fastseq_header.go +++ b/pkg/obiformats/fastseq_header.go @@ -19,6 +19,5 @@ func IParseFastSeqHeaderBatch(iterator obiiter.IBioSequence, options ...WithOption) obiiter.IBioSequence { opt := MakeOptions(options) return iterator.MakeIWorker(obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader()), - opt.ParallelWorkers(), - opt.BufferSize()) + opt.ParallelWorkers()) } diff --git a/pkg/obiformats/fastseq_read.go b/pkg/obiformats/fastseq_read.go index 33decc7..79873b8 100644 --- a/pkg/obiformats/fastseq_read.go +++ b/pkg/obiformats/fastseq_read.go @@ -105,7 +105,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe size = -1 } - newIter := obiiter.MakeIBioSequence(opt.BufferSize()) + newIter := obiiter.MakeIBioSequence() newIter.Add(1) go func() { @@ -127,7 +127,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence { opt := MakeOptions(options) - newIter := obiiter.MakeIBioSequence(opt.BufferSize()) + newIter := obiiter.MakeIBioSequence() newIter.Add(1) diff --git a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go index 0071cd9..0c36d3d 100644 --- a/pkg/obiformats/fastseq_write_fasta.go +++ b/pkg/obiformats/fastseq_write_fasta.go @@ -71,8 +71,7 @@ func WriteFasta(iterator obiiter.IBioSequence, file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) - buffsize := iterator.BufferSize() - newIter := 
obiiter.MakeIBioSequence(buffsize) + newIter := obiiter.MakeIBioSequence() nwriters := opt.ParallelWorkers() diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go index 5fb63fb..f8e3a39 100644 --- a/pkg/obiformats/fastseq_write_fastq.go +++ b/pkg/obiformats/fastseq_write_fastq.go @@ -60,8 +60,7 @@ func WriteFastq(iterator obiiter.IBioSequence, file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) - buffsize := iterator.BufferSize() - newIter := obiiter.MakeIBioSequence(buffsize) + newIter := obiiter.MakeIBioSequence() nwriters := opt.ParallelWorkers() diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 53cb271..9d98b40 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -113,9 +113,9 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) { func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence { opt := MakeOptions(options) - entry_channel := make(chan _FileChunk, opt.BufferSize()) + entry_channel := make(chan _FileChunk) - newIter := obiiter.MakeIBioSequence(opt.BufferSize()) + newIter := obiiter.MakeIBioSequence() nworkers := opt.ParallelWorkers() newIter.Add(nworkers) diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go index 10da43c..87780ac 100644 --- a/pkg/obiformats/options.go +++ b/pkg/obiformats/options.go @@ -15,10 +15,15 @@ type __options__ struct { closefile bool appendfile bool compressed bool - csv_ids bool - cvs_sequence bool + csv_id bool + csv_sequence bool + csv_quality bool csv_definition bool + csv_count bool + csv_taxon bool + csv_keys []string csv_separator string + csv_navalue string paired_filename string } @@ -40,11 +45,16 @@ func MakeOptions(setters []WithOption) Options { closefile: false, appendfile: false, compressed: false, - csv_ids: true, + csv_id: true, csv_definition: false, - cvs_sequence: true, + csv_count: false, + csv_taxon: false, + csv_sequence: 
true, + csv_quality: false, csv_separator: ",", - paired_filename: "", + csv_navalue: "NA", + csv_keys: make([]string, 0), + paired_filename: "", } opt := Options{&o} @@ -60,10 +70,6 @@ func (opt Options) QualityShift() int { return opt.pointer.quality_shift } -func (opt Options) BufferSize() int { - return opt.pointer.buffer_size -} - func (opt Options) BatchSize() int { return opt.pointer.batch_size } @@ -96,8 +102,40 @@ func (opt Options) CompressedFile() bool { return opt.pointer.compressed } -func (opt Options) CSVIds() bool { - return opt.pointer.csv_ids +func (opt Options) CSVId() bool { + return opt.pointer.csv_id +} + +func (opt Options) CSVDefinition() bool { + return opt.pointer.csv_definition +} + +func (opt Options) CSVCount() bool { + return opt.pointer.csv_count +} + +func (opt Options) CSVTaxon() bool { + return opt.pointer.csv_taxon +} + +func (opt Options) CSVSequence() bool { + return opt.pointer.csv_sequence +} + +func (opt Options) CSVQuality() bool { + return opt.pointer.csv_quality +} + +func (opt Options) CSVKeys() []string { + return opt.pointer.csv_keys +} + +func (opt Options) CSVSeparator() string { + return opt.pointer.csv_separator +} + +func (opt Options) CSVNAValue() string { + return opt.pointer.csv_navalue } func (opt Options) HaveToSavePaired() bool { @@ -108,14 +146,6 @@ func (opt Options) PairedFileName() string { return opt.pointer.paired_filename } -func OptionsBufferSize(size int) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.buffer_size = size - }) - - return f -} - func OptionCloseFile() WithOption { f := WithOption(func(opt Options) { opt.pointer.closefile = true @@ -247,3 +277,82 @@ func WritePairedReadsTo(filename string) WithOption { return f } +func CSVId(include bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_id = include + }) + + return f +} + +func CSVSequence(include bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_sequence = include + }) + + 
return f +} + +func CSVQuality(include bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_quality = include + }) + + return f +} + +func CSVDefinition(include bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_definition = include + }) + + return f +} + +func CSVCount(include bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_count = include + }) + + return f +} + +func CSVTaxon(include bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_taxon = include + }) + + return f +} + +func CSVKey(key string) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_keys = append(opt.pointer.csv_keys, key) + }) + + return f +} + +func CSVKeys(keys []string) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_keys = append(opt.pointer.csv_keys, keys...) + }) + + return f +} + +func CSVSeparator(separator string) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_separator = separator + }) + + return f +} + +func CSVNAValue(navalue string) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_navalue = navalue + }) + + return f +} diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index f2d5263..8782b87 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -60,17 +60,11 @@ type IBioSequence struct { var NilIBioSequence = IBioSequence{pointer: nil} func MakeIBioSequence(sizes ...int) IBioSequence { - buffsize := int32(0) - - if len(sizes) > 0 { - buffsize = int32(sizes[0]) - } i := _IBioSequence{ - channel: make(chan BioSequenceBatch, buffsize), + channel: make(chan BioSequenceBatch), current: NilBioSequenceBatch, pushBack: abool.New(), - buffer_size: buffsize, batch_size: -1, sequence_format: "", finished: abool.New(), @@ -160,14 +154,6 @@ func (iterator IBioSequence) IsNil() bool { return iterator.pointer == nil } -func (iterator IBioSequence) BufferSize() int { - if 
iterator.pointer == nil { - log.Panic("call of IBioSequenceBatch.BufferSize method on NilIBioSequenceBatch") - } - - return int(atomic.LoadInt32(&iterator.pointer.buffer_size)) -} - func (iterator IBioSequence) BatchSize() int { if iterator.pointer == nil { log.Panic("call of IBioSequenceBatch.BatchSize method on NilIBioSequenceBatch") @@ -279,13 +265,8 @@ func (iterator IBioSequence) Finished() bool { // Sorting the batches of sequences. func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence { - buffsize := iterator.BufferSize() - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(1) @@ -338,8 +319,7 @@ func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence { allPaired = allPaired && i.IsPaired() } - buffsize := iterator.BufferSize() - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(1) @@ -396,8 +376,7 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence { } nextCounter := goutils.AtomicCounter() - buffsize := iterator.BufferSize() - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(niterator) @@ -431,13 +410,8 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence { // indicated in parameter. Rebatching implies to sort the // source IBioSequenceBatch. func (iterator IBioSequence) Rebatch(size int, sizes ...int) IBioSequence { - buffsize := iterator.BufferSize() - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(1) @@ -532,14 +506,9 @@ func (iterator IBioSequence) Count(recycle bool) (int, int, int) { // iterator following the predicate value. 
func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate, size int, sizes ...int) (IBioSequence, IBioSequence) { - buffsize := iterator.BufferSize() - if len(sizes) > 0 { - buffsize = sizes[0] - } - - trueIter := MakeIBioSequence(buffsize) - falseIter := MakeIBioSequence(buffsize) + trueIter := MakeIBioSequence() + falseIter := MakeIBioSequence() trueIter.Add(1) falseIter.Add(1) @@ -604,18 +573,13 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate, // A function that takes a predicate and a batch of sequences and returns a filtered batch of sequences. func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate, size int, sizes ...int) IBioSequence { - buffsize := iterator.BufferSize() nworkers := 4 if len(sizes) > 0 { nworkers = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - - trueIter := MakeIBioSequence(buffsize) + trueIter := MakeIBioSequence() trueIter.Add(nworkers) @@ -661,18 +625,13 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate, func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate, size int, sizes ...int) IBioSequence { - buffsize := iterator.BufferSize() nworkers := 4 if len(sizes) > 0 { nworkers = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - - trueIter := MakeIBioSequence(buffsize) + trueIter := MakeIBioSequence() trueIter.Add(nworkers) @@ -740,13 +699,7 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice { func IBatchOver(data obiseq.BioSequenceSlice, size int, sizes ...int) IBioSequence { - buffsize := 0 - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(1) diff --git a/pkg/obiiter/distribute.go b/pkg/obiiter/distribute.go index c1b453f..45e755a 100644 --- a/pkg/obiiter/distribute.go +++ b/pkg/obiiter/distribute.go @@ -36,7 +36,6 @@ func (dist *IDistribute) Classifier() *obiseq.BioSequenceClassifier { func (iterator 
IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, sizes ...int) IDistribute { batchsize := 5000 - buffsize := 2 outputs := make(map[int]IBioSequence, 100) slices := make(map[int]*obiseq.BioSequenceSlice, 100) @@ -47,9 +46,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz batchsize = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } + jobDone := sync.WaitGroup{} lock := sync.Mutex{} @@ -80,7 +77,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz orders[key] = 0 lock.Lock() - outputs[key] = MakeIBioSequence(buffsize) + outputs[key] = MakeIBioSequence() lock.Unlock() news <- key diff --git a/pkg/obiiter/merge.go b/pkg/obiiter/merge.go index f6bed9b..8a24d7a 100644 --- a/pkg/obiiter/merge.go +++ b/pkg/obiiter/merge.go @@ -4,16 +4,12 @@ import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequence { batchsize := 100 - buffsize := iterator.BufferSize() if len(sizes) > 0 { batchsize = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(1) diff --git a/pkg/obiiter/workers.go b/pkg/obiiter/workers.go index 1e2c503..8003851 100644 --- a/pkg/obiiter/workers.go +++ b/pkg/obiiter/workers.go @@ -6,7 +6,6 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) - // That method allows for applying a SeqWorker function on every sequences. // // Sequences are provided by the iterator and modified sequences are pushed @@ -17,17 +16,12 @@ import ( // - The second the size of the chanel buffer. By default set to the same value than the input buffer. 
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence { nworkers := 4 - buffsize := iterator.BufferSize() if len(sizes) > 0 { nworkers = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(nworkers) @@ -64,17 +58,12 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePredicate, worker obiseq.SeqWorker, sizes ...int) IBioSequence { nworkers := 4 - buffsize := iterator.BufferSize() if len(sizes) > 0 { nworkers = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(nworkers) @@ -112,17 +101,12 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, sizes ...int) IBioSequence { nworkers := 4 - buffsize := iterator.BufferSize() if len(sizes) > 0 { nworkers = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - - newIter := MakeIBioSequence(buffsize) + newIter := MakeIBioSequence() newIter.Add(nworkers) @@ -140,7 +124,7 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size newIter.Done() } - log.Printf("Start of the batch slice workers on %d workers (buffer : %d)\n", nworkers, buffsize) + log.Printf("Start of the batch slice workers on %d workers\n", nworkers) for i := 0; i < nworkers-1; i++ { go f(iterator.Split()) } @@ -168,4 +152,3 @@ func SliceWorkerPipe(worker obiseq.SeqSliceWorker, sizes ...int) Pipeable { return f } - diff --git a/pkg/obingslibrary/worker.go b/pkg/obingslibrary/worker.go index fb57f0f..df9ad26 100644 --- a/pkg/obingslibrary/worker.go +++ b/pkg/obingslibrary/worker.go @@ -11,7 +11,6 @@ type _Options struct { withProgressBar bool parallelWorkers int batchSize int - bufferSize int } // 
Options stores a set of option usable by the @@ -56,16 +55,6 @@ func OptionAllowedMismatches(count int) WithOption { return f } -// OptionBufferSize sets the requested channel -// buffer size. -func OptionBufferSize(size int) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.bufferSize = size - }) - - return f -} - // OptionParallelWorkers sets how many search // jobs will be run in parallel. func OptionParallelWorkers(nworkers int) WithOption { @@ -102,12 +91,6 @@ func (options Options) WithProgressBar() bool { return options.pointer.withProgressBar } -// BufferSize returns the size of the channel -// buffer specified by the options -func (options Options) BufferSize() int { - return options.pointer.bufferSize -} - // BatchSize returns the size of the // sequence batch used by the PCR algorithm func (options Options) BatchSize() int { @@ -130,7 +113,6 @@ func MakeOptions(setters []WithOption) Options { withProgressBar: false, parallelWorkers: 4, batchSize: 1000, - bufferSize: 100, } opt := Options{&o} diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index 1366aae..8212bb7 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -11,12 +11,11 @@ import ( ) var _Debug = false -var _ParallelWorkers = runtime.NumCPU() * 2 - 1 +var _ParallelWorkers = runtime.NumCPU()*2 - 1 var _MaxAllowedCPU = runtime.NumCPU() -var _BufferSize = 1 var _BatchSize = 5000 -type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error) +type ArgumentParser func([]string) (*getoptions.GetOpt, []string) func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser { @@ -38,16 +37,20 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser o(options) } - return func(args []string) (*getoptions.GetOpt, []string, error) { + return func(args []string) (*getoptions.GetOpt, []string) { remaining, err := options.Parse(args[1:]) + if err != nil { + log.Fatalf("Error on the commande line : %v",err) 
+ } + // Setup the maximum number of CPU usable by the program runtime.GOMAXPROCS(_MaxAllowedCPU) if options.Called("max-cpu") { log.Printf("CPU number limited to %d", _MaxAllowedCPU) - if ! options.Called("workers") { - _ParallelWorkers=_MaxAllowedCPU * 2 - 1 + if !options.Called("workers") { + _ParallelWorkers = _MaxAllowedCPU*2 - 1 log.Printf("Number of workers set %d", _ParallelWorkers) } } @@ -67,7 +70,7 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser log.Debugln("Switch to debug level logging") } - return options, remaining, err + return options, remaining } } @@ -88,11 +91,6 @@ func CLIMaxCPU() int { return _MaxAllowedCPU } -// CLIBufferSize returns the expeted channel buffer size for obitools -func CLIBufferSize() int { - return _BufferSize -} - // CLIBatchSize returns the expeted size of the sequence batches func CLIBatchSize() int { return _BatchSize diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index cbdc82b..f78ea6a 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -8,6 +8,15 @@ import ( log "github.com/sirupsen/logrus" ) +func (s *BioSequence) HasAttribute(key string) bool { + ok := s.annotations != nil + + if ok { + _, ok = s.annotations[key] + } + + return ok +} // A method that returns the value of the key in the annotation map. 
func (s *BioSequence) GetAttribute(key string) (interface{}, bool) { var val interface{} diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index a1d4b10..f399cd1 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -278,3 +278,28 @@ func (s *BioSequence) Clear() { s.sequence = s.sequence[0:0] } +func (s *BioSequence) Composition() map[byte]int { + + a := 0 + c := 0 + g := 0 + t := 0 + other := 0 + for _, char := range s.sequence { + switch char { + case 'a': + a++ + case 'c': + c++ + case 'g': + g++ + case 't': + t++ + default: + other++ + + } + } + + return map[byte]int{'a': a, 'c': c, 'g': g, 't': t, 'o': other} +} diff --git a/pkg/obiseq/class.go b/pkg/obiseq/class.go index 3608d2d..ba711c1 100644 --- a/pkg/obiseq/class.go +++ b/pkg/obiseq/class.go @@ -316,3 +316,4 @@ func RotateClassifier(size int) *BioSequenceClassifier { c := BioSequenceClassifier{code, value, reset, clone,"RotateClassifier"} return &c } + diff --git a/pkg/obiseq/eval.go b/pkg/obiseq/eval.go index 8e007f6..a919142 100644 --- a/pkg/obiseq/eval.go +++ b/pkg/obiseq/eval.go @@ -4,22 +4,21 @@ import ( "context" "fmt" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obieval" log "github.com/sirupsen/logrus" ) -func Expression(expression string) func(*BioSequence) (interface{},error) { +func Expression(expression string) func(*BioSequence) (interface{}, error) { - exp, err := obieval.OBILang.NewEvaluable(expression) + exp, err := OBILang.NewEvaluable(expression) if err != nil { log.Fatalf("Error in the expression : %s", expression) } - f := func(sequence *BioSequence) (interface{},error) { + f := func(sequence *BioSequence) (interface{}, error) { return exp(context.Background(), map[string]interface{}{ - "annotations": sequence.Annotations(), - "sequence": sequence, + "annotations": sequence.Annotations(), + "sequence": sequence, }, ) } @@ -30,14 +29,14 @@ func Expression(expression string) func(*BioSequence) (interface{},error) { func EditIdWorker(expression 
string) SeqWorker { e := Expression(expression) f := func(sequence *BioSequence) *BioSequence { - v,err := e(sequence) + v, err := e(sequence) if err != nil { log.Fatalf("Expression '%s' cannot be evaluated on sequence %s", expression, sequence.Id()) } - sequence.SetId(fmt.Sprintf("%v",v)) + sequence.SetId(fmt.Sprintf("%v", v)) return sequence } @@ -47,16 +46,16 @@ func EditIdWorker(expression string) SeqWorker { func EditAttributeWorker(key string, expression string) SeqWorker { e := Expression(expression) f := func(sequence *BioSequence) *BioSequence { - v,err := e(sequence) + v, err := e(sequence) if err != nil { log.Fatalf("Expression '%s' cannot be evaluated on sequence %s", expression, sequence.Id()) } - sequence.SetAttribute(key,v) + sequence.SetAttribute(key, v) return sequence } return f -} \ No newline at end of file +} diff --git a/pkg/obieval/language.go b/pkg/obiseq/language.go similarity index 84% rename from pkg/obieval/language.go rename to pkg/obiseq/language.go index 74b4784..473a52a 100644 --- a/pkg/obieval/language.go +++ b/pkg/obiseq/language.go @@ -1,4 +1,4 @@ -package obieval +package obiseq import ( "fmt" @@ -174,8 +174,19 @@ var OBILang = gval.NewLanguage( log.Fatalf("%v cannot be converted to a boolan value", args[0]) } return val, nil + }), + gval.Function("ifelse", func(args ...interface{}) (interface{}, error) { + if args[0].(bool) { + return args[1], nil + } else { + return args[2], nil + } + }), + gval.Function("gcskew", func(args ...interface{}) (interface{}, error) { + composition := (args[0].(*BioSequence)).Composition() + return float64(composition['g']-composition['c']) / float64(composition['g']+composition['c']), nil + }), + gval.Function("composition", func(args ...interface{}) (interface{}, error) { + return (args[0].(*BioSequence)).Composition(), nil })) -func Expression(expression string) (gval.Evaluable, error) { - return OBILang.NewEvaluable(expression) -} diff --git a/pkg/obiseq/predicate.go b/pkg/obiseq/predicate.go 
index 31ddf48..0f00080 100644 --- a/pkg/obiseq/predicate.go +++ b/pkg/obiseq/predicate.go @@ -5,7 +5,6 @@ import ( "fmt" "regexp" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obieval" log "github.com/sirupsen/logrus" ) @@ -256,7 +255,7 @@ func IsIdIn(ids ...string) SequencePredicate { func ExpressionPredicat(expression string) SequencePredicate { - exp, err := obieval.OBILang.NewEvaluable(expression) + exp, err := OBILang.NewEvaluable(expression) if err != nil { log.Fatalf("Error in the expression : %s", expression) } diff --git a/pkg/obitools/obicleandb/obicleandb.go b/pkg/obitools/obicleandb/obicleandb.go new file mode 100644 index 0000000..64e3752 --- /dev/null +++ b/pkg/obitools/obicleandb/obicleandb.go @@ -0,0 +1,63 @@ +package obicleandb + +import ( + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obichunk" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep" +) + +func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence { + var rankPredicate obiseq.SequencePredicate + + options := make([]obichunk.WithOption, 0, 30) + + // Make sequence dereplication with a constraint on the taxid. + // To be merged, both sequences must have the same taxid. + + options = append(options, + obichunk.OptionBatchCount(100), + obichunk.OptionSortOnMemory(), + obichunk.OptionSubCategory("taxid"), + obichunk.OptionsParallelWorkers( + obioptions.CLIParallelWorkers()), + obichunk.OptionsBatchSize( + obioptions.CLIBatchSize()), + obichunk.OptionNAValue("NA"), + ) + + unique, err := obichunk.IUniqueSequence(itertator, options...) 
+ + if err != nil { + log.Fatal(err) + } + + taxonomy := obigrep.CLILoadSelectedTaxonomy() + + if len(obigrep.CLIRequiredRanks()) > 0 { + rankPredicate = obigrep.CLIHasRankDefinedPredicate() + } else { + rankPredicate = taxonomy.HasRequiredRank("species").And(taxonomy.HasRequiredRank("genus")).And(taxonomy.HasRequiredRank("family")) + } + + goodTaxa := taxonomy.IsAValidTaxon(CLIUpdateTaxids()).And(rankPredicate) + + usable := unique.FilterOn(goodTaxa, + obioptions.CLIBatchSize(), + obioptions.CLIParallelWorkers()) + + annotated := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(), + obioptions.CLIParallelWorkers(), + ).MakeIWorker(taxonomy.MakeSetGenusWorker(), + obioptions.CLIParallelWorkers(), + ).MakeIWorker(taxonomy.MakeSetFamilyWorker(), + obioptions.CLIParallelWorkers(), + ) + + // annotated.MakeIConditionalWorker(obiseq.IsMoreAbundantOrEqualTo(3),1000) + + return annotated +} diff --git a/pkg/obitools/obiconvert/options.go b/pkg/obitools/obiconvert/options.go index 7a4792d..006a390 100644 --- a/pkg/obitools/obiconvert/options.go +++ b/pkg/obitools/obiconvert/options.go @@ -60,6 +60,21 @@ func InputOptionSet(options *getoptions.GetOpt) { } +func OutputModeOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&__no_progress_bar__, "no-progressbar", false, + options.Description("Disable the progress bar printing")) + + options.BoolVar(&__compressed__, "compress", false, + options.Alias("Z"), + options.Description("Output is compressed")) + + options.StringVar(&__output_file_name__, "out", __output_file_name__, + options.Alias("o"), + options.ArgName("FILENAME"), + options.Description("Filename used for saving the output"), + ) +} + func OutputOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__output_in_fasta__, "fasta-output", false, options.Description("Read data following the ecoPCR output format.")) @@ -73,19 +88,7 @@ func OutputOptionSet(options *getoptions.GetOpt) { options.Alias("O"), options.Description("output FASTA/FASTQ title line 
annotations follow OBI format.")) - options.BoolVar(&__no_progress_bar__, "no-progressbar", false, - options.Description("Disable the progress bar printing")) - - options.BoolVar(&__compressed__, "compress", false, - options.Alias("Z"), - options.Description("Output is compressed")) - - options.StringVar(&__output_file_name__, "out", __output_file_name__, - options.Alias("o"), - options.ArgName("FILENAME"), - options.Description("Filename used for saving the output"), - ) - + OutputModeOptionSet(options) } func PairedFilesOptionSet(options *getoptions.GetOpt) { @@ -197,4 +200,4 @@ func CLIHasPairedFile() bool { } func CLIPairedFileName() string { return __paired_file_name__ -} \ No newline at end of file +} diff --git a/pkg/obitools/obiconvert/sequence_reader.go b/pkg/obitools/obiconvert/sequence_reader.go index 1d20962..c17f3b5 100644 --- a/pkg/obitools/obiconvert/sequence_reader.go +++ b/pkg/obitools/obiconvert/sequence_reader.go @@ -48,6 +48,10 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) { strings.HasSuffix(path, "fasta.gz") || strings.HasSuffix(path, "fastq") || strings.HasSuffix(path, "fastq.gz") || + strings.HasSuffix(path, "seq") || + strings.HasSuffix(path, "seq.gz") || + strings.HasSuffix(path, "gb") || + strings.HasSuffix(path, "gb.gz") || strings.HasSuffix(path, "dat") || strings.HasSuffix(path, "dat.gz") || strings.HasSuffix(path, "ecopcr") || @@ -82,13 +86,12 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) { opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader)) } - nworkers := obioptions.CLIParallelWorkers() // / 4 + nworkers := obioptions.CLIParallelWorkers() if nworkers < 2 { nworkers = 2 } opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) - opts = append(opts, obiformats.OptionsBufferSize(obioptions.CLIBufferSize())) opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) opts = append(opts, 
obiformats.OptionsQualityShift(CLIInputQualityShift())) diff --git a/pkg/obitools/obiconvert/sequence_writer.go b/pkg/obitools/obiconvert/sequence_writer.go index 01b1882..d5ef634 100644 --- a/pkg/obitools/obiconvert/sequence_writer.go +++ b/pkg/obitools/obiconvert/sequence_writer.go @@ -60,7 +60,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, } opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) - opts = append(opts, obiformats.OptionsBufferSize(obioptions.CLIBufferSize())) opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) opts = append(opts, obiformats.OptionsQualityShift(CLIOutputQualityShift())) diff --git a/pkg/obitools/obicsv/obicsv.go b/pkg/obitools/obicsv/obicsv.go new file mode 100644 index 0000000..0e66565 --- /dev/null +++ b/pkg/obitools/obicsv/obicsv.go @@ -0,0 +1,61 @@ +package obicsv + +import ( + "log" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" +) + +func CLIWriteCSV(iterator obiiter.IBioSequence, + terminalAction bool, filenames ...string) (obiiter.IBioSequence, error) { + + if obiconvert.CLIProgressBar() { + iterator = iterator.Speed() + } + + var newIter obiiter.IBioSequence + + opts := make([]obiformats.WithOption, 0, 10) + + nworkers := obioptions.CLIParallelWorkers() / 4 + if nworkers < 2 { + nworkers = 2 + } + + opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) + opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) + + opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift())) + opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed())) + + opts = append(opts, obiformats.CSVId(CLIPrintId()), + obiformats.CSVCount(CLIPrintCount()), + obiformats.CSVTaxon(CLIPrintTaxon()), + 
obiformats.CSVDefinition(CLIPrintDefinition()), + obiformats.CSVKeys(CLIToBeKeptAttributes()), + ) + + var err error + + if len(filenames) == 0 { + newIter, err = obiformats.WriteCSVToStdout(iterator, opts...) + } else { + newIter, err = obiformats.WriteCSVToFile(iterator, filenames[0], opts...) + } + + if err != nil { + log.Fatalf("Write file error: %v", err) + return obiiter.NilIBioSequence, err + } + + if terminalAction { + newIter.Recycle() + return obiiter.NilIBioSequence, nil + } + + return newIter, nil + +} diff --git a/pkg/obitools/obicsv/options.go b/pkg/obitools/obicsv/options.go new file mode 100644 index 0000000..c4119d6 --- /dev/null +++ b/pkg/obitools/obicsv/options.go @@ -0,0 +1,126 @@ +package obicsv + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _outputIds = true +var _outputCount = false +var _outputTaxon = false +var _outputSequence = true +var _outputQuality = true +var _outputDefinition = false +var _obipairing = false +var _autoColumns = false +var _keepOnly = make([]string, 0) +var _naValue = "NA" + +var _softAttributes = map[string][]string{ + "obipairing": {"mode", "seq_a_single", "seq_b_single", + "ali_dir", "score", "score_norm", + "seq_ab_match", "pairing_mismatches", + }, +} + +func CSVOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&_outputIds, "ids", _outputIds, + options.Alias("i"), + options.Description("Prints sequence ids in the ouput.")) + + options.BoolVar(&_outputSequence, "sequence", _outputSequence, + options.Alias("s"), + options.Description("Prints sequence itself in the output.")) + + options.BoolVar(&_outputQuality, "quality", _outputQuality, + options.Alias("q"), + options.Description("Prints sequence quality in the output.")) + + options.BoolVar(&_outputDefinition, "definition", _outputDefinition, + options.Alias("d"), + options.Description("Prints sequence 
definition in the output.")) + + options.BoolVar(&_autoColumns, "auto", _autoColumns, + options.Description("Based on the first sequences, propose a list of attibutes to print")) + + options.BoolVar(&_outputCount, "count", _outputCount, + options.Description("Prints the count attribute in the output")) + + options.BoolVar(&_outputTaxon, "taxon", _outputTaxon, + options.Description("Prints the NCBI taxid and its related scientific name")) + + options.BoolVar(&_obipairing, "obipairing", _obipairing, + options.Description("Prints the attributes added by obipairing")) + + options.StringSliceVar(&_keepOnly, "keep", 1, 1, + options.Alias("k"), + options.ArgName("KEY"), + options.Description("Keeps only attribute with key . Several -k options can be combined.")) + + options.StringVar(&_naValue, "na-value", _naValue, + options.ArgName("NAVALUE"), + options.Description("A string representing non available values in the CSV file.")) +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OutputModeOptionSet(options) + CSVOptionSet(options) +} + +func CLIPrintId() bool { + return _outputIds +} + +func CLIPrintSequence() bool { + return _outputSequence +} + +func CLIPrintCount() bool { + return _outputCount +} +func CLIPrintTaxon() bool { + return _outputTaxon +} +func CLIPrintQuality() bool { + return _outputQuality +} + +func CLIPrintDefinition() bool { + return _outputDefinition +} + +func CLIAutoColumns() bool { + return _autoColumns +} + +func CLIHasToBeKeptAttributes() bool { + return len(_keepOnly) > 0 +} + +func CLIToBeKeptAttributes() []string { + if _obipairing { + _keepOnly = append(_keepOnly, _softAttributes["obipairing"]...) 
+ } + + if i := goutils.LookFor(_keepOnly, "count"); i >= 0 { + _keepOnly = goutils.RemoveIndex(_keepOnly, i) + _outputCount = true + } + + if i := goutils.LookFor(_keepOnly, "taxid"); i >= 0 { + _keepOnly = goutils.RemoveIndex(_keepOnly, i) + _outputTaxon = true + } + + if i := goutils.LookFor(_keepOnly, "scientific_name"); i >= 0 { + _keepOnly = goutils.RemoveIndex(_keepOnly, i) + _outputTaxon = true + } + + return _keepOnly +} + +func CLINAValue() string { + return _naValue +} diff --git a/pkg/obitools/obidistribute/distribute.go b/pkg/obitools/obidistribute/distribute.go index 88a77b9..beb5a9c 100644 --- a/pkg/obitools/obidistribute/distribute.go +++ b/pkg/obitools/obidistribute/distribute.go @@ -31,7 +31,6 @@ func DistributeSequence(sequences obiiter.IBioSequence) { } opts = append(opts, obiformats.OptionsParallelWorkers(nworkers), - obiformats.OptionsBufferSize(obioptions.CLIBufferSize()), obiformats.OptionsBatchSize(obioptions.CLIBatchSize()), obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()), obiformats.OptionsAppendFile(CLIAppendSequences()), diff --git a/pkg/obitools/obigrep/grep.go b/pkg/obitools/obigrep/grep.go index d3e474c..a358061 100644 --- a/pkg/obitools/obigrep/grep.go +++ b/pkg/obitools/obigrep/grep.go @@ -39,7 +39,6 @@ func CLIFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence { newIter = iterator.FilterOn(predicate, obioptions.CLIBatchSize(), obioptions.CLIParallelWorkers(), - obioptions.CLIBufferSize(), ) } } else { diff --git a/pkg/obitools/obimultiplex/demultiplex.go b/pkg/obitools/obimultiplex/demultiplex.go index 5ba527c..69b8a9c 100644 --- a/pkg/obitools/obimultiplex/demultiplex.go +++ b/pkg/obitools/obimultiplex/demultiplex.go @@ -20,7 +20,6 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error obingslibrary.OptionDiscardErrors(!CLIConservedErrors()), obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()), 
obingslibrary.OptionBatchSize(obioptions.CLIBatchSize()), - obingslibrary.OptionBufferSize(obioptions.CLIBufferSize()), ) ngsfilter, err := CLINGSFIlter() diff --git a/pkg/obitools/obipairing/pairing.go b/pkg/obitools/obipairing/pairing.go index 36cf231..f1ecf93 100644 --- a/pkg/obitools/obipairing/pairing.go +++ b/pkg/obitools/obipairing/pairing.go @@ -211,17 +211,13 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence, } nworkers := obioptions.CLIMaxCPU() * 3 / 2 - buffsize := iterator.BufferSize() if len(sizes) > 0 { nworkers = sizes[0] } - if len(sizes) > 1 { - buffsize = sizes[1] - } - newIter := obiiter.MakeIBioSequence(buffsize) + newIter := obiiter.MakeIBioSequence() newIter.Add(nworkers) diff --git a/pkg/obitools/obiuniq/unique.go b/pkg/obitools/obiuniq/unique.go index fc1ac26..14d8e26 100644 --- a/pkg/obitools/obiuniq/unique.go +++ b/pkg/obitools/obiuniq/unique.go @@ -51,8 +51,6 @@ func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence { options = append(options, obichunk.OptionsParallelWorkers( obioptions.CLIParallelWorkers()), - obichunk.OptionsBufferSize( - obioptions.CLIBufferSize()), obichunk.OptionsBatchSize( obioptions.CLIBatchSize()), obichunk.OptionNAValue(CLINAValue()),