mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Merge pull request #82 from metabarcoding/push-vkprtnlyxmkl
Push vkprtnlyxmkl
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -31,3 +31,5 @@ LLM/**
|
|||||||
*_files
|
*_files
|
||||||
|
|
||||||
entropy.html
|
entropy.html
|
||||||
|
bug_id.txt
|
||||||
|
obilowmask_ref
|
||||||
|
|||||||
19
Makefile
19
Makefile
@@ -133,14 +133,23 @@ jjpush:
|
|||||||
@jj auto-describe
|
@jj auto-describe
|
||||||
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
||||||
@jj new
|
@jj new
|
||||||
@$(MAKE) bump-version
|
@previous_version=$$(cat version.txt); \
|
||||||
@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
|
$(MAKE) bump-version; \
|
||||||
@jj auto-describe
|
version=$$(cat version.txt); \
|
||||||
@version=$$(cat version.txt); \
|
|
||||||
tag_name="Release_$$version"; \
|
tag_name="Release_$$version"; \
|
||||||
|
previous_tag="Release_$$previous_version"; \
|
||||||
|
echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
|
||||||
|
jj auto-describe; \
|
||||||
|
echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
|
||||||
|
if command -v orla >/dev/null 2>&1; then \
|
||||||
|
release_message=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
|
||||||
|
orla agent -m ollama:qwen3-coder-next:latest "synthétise en anglais l'ensemble des commits présentés en un message de nouvelle version pour ma page GitHub. Tu détailleras précisément les changements, sans exposer de code, et tu éviteras toute redondance dans le texte généré."); \
|
||||||
|
else \
|
||||||
|
release_message="Release $$version"; \
|
||||||
|
fi; \
|
||||||
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
|
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
|
||||||
jj git push --change @; \
|
jj git push --change @; \
|
||||||
git tag -a "$$tag_name" -m "Release $$version" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
||||||
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
|
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
|
||||||
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
|
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
|
||||||
|
|
||||||
|
|||||||
735
blackboard/architechture/architecture-commande-obitools.md
Normal file
735
blackboard/architechture/architecture-commande-obitools.md
Normal file
@@ -0,0 +1,735 @@
|
|||||||
|
# Architecture d'une commande OBITools
|
||||||
|
|
||||||
|
## Vue d'ensemble
|
||||||
|
|
||||||
|
Une commande OBITools suit une architecture modulaire et standardisée qui sépare clairement les responsabilités entre :
|
||||||
|
- Le package de la commande dans `pkg/obitools/<nom_commande>/`
|
||||||
|
- L'exécutable dans `cmd/obitools/<nom_commande>/`
|
||||||
|
|
||||||
|
Cette architecture favorise la réutilisabilité du code, la testabilité et la cohérence entre les différentes commandes de la suite OBITools.
|
||||||
|
|
||||||
|
## Structure du projet
|
||||||
|
|
||||||
|
```
|
||||||
|
obitools4/
|
||||||
|
├── pkg/obitools/
|
||||||
|
│ ├── obiconvert/ # Commande de conversion (base pour toutes)
|
||||||
|
│ │ ├── obiconvert.go # Fonctions vides (pas d'implémentation)
|
||||||
|
│ │ ├── options.go # Définition des options CLI
|
||||||
|
│ │ ├── sequence_reader.go # Lecture des séquences
|
||||||
|
│ │ └── sequence_writer.go # Écriture des séquences
|
||||||
|
│ ├── obiuniq/ # Commande de déréplication
|
||||||
|
│ │ ├── obiuniq.go # (fichier vide)
|
||||||
|
│ │ ├── options.go # Options spécifiques à obiuniq
|
||||||
|
│ │ └── unique.go # Implémentation du traitement
|
||||||
|
│ ├── obipairing/ # Assemblage de lectures paired-end
|
||||||
|
│ ├── obisummary/ # Résumé de fichiers de séquences
|
||||||
|
│ └── obimicrosat/ # Détection de microsatellites
|
||||||
|
└── cmd/obitools/
|
||||||
|
├── obiconvert/
|
||||||
|
│ └── main.go # Point d'entrée de la commande
|
||||||
|
├── obiuniq/
|
||||||
|
│ └── main.go
|
||||||
|
├── obipairing/
|
||||||
|
│ └── main.go
|
||||||
|
├── obisummary/
|
||||||
|
│ └── main.go
|
||||||
|
└── obimicrosat/
|
||||||
|
└── main.go
|
||||||
|
```
|
||||||
|
|
||||||
|
## Composants de l'architecture
|
||||||
|
|
||||||
|
### 1. Package `pkg/obitools/<commande>/`
|
||||||
|
|
||||||
|
Chaque commande possède son propre package dans `pkg/obitools/` qui contient l'implémentation complète de la logique métier. Ce package est structuré en plusieurs fichiers :
|
||||||
|
|
||||||
|
#### a) `options.go` - Gestion des options CLI
|
||||||
|
|
||||||
|
Ce fichier définit :
|
||||||
|
- Les **variables globales** privées (préfixées par `_`) stockant les valeurs des options
|
||||||
|
- La fonction **`OptionSet()`** qui configure toutes les options pour la commande
|
||||||
|
- Les fonctions **`CLI*()`** qui retournent les valeurs des options (getters)
|
||||||
|
- Les fonctions **`Set*()`** qui permettent de définir les options programmatiquement (setters)
|
||||||
|
|
||||||
|
**Exemple (obiuniq/options.go) :**
|
||||||
|
|
||||||
|
```go
|
||||||
|
package obiuniq
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Variables globales privées pour stocker les options
|
||||||
|
var _StatsOn = make([]string, 0, 10)
|
||||||
|
var _Keys = make([]string, 0, 10)
|
||||||
|
var _InMemory = false
|
||||||
|
var _chunks = 100
|
||||||
|
|
||||||
|
// Configuration des options spécifiques à la commande
|
||||||
|
func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.StringSliceVar(&_StatsOn, "merge", 1, 1,
|
||||||
|
options.Alias("m"),
|
||||||
|
options.ArgName("KEY"),
|
||||||
|
options.Description("Adds a merged attribute..."))
|
||||||
|
|
||||||
|
options.BoolVar(&_InMemory, "in-memory", _InMemory,
|
||||||
|
options.Description("Use memory instead of disk..."))
|
||||||
|
|
||||||
|
options.IntVar(&_chunks, "chunk-count", _chunks,
|
||||||
|
options.Description("In how many chunks..."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// OptionSet combine les options de base + les options spécifiques
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.OptionSet(false)(options) // Options de base
|
||||||
|
UniqueOptionSet(options) // Options spécifiques
|
||||||
|
}
|
||||||
|
|
||||||
|
// Getters pour accéder aux valeurs des options
|
||||||
|
func CLIStatsOn() []string {
|
||||||
|
return _StatsOn
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIUniqueInMemory() bool {
|
||||||
|
return _InMemory
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setters pour définir les options programmatiquement
|
||||||
|
func SetUniqueInMemory(inMemory bool) {
|
||||||
|
_InMemory = inMemory
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Convention de nommage :**
|
||||||
|
- Variables privées : `_NomOption` (underscore préfixe)
|
||||||
|
- Getters : `CLINomOption()` (préfixe CLI)
|
||||||
|
- Setters : `SetNomOption()` (préfixe Set)
|
||||||
|
|
||||||
|
#### b) Fichier(s) d'implémentation
|
||||||
|
|
||||||
|
Un ou plusieurs fichiers contenant la logique métier de la commande :
|
||||||
|
|
||||||
|
**Exemple (obiuniq/unique.go) :**
|
||||||
|
|
||||||
|
```go
|
||||||
|
package obiuniq
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obichunk"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Fonction CLI principale qui orchestre le traitement
|
||||||
|
func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
|
// Récupération des options via les getters CLI*()
|
||||||
|
options := make([]obichunk.WithOption, 0, 30)
|
||||||
|
|
||||||
|
options = append(options,
|
||||||
|
obichunk.OptionBatchCount(CLINumberOfChunks()),
|
||||||
|
)
|
||||||
|
|
||||||
|
if CLIUniqueInMemory() {
|
||||||
|
options = append(options, obichunk.OptionSortOnMemory())
|
||||||
|
} else {
|
||||||
|
options = append(options, obichunk.OptionSortOnDisk())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Appel de la fonction de traitement réelle
|
||||||
|
iUnique, err := obichunk.IUniqueSequence(sequences, options...)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return iUnique
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Autres exemples d'implémentation :**
|
||||||
|
|
||||||
|
- **obimicrosat/microsat.go** : Contient `MakeMicrosatWorker()` et `CLIAnnotateMicrosat()`
|
||||||
|
- **obisummary/obisummary.go** : Contient `ISummary()` et les structures de données
|
||||||
|
|
||||||
|
#### c) Fichiers utilitaires (optionnel)
|
||||||
|
|
||||||
|
Certaines commandes ont des fichiers additionnels pour des fonctionnalités spécifiques.
|
||||||
|
|
||||||
|
**Exemple (obipairing/options.go) :**
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Fonction spéciale pour créer un itérateur de séquences pairées
|
||||||
|
func CLIPairedSequence() (obiiter.IBioSequence, error) {
|
||||||
|
forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
|
||||||
|
if err != nil {
|
||||||
|
return obiiter.NilIBioSequence, err
|
||||||
|
}
|
||||||
|
|
||||||
|
reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
|
||||||
|
if err != nil {
|
||||||
|
return obiiter.NilIBioSequence, err
|
||||||
|
}
|
||||||
|
|
||||||
|
paired := forward.PairTo(reverse)
|
||||||
|
return paired, nil
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Package `obiconvert` - La base commune
|
||||||
|
|
||||||
|
Le package `obiconvert` est spécial car il fournit les fonctionnalités de base utilisées par toutes les autres commandes :
|
||||||
|
|
||||||
|
#### Fonctionnalités fournies :
|
||||||
|
|
||||||
|
1. **Lecture de séquences** (`sequence_reader.go`)
|
||||||
|
- `CLIReadBioSequences()` : lecture depuis fichiers ou stdin
|
||||||
|
- Support de multiples formats (FASTA, FASTQ, EMBL, GenBank, etc.)
|
||||||
|
- Gestion des fichiers multiples
|
||||||
|
- Barre de progression optionnelle
|
||||||
|
|
||||||
|
2. **Écriture de séquences** (`sequence_writer.go`)
|
||||||
|
- `CLIWriteBioSequences()` : écriture vers fichiers ou stdout
|
||||||
|
- Support de multiples formats
|
||||||
|
- Gestion des lectures pairées
|
||||||
|
- Compression optionnelle
|
||||||
|
|
||||||
|
3. **Options communes** (`options.go`)
|
||||||
|
- Options d'entrée (format, skip, etc.)
|
||||||
|
- Options de sortie (format, fichier, compression)
|
||||||
|
- Options de mode (barre de progression, etc.)
|
||||||
|
|
||||||
|
#### Utilisation par les autres commandes :
|
||||||
|
|
||||||
|
Toutes les commandes incluent les options de `obiconvert` via :
|
||||||
|
|
||||||
|
```go
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.OptionSet(false)(options) // false = pas de fichiers pairés
|
||||||
|
MaCommandeOptionSet(options) // Options spécifiques
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Exécutable `cmd/obitools/<commande>/main.go`
|
||||||
|
|
||||||
|
Le fichier `main.go` de chaque commande est volontairement **minimaliste** et suit toujours le même pattern :
|
||||||
|
|
||||||
|
```go
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/macommande"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
// 1. Configuration optionnelle de paramètres par défaut
|
||||||
|
obidefault.SetBatchSize(10)
|
||||||
|
|
||||||
|
// 2. Génération du parser d'options
|
||||||
|
optionParser := obioptions.GenerateOptionParser(
|
||||||
|
"macommande", // Nom de la commande
|
||||||
|
"description de la commande", // Description
|
||||||
|
macommande.OptionSet) // Fonction de configuration des options
|
||||||
|
|
||||||
|
// 3. Parsing des arguments
|
||||||
|
_, args := optionParser(os.Args)
|
||||||
|
|
||||||
|
// 4. Lecture des séquences d'entrée
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||||
|
|
||||||
|
// 5. Traitement spécifique de la commande
|
||||||
|
resultat := macommande.CLITraitement(sequences)
|
||||||
|
|
||||||
|
// 6. Écriture des résultats
|
||||||
|
obiconvert.CLIWriteBioSequences(resultat, true)
|
||||||
|
|
||||||
|
// 7. Attente de la fin du pipeline
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Patterns architecturaux
|
||||||
|
|
||||||
|
### Pattern 1 : Pipeline de traitement de séquences
|
||||||
|
|
||||||
|
La plupart des commandes suivent ce pattern :
|
||||||
|
|
||||||
|
```
|
||||||
|
Lecture → Traitement → Écriture
|
||||||
|
```
|
||||||
|
|
||||||
|
**Exemples :**
|
||||||
|
- **obiconvert** : Lecture → Écriture (conversion de format)
|
||||||
|
- **obiuniq** : Lecture → Déréplication → Écriture
|
||||||
|
- **obimicrosat** : Lecture → Annotation → Filtrage → Écriture
|
||||||
|
|
||||||
|
### Pattern 2 : Traitement avec entrées multiples
|
||||||
|
|
||||||
|
Certaines commandes acceptent plusieurs fichiers d'entrée :
|
||||||
|
|
||||||
|
**obipairing** :
|
||||||
|
```
|
||||||
|
Lecture Forward + Lecture Reverse → Pairing → Assemblage → Écriture
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern 3 : Traitement sans écriture de séquences
|
||||||
|
|
||||||
|
**obisummary** : produit un résumé JSON/YAML au lieu de séquences
|
||||||
|
|
||||||
|
```go
|
||||||
|
func main() {
|
||||||
|
// ... parsing options et lecture ...
|
||||||
|
|
||||||
|
summary := obisummary.ISummary(fs, obisummary.CLIMapSummary())
|
||||||
|
|
||||||
|
// Formatage et affichage direct
|
||||||
|
if obisummary.CLIOutFormat() == "json" {
|
||||||
|
output, _ := json.MarshalIndent(summary, "", " ")
|
||||||
|
fmt.Print(string(output))
|
||||||
|
} else {
|
||||||
|
output, _ := yaml.Marshal(summary)
|
||||||
|
fmt.Print(string(output))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern 4 : Utilisation de Workers
|
||||||
|
|
||||||
|
Les commandes qui transforment des séquences utilisent souvent le pattern Worker :
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Création d'un worker
|
||||||
|
worker := MakeMicrosatWorker(
|
||||||
|
CLIMinUnitLength(),
|
||||||
|
CLIMaxUnitLength(),
|
||||||
|
// ... autres paramètres
|
||||||
|
)
|
||||||
|
|
||||||
|
// Application du worker sur l'itérateur
|
||||||
|
newIter = iterator.MakeIWorker(
|
||||||
|
worker,
|
||||||
|
false, // merge results
|
||||||
|
obidefault.ParallelWorkers() // parallélisation
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Étapes d'implémentation d'une nouvelle commande
|
||||||
|
|
||||||
|
### Étape 1 : Créer le package dans `pkg/obitools/`
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p pkg/obitools/macommande
|
||||||
|
```
|
||||||
|
|
||||||
|
### Étape 2 : Créer `options.go`
|
||||||
|
|
||||||
|
```go
|
||||||
|
package macommande
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Variables privées pour les options
|
||||||
|
var _MonOption = "valeur_par_defaut"
|
||||||
|
|
||||||
|
// Configuration des options spécifiques
|
||||||
|
func MaCommandeOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.StringVar(&_MonOption, "mon-option", _MonOption,
|
||||||
|
options.Alias("o"),
|
||||||
|
options.Description("Description de l'option"))
|
||||||
|
}
|
||||||
|
|
||||||
|
// OptionSet combine options de base + spécifiques
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.OptionSet(false)(options) // false si pas de fichiers pairés
|
||||||
|
MaCommandeOptionSet(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Getters
|
||||||
|
func CLIMonOption() string {
|
||||||
|
return _MonOption
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setters
|
||||||
|
func SetMonOption(value string) {
|
||||||
|
_MonOption = value
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Étape 3 : Créer le fichier d'implémentation
|
||||||
|
|
||||||
|
Créer `macommande.go` (ou un nom plus descriptif) :
|
||||||
|
|
||||||
|
```go
|
||||||
|
package macommande
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Fonction de traitement principale
|
||||||
|
func CLIMaCommande(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
|
// Récupération des options
|
||||||
|
option := CLIMonOption()
|
||||||
|
|
||||||
|
// Implémentation du traitement
|
||||||
|
// ...
|
||||||
|
|
||||||
|
return resultat
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Étape 4 : Créer l'exécutable dans `cmd/obitools/`
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p cmd/obitools/macommande
|
||||||
|
```
|
||||||
|
|
||||||
|
Créer `main.go` :
|
||||||
|
|
||||||
|
```go
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/macommande"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
// Parser d'options
|
||||||
|
optionParser := obioptions.GenerateOptionParser(
|
||||||
|
"macommande",
|
||||||
|
"Description courte de ma commande",
|
||||||
|
macommande.OptionSet)
|
||||||
|
|
||||||
|
_, args := optionParser(os.Args)
|
||||||
|
|
||||||
|
// Lecture
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||||
|
|
||||||
|
// Traitement
|
||||||
|
resultat := macommande.CLIMaCommande(sequences)
|
||||||
|
|
||||||
|
// Écriture
|
||||||
|
obiconvert.CLIWriteBioSequences(resultat, true)
|
||||||
|
|
||||||
|
// Attente
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Étape 5 : Configurations optionnelles
|
||||||
|
|
||||||
|
Dans `main.go`, avant le parsing des options, on peut configurer :
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Taille des batchs de séquences
|
||||||
|
obidefault.SetBatchSize(10)
|
||||||
|
|
||||||
|
// Nombre de workers en lecture (strict)
|
||||||
|
obidefault.SetStrictReadWorker(2)
|
||||||
|
|
||||||
|
// Nombre de workers en écriture
|
||||||
|
obidefault.SetStrictWriteWorker(2)
|
||||||
|
|
||||||
|
// Désactiver la lecture des qualités
|
||||||
|
obidefault.SetReadQualities(false)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Étape 6 : Gestion des erreurs
|
||||||
|
|
||||||
|
Utiliser les fonctions utilitaires pour les messages d'erreur cohérents :
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Pour les erreurs d'ouverture de fichiers
|
||||||
|
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||||
|
|
||||||
|
// Pour les erreurs générales
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Message d'erreur: %v", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Étape 7 : Tests et debugging (optionnel)
|
||||||
|
|
||||||
|
Des commentaires dans le code montrent comment activer le profiling :
|
||||||
|
|
||||||
|
```go
|
||||||
|
// go tool pprof -http=":8000" ./macommande ./cpu.pprof
|
||||||
|
// f, err := os.Create("cpu.pprof")
|
||||||
|
// if err != nil {
|
||||||
|
// log.Fatal(err)
|
||||||
|
// }
|
||||||
|
// pprof.StartCPUProfile(f)
|
||||||
|
// defer pprof.StopCPUProfile()
|
||||||
|
|
||||||
|
// go tool trace cpu.trace
|
||||||
|
// ftrace, err := os.Create("cpu.trace")
|
||||||
|
// if err != nil {
|
||||||
|
// log.Fatal(err)
|
||||||
|
// }
|
||||||
|
// trace.Start(ftrace)
|
||||||
|
// defer trace.Stop()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Bonnes pratiques observées
|
||||||
|
|
||||||
|
### 1. Séparation des responsabilités
|
||||||
|
|
||||||
|
- **`main.go`** : orchestration minimale
|
||||||
|
- **`options.go`** : définition et gestion des options
|
||||||
|
- **Fichiers d'implémentation** : logique métier
|
||||||
|
|
||||||
|
### 2. Convention de nommage cohérente
|
||||||
|
|
||||||
|
- Variables d'options : `_NomOption`
|
||||||
|
- Getters CLI : `CLINomOption()`
|
||||||
|
- Setters : `SetNomOption()`
|
||||||
|
- Fonctions de traitement CLI : `CLITraitement()`
|
||||||
|
|
||||||
|
### 3. Réutilisation du code
|
||||||
|
|
||||||
|
- Toutes les commandes réutilisent `obiconvert` pour l'I/O
|
||||||
|
- Les options communes sont partagées
|
||||||
|
- Les fonctions utilitaires sont centralisées
|
||||||
|
|
||||||
|
### 4. Configuration par défaut
|
||||||
|
|
||||||
|
Les valeurs par défaut sont :
|
||||||
|
- Définies lors de l'initialisation des variables
|
||||||
|
- Modifiables via les options CLI
|
||||||
|
- Modifiables programmatiquement via les setters
|
||||||
|
|
||||||
|
### 5. Gestion des formats
|
||||||
|
|
||||||
|
Support automatique de multiples formats :
|
||||||
|
- FASTA / FASTQ (avec compression gzip)
|
||||||
|
- EMBL / GenBank
|
||||||
|
- ecoPCR
|
||||||
|
- CSV
|
||||||
|
- JSON (avec différents formats d'en-têtes)
|
||||||
|
|
||||||
|
### 6. Parallélisation
|
||||||
|
|
||||||
|
Les commandes utilisent les workers parallèles via :
|
||||||
|
- `obidefault.ParallelWorkers()`
|
||||||
|
- `obidefault.SetStrictReadWorker(n)`
|
||||||
|
- `obidefault.SetStrictWriteWorker(n)`
|
||||||
|
|
||||||
|
### 7. Logging cohérent
|
||||||
|
|
||||||
|
Utilisation de `logrus` pour tous les logs :
|
||||||
|
```go
|
||||||
|
log.Printf("Message informatif")
|
||||||
|
log.Errorf("Message d'erreur: %v", err)
|
||||||
|
log.Fatal(err) // Arrêt du programme
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dépendances principales
|
||||||
|
|
||||||
|
### Packages internes OBITools
|
||||||
|
|
||||||
|
- `pkg/obidefault` : valeurs par défaut et configuration globale
|
||||||
|
- `pkg/obioptions` : génération du parser d'options
|
||||||
|
- `pkg/obiiter` : itérateurs de séquences biologiques
|
||||||
|
- `pkg/obiseq` : structures et fonctions pour séquences biologiques
|
||||||
|
- `pkg/obiformats` : lecture/écriture de différents formats
|
||||||
|
- `pkg/obiutils` : fonctions utilitaires diverses
|
||||||
|
- `pkg/obichunk` : traitement par chunks (pour dereplication, etc.)
|
||||||
|
|
||||||
|
### Packages externes
|
||||||
|
|
||||||
|
- `github.com/DavidGamba/go-getoptions` : parsing des options CLI
|
||||||
|
- `github.com/sirupsen/logrus` : logging structuré
|
||||||
|
- `gopkg.in/yaml.v3` : encodage/décodage YAML
|
||||||
|
- `github.com/dlclark/regexp2` : expressions régulières avancées
|
||||||
|
|
||||||
|
## Cas spéciaux
|
||||||
|
|
||||||
|
### Commande avec fichiers pairés (obipairing)
|
||||||
|
|
||||||
|
```go
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.OutputOptionSet(options)
|
||||||
|
obiconvert.InputOptionSet(options)
|
||||||
|
PairingOptionSet(options) // Options spécifiques au pairing
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIPairedSequence() (obiiter.IBioSequence, error) {
|
||||||
|
forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
|
||||||
|
// ...
|
||||||
|
reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
|
||||||
|
// ...
|
||||||
|
paired := forward.PairTo(reverse)
|
||||||
|
return paired, nil
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Dans `main.go` :
|
||||||
|
```go
|
||||||
|
pairs, err := obipairing.CLIPairedSequence() // Lecture spéciale
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Cannot open file (%v)", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
paired := obipairing.IAssemblePESequencesBatch(
|
||||||
|
pairs,
|
||||||
|
obipairing.CLIGapPenality(),
|
||||||
|
// ... autres paramètres
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Commande sans sortie de séquences (obisummary)
|
||||||
|
|
||||||
|
Au lieu de `obiconvert.CLIWriteBioSequences()`, affichage direct :
|
||||||
|
|
||||||
|
```go
|
||||||
|
summary := obisummary.ISummary(fs, obisummary.CLIMapSummary())
|
||||||
|
|
||||||
|
if obisummary.CLIOutFormat() == "json" {
|
||||||
|
output, _ := json.MarshalIndent(summary, "", " ")
|
||||||
|
fmt.Print(string(output))
|
||||||
|
} else {
|
||||||
|
output, _ := yaml.Marshal(summary)
|
||||||
|
fmt.Print(string(output))
|
||||||
|
}
|
||||||
|
fmt.Printf("\n")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Commande avec Workers personnalisés (obimicrosat)
|
||||||
|
|
||||||
|
```go
|
||||||
|
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
|
// Création du worker
|
||||||
|
worker := MakeMicrosatWorker(
|
||||||
|
CLIMinUnitLength(),
|
||||||
|
CLIMaxUnitLength(),
|
||||||
|
CLIMinUnitCount(),
|
||||||
|
CLIMinLength(),
|
||||||
|
CLIMinFlankLength(),
|
||||||
|
CLIReoriented(),
|
||||||
|
)
|
||||||
|
|
||||||
|
// Application du worker
|
||||||
|
newIter := iterator.MakeIWorker(
|
||||||
|
worker,
|
||||||
|
false, // pas de merge
|
||||||
|
obidefault.ParallelWorkers(), // parallélisation
|
||||||
|
)
|
||||||
|
|
||||||
|
return newIter.FilterEmpty() // Filtrage des résultats vides
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Diagramme de flux d'exécution
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ cmd/obitools/macommande/main.go │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ 1. Génération du parser d'options │
|
||||||
|
│ obioptions.GenerateOptionParser( │
|
||||||
|
│ "macommande", │
|
||||||
|
│ "description", │
|
||||||
|
│ macommande.OptionSet) │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ pkg/obitools/macommande/options.go │
|
||||||
|
│ ┌─────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ func OptionSet(options *getoptions.GetOpt) │ │
|
||||||
|
│ │ obiconvert.OptionSet(false)(options) ───────────┐ │ │
|
||||||
|
│ │ MaCommandeOptionSet(options) │ │ │
|
||||||
|
│ └───────────────────────────────────────────────────┼─┘ │
|
||||||
|
└────────────────────────────────────────────────────────┼─────┘
|
||||||
|
│ │
|
||||||
|
│ │
|
||||||
|
┌─────────────┘ │
|
||||||
|
│ │
|
||||||
|
▼ ▼
|
||||||
|
┌─────────────────────────────────┐ ┌───────────────────────────────┐
|
||||||
|
│ 2. Parsing des arguments │ │ pkg/obitools/obiconvert/ │
|
||||||
|
│ _, args := optionParser(...) │ │ options.go │
|
||||||
|
└─────────────────────────────────┘ │ - InputOptionSet() │
|
||||||
|
│ │ - OutputOptionSet() │
|
||||||
|
▼ │ - PairedFilesOptionSet() │
|
||||||
|
┌─────────────────────────────────┐ └───────────────────────────────┘
|
||||||
|
│ 3. Lecture des séquences │
|
||||||
|
│ CLIReadBioSequences(args) │
|
||||||
|
└─────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ pkg/obitools/obiconvert/sequence_reader.go │
|
||||||
|
│ - ExpandListOfFiles() │
|
||||||
|
│ - ReadSequencesFromFile() / ReadSequencesFromStdin() │
|
||||||
|
│ - Support: FASTA, FASTQ, EMBL, GenBank, ecoPCR, CSV │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼ obiiter.IBioSequence
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ 4. Traitement spécifique │
|
||||||
|
│ macommande.CLITraitement(sequences) │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ pkg/obitools/macommande/<implementation>.go │
|
||||||
|
│ - Récupération des options via CLI*() getters │
|
||||||
|
│ - Application de la logique métier │
|
||||||
|
│ - Retour d'un nouvel iterator │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼ obiiter.IBioSequence
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ 5. Écriture des résultats │
|
||||||
|
│ CLIWriteBioSequences(resultat, true) │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ pkg/obitools/obiconvert/sequence_writer.go │
|
||||||
|
│ - WriteSequencesToFile() / WriteSequencesToStdout() │
|
||||||
|
│ - Support: FASTA, FASTQ, JSON │
|
||||||
|
│ - Gestion des lectures pairées │
|
||||||
|
│ - Compression optionnelle │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ 6. Attente de fin du pipeline │
|
||||||
|
│ obiutils.WaitForLastPipe() │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
L'architecture des commandes OBITools est conçue pour :
|
||||||
|
|
||||||
|
1. **Maximiser la réutilisation** : `obiconvert` fournit les fonctionnalités communes
|
||||||
|
2. **Simplifier l'ajout de nouvelles commandes** : pattern standardisé et minimaliste
|
||||||
|
3. **Faciliter la maintenance** : séparation claire des responsabilités
|
||||||
|
4. **Garantir la cohérence** : conventions de nommage et structure uniforme
|
||||||
|
5. **Optimiser les performances** : parallélisation intégrée et traitement par batch
|
||||||
|
|
||||||
|
Cette architecture modulaire permet de créer rapidement de nouvelles commandes tout en maintenant une qualité et une cohérence élevées dans toute la suite OBITools.
|
||||||
99
blackboard/architechture/definition-superkmer.md
Normal file
99
blackboard/architechture/definition-superkmer.md
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# Définition du super k-mer
|
||||||
|
|
||||||
|
## Définition
|
||||||
|
|
||||||
|
Un **super k-mer** est une **sous-séquence MAXIMALE** d'une séquence dans laquelle **tous les k-mers consécutifs partagent le même minimiseur**.
|
||||||
|
|
||||||
|
### Termes
|
||||||
|
|
||||||
|
- **k-mer** : sous-séquence de longueur k
|
||||||
|
- **minimiseur** : le plus petit m-mer canonique parmi tous les m-mers d'un k-mer
|
||||||
|
- **k-mers consécutifs** : k-mers aux positions i et i+1 (chevauchement de k-1 nucléotides)
|
||||||
|
- **MAXIMALE** : ne peut être étendue ni à gauche ni à droite
|
||||||
|
|
||||||
|
## RÈGLES ABSOLUES
|
||||||
|
|
||||||
|
### RÈGLE 1 : Longueur minimum = k
|
||||||
|
|
||||||
|
Un super k-mer contient au minimum k nucléotides.
|
||||||
|
|
||||||
|
```
|
||||||
|
longueur(super-kmer) >= k
|
||||||
|
```
|
||||||
|
|
||||||
|
### RÈGLE 2 : Chevauchement obligatoire = k-1
|
||||||
|
|
||||||
|
Deux super-kmers consécutifs se chevauchent d'EXACTEMENT k-1 nucléotides.
|
||||||
|
|
||||||
|
```
|
||||||
|
SK1.End - SK2.Start = k - 1
|
||||||
|
```
|
||||||
|
|
||||||
|
### RÈGLE 3 : Bijection séquence ↔ minimiseur
|
||||||
|
|
||||||
|
Une séquence de super k-mer a UN et UN SEUL minimiseur.
|
||||||
|
|
||||||
|
```
|
||||||
|
Même séquence → Même minimiseur (TOUJOURS)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Si vous observez la même séquence avec deux minimiseurs différents, c'est un BUG.**
|
||||||
|
|
||||||
|
### RÈGLE 4 : Tous les k-mers partagent le minimiseur
|
||||||
|
|
||||||
|
TOUS les k-mers contenus dans un super k-mer ont le même minimiseur.
|
||||||
|
|
||||||
|
```
|
||||||
|
∀ k-mer K dans SK : minimiseur(K) = SK.minimizer
|
||||||
|
```
|
||||||
|
|
||||||
|
### RÈGLE 5 : Maximalité
|
||||||
|
|
||||||
|
Un super k-mer ne peut pas être étendu.
|
||||||
|
|
||||||
|
- Si on ajoute un nucléotide à gauche : le nouveau k-mer a un minimiseur différent
|
||||||
|
- Si on ajoute un nucléotide à droite : le nouveau k-mer a un minimiseur différent
|
||||||
|
|
||||||
|
## VIOLATIONS INTERDITES
|
||||||
|
|
||||||
|
❌ **Super k-mer de longueur < k**
|
||||||
|
❌ **Chevauchement ≠ k-1 entre consécutifs**
|
||||||
|
❌ **Même séquence avec minimiseurs différents**
|
||||||
|
❌ **K-mer dans le super k-mer avec minimiseur différent**
|
||||||
|
❌ **Super k-mer extensible (non-maximal)**
|
||||||
|
|
||||||
|
## CONSÉQUENCES PRATIQUES
|
||||||
|
|
||||||
|
### Pour l'extraction
|
||||||
|
|
||||||
|
L'algorithme doit :
|
||||||
|
1. Calculer le minimiseur de chaque k-mer
|
||||||
|
2. Découper quand le minimiseur change
|
||||||
|
3. Assigner au super k-mer le minimiseur commun à tous ses k-mers
|
||||||
|
4. Garantir que chaque super k-mer contient au moins k nucléotides
|
||||||
|
5. Garantir le chevauchement de k-1 entre consécutifs
|
||||||
|
|
||||||
|
### Pour la validation
|
||||||
|
|
||||||
|
Si après déduplication (obiuniq) on observe :
|
||||||
|
```
|
||||||
|
Séquence: ACGT...
|
||||||
|
Minimiseurs: {M1, M2} // plusieurs minimiseurs
|
||||||
|
```
|
||||||
|
|
||||||
|
C'est la PREUVE d'un bug : l'algorithme a produit cette séquence avec des minimiseurs différents, ce qui viole la RÈGLE 3.
|
||||||
|
|
||||||
|
## DIAGNOSTIC DU BUG
|
||||||
|
|
||||||
|
**Bug observé** : Même séquence avec minimiseurs différents après obiuniq
|
||||||
|
|
||||||
|
**Cause possible** : L'algorithme assigne le mauvais minimiseur OU découpe mal les super-kmers
|
||||||
|
|
||||||
|
**Ce que le bug NE PEUT PAS être** :
|
||||||
|
- Un problème d'obiuniq (révèle le bug, ne le crée pas)
|
||||||
|
- Un problème de chevauchement légitime (k-1 est correct)
|
||||||
|
|
||||||
|
**Ce que le bug DOIT être** :
|
||||||
|
- Minimiseur mal calculé ou mal assigné
|
||||||
|
- Découpage incorrect (mauvais endPos)
|
||||||
|
- Copie incorrecte des données
|
||||||
316
blackboard/architechture/guide-redaction-obitest.md
Normal file
316
blackboard/architechture/guide-redaction-obitest.md
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
# Guide de rédaction d'un obitest
|
||||||
|
|
||||||
|
## Règles essentielles
|
||||||
|
|
||||||
|
1. **Données < 1 KB** - Fichiers de test très petits
|
||||||
|
2. **Exécution < 10 sec** - Tests rapides pour CI/CD
|
||||||
|
3. **Auto-contenu** - Pas de dépendances externes
|
||||||
|
4. **Auto-nettoyage** - Pas de fichiers résiduels
|
||||||
|
|
||||||
|
## Structure minimale
|
||||||
|
|
||||||
|
```
|
||||||
|
obitests/obitools/<commande>/
|
||||||
|
├── test.sh # Script exécutable
|
||||||
|
└── data.fasta # Données minimales (optionnel)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Template de test.sh
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TEST_NAME=<commande>
|
||||||
|
CMD=<commande>
|
||||||
|
|
||||||
|
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||||
|
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||||
|
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||||
|
|
||||||
|
MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
|
||||||
|
|
||||||
|
TMPDIR="$(mktemp -d)"
|
||||||
|
ntest=0
|
||||||
|
success=0
|
||||||
|
failed=0
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
echo "========================================" 1>&2
|
||||||
|
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||||
|
echo 1>&2
|
||||||
|
echo "- $ntest tests run" 1>&2
|
||||||
|
echo "- $success successfully completed" 1>&2
|
||||||
|
echo "- $failed failed tests" 1>&2
|
||||||
|
echo 1>&2
|
||||||
|
echo "Cleaning up the temporary directory..." 1>&2
|
||||||
|
echo 1>&2
|
||||||
|
echo "========================================" 1>&2
|
||||||
|
|
||||||
|
rm -rf "$TMPDIR"
|
||||||
|
|
||||||
|
if [ $failed -gt 0 ]; then
|
||||||
|
log "$TEST_NAME tests failed"
|
||||||
|
log
|
||||||
|
log
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log
|
||||||
|
log
|
||||||
|
exit 0
|
||||||
|
}
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||||
|
}
|
||||||
|
|
||||||
|
log "Testing $TEST_NAME..."
|
||||||
|
log "Test directory is $TEST_DIR"
|
||||||
|
log "obitools directory is $OBITOOLS_DIR"
|
||||||
|
log "Temporary directory is $TMPDIR"
|
||||||
|
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||||
|
|
||||||
|
########## TESTS ##########
|
||||||
|
|
||||||
|
# Test 1: Help (OBLIGATOIRE)
|
||||||
|
((ntest++))
|
||||||
|
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: printing help OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: printing help failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Ajoutez vos tests ici...
|
||||||
|
|
||||||
|
###########################
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pattern de test
|
||||||
|
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if commande args > "${TMPDIR}/output.txt" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: description OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: description failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tests courants
|
||||||
|
|
||||||
|
### Exécution basique
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if $CMD "${TEST_DIR}/input.fasta" > "${TMPDIR}/output.fasta" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: basic execution OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: basic execution failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sortie non vide
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if [ -s "${TMPDIR}/output.fasta" ]
|
||||||
|
then
|
||||||
|
log "$MCMD: output not empty OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: output empty - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Comptage
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
count=$(grep -c "^>" "${TMPDIR}/output.fasta")
|
||||||
|
if [ "$count" -gt 0 ]
|
||||||
|
then
|
||||||
|
log "$MCMD: extracted $count sequences OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: no sequences - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Présence de contenu
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if grep -q "expected_string" "${TMPDIR}/output.fasta"
|
||||||
|
then
|
||||||
|
log "$MCMD: expected content found OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: content not found - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Comparaison avec référence
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if diff "${TEST_DIR}/expected.fasta" "${TMPDIR}/output.fasta" > /dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD: matches reference OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: differs from reference - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test avec options
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if $CMD --opt value "${TEST_DIR}/input.fasta" > "${TMPDIR}/out.fasta" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: with option OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: with option failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Variables importantes
|
||||||
|
|
||||||
|
- **TEST_DIR** - Répertoire du test (données d'entrée)
|
||||||
|
- **TMPDIR** - Répertoire temporaire (sorties)
|
||||||
|
- **CMD** - Nom de la commande
|
||||||
|
- **MCMD** - Nom formaté pour les logs
|
||||||
|
|
||||||
|
## Règles d'or
|
||||||
|
|
||||||
|
✅ **Entrées** → `${TEST_DIR}/`
|
||||||
|
✅ **Sorties** → `${TMPDIR}/`
|
||||||
|
✅ **Toujours rediriger** → `> file 2>&1`
|
||||||
|
✅ **Incrémenter ntest** → Avant chaque test
|
||||||
|
✅ **Messages clairs** → Descriptions explicites
|
||||||
|
|
||||||
|
❌ **Pas de chemins en dur**
|
||||||
|
❌ **Pas de /tmp direct**
|
||||||
|
❌ **Pas de sortie vers TEST_DIR**
|
||||||
|
❌ **Pas de commandes sans redirection**
|
||||||
|
|
||||||
|
## Données de test
|
||||||
|
|
||||||
|
Créer un fichier minimal (< 500 bytes) :
|
||||||
|
|
||||||
|
```fasta
|
||||||
|
>seq1
|
||||||
|
ACGTACGTACGTACGT
|
||||||
|
>seq2
|
||||||
|
AAAACCCCGGGGTTTT
|
||||||
|
>seq3
|
||||||
|
ATCGATCGATCGATCG
|
||||||
|
```
|
||||||
|
|
||||||
|
## Création rapide
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Créer le répertoire
|
||||||
|
mkdir -p obitests/obitools/<commande>
|
||||||
|
cd obitests/obitools/<commande>
|
||||||
|
|
||||||
|
# 2. Créer les données de test
|
||||||
|
cat > test_data.fasta << 'EOF'
|
||||||
|
>seq1
|
||||||
|
ACGTACGTACGTACGT
|
||||||
|
>seq2
|
||||||
|
AAAACCCCGGGGTTTT
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# 3. Copier le template dans test.sh
|
||||||
|
# 4. Adapter le TEST_NAME et CMD
|
||||||
|
# 5. Ajouter les tests
|
||||||
|
# 6. Rendre exécutable
|
||||||
|
chmod +x test.sh
|
||||||
|
|
||||||
|
# 7. Tester
|
||||||
|
./test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Checklist
|
||||||
|
|
||||||
|
- [ ] `test.sh` exécutable (`chmod +x`)
|
||||||
|
- [ ] Test d'aide inclus
|
||||||
|
- [ ] Données < 1 KB
|
||||||
|
- [ ] Sorties vers `${TMPDIR}/`
|
||||||
|
- [ ] Entrées depuis `${TEST_DIR}/`
|
||||||
|
- [ ] Redirections `2>&1`
|
||||||
|
- [ ] Messages clairs
|
||||||
|
- [ ] Testé localement
|
||||||
|
- [ ] Exit code 0 si succès
|
||||||
|
|
||||||
|
## Debug
|
||||||
|
|
||||||
|
Conserver TMPDIR pour inspection :
|
||||||
|
```bash
|
||||||
|
cleanup() {
|
||||||
|
echo "Temporary directory: $TMPDIR" 1>&2
|
||||||
|
# rm -rf "$TMPDIR" # Commenté
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Mode verbose :
|
||||||
|
```bash
|
||||||
|
set -x # Au début du script
|
||||||
|
```
|
||||||
|
|
||||||
|
## Exemples
|
||||||
|
|
||||||
|
**Simple (1 test)** - obimicrosat
|
||||||
|
```bash
|
||||||
|
# Juste l'aide
|
||||||
|
```
|
||||||
|
|
||||||
|
**Moyen (4-5 tests)** - obisuperkmer
|
||||||
|
```bash
|
||||||
|
# Aide + exécution + validation sortie + contenu
|
||||||
|
```
|
||||||
|
|
||||||
|
**Complet (7+ tests)** - obiuniq
|
||||||
|
```bash
|
||||||
|
# Aide + exécution + comparaison CSV + options + multiples cas
|
||||||
|
```
|
||||||
|
|
||||||
|
## Commandes utiles
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Compter séquences
|
||||||
|
grep -c "^>" file.fasta
|
||||||
|
|
||||||
|
# Fichier non vide
|
||||||
|
[ -s file ]
|
||||||
|
|
||||||
|
# Comparer
|
||||||
|
diff file1 file2 > /dev/null
|
||||||
|
|
||||||
|
# Comparer compressés
|
||||||
|
zdiff file1.gz file2.gz
|
||||||
|
|
||||||
|
# Compter bases
|
||||||
|
grep -v "^>" file | tr -d '\n' | wc -c
|
||||||
|
```
|
||||||
|
|
||||||
|
## Ce qu'il faut retenir
|
||||||
|
|
||||||
|
Un bon test est **COURT**, **RAPIDE** et **SIMPLE** :
|
||||||
|
- 3-10 tests maximum
|
||||||
|
- Données < 1 KB
|
||||||
|
- Exécution < 10 secondes
|
||||||
|
- Pattern standard respecté
|
||||||
268
blackboard/architechture/obisuperkmer-implementation.md
Normal file
268
blackboard/architechture/obisuperkmer-implementation.md
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
# Implémentation de la commande obisuperkmer
|
||||||
|
|
||||||
|
## Vue d'ensemble
|
||||||
|
|
||||||
|
La commande `obisuperkmer` a été implémentée en suivant l'architecture standard des commandes OBITools décrite dans `architecture-commande-obitools.md`. Cette commande permet d'extraire les super k-mers de fichiers de séquences biologiques.
|
||||||
|
|
||||||
|
## Qu'est-ce qu'un super k-mer ?
|
||||||
|
|
||||||
|
Un super k-mer est une sous-séquence maximale dans laquelle tous les k-mers consécutifs partagent le même minimiseur. Cette décomposition est utile pour :
|
||||||
|
- L'indexation efficace de k-mers
|
||||||
|
- La réduction de la redondance dans les analyses
|
||||||
|
- L'optimisation de la mémoire pour les structures de données de k-mers
|
||||||
|
|
||||||
|
## Structure de l'implémentation
|
||||||
|
|
||||||
|
### 1. Package `pkg/obitools/obisuperkmer/`
|
||||||
|
|
||||||
|
Le package contient trois fichiers :
|
||||||
|
|
||||||
|
#### `obisuperkmer.go`
|
||||||
|
Documentation du package avec une description de son rôle.
|
||||||
|
|
||||||
|
#### `options.go`
|
||||||
|
Définit les options de ligne de commande :
|
||||||
|
|
||||||
|
```go
|
||||||
|
var _KmerSize = 21 // Taille des k-mers (par défaut 21)
|
||||||
|
var _MinimizerSize = 11 // Taille des minimiseurs (par défaut 11)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Options CLI disponibles :**
|
||||||
|
- `--kmer-size` / `-k` : Taille des k-mers (entre m+1 et 31)
|
||||||
|
- `--minimizer-size` / `-m` : Taille des minimiseurs (entre 1 et k-1)
|
||||||
|
|
||||||
|
**Fonctions d'accès :**
|
||||||
|
- `CLIKmerSize()` : retourne la taille des k-mers
|
||||||
|
- `CLIMinimizerSize()` : retourne la taille des minimiseurs
|
||||||
|
- `SetKmerSize(k int)` : définit la taille des k-mers
|
||||||
|
- `SetMinimizerSize(m int)` : définit la taille des minimiseurs
|
||||||
|
|
||||||
|
#### `superkmer.go`
|
||||||
|
Implémente la logique de traitement :
|
||||||
|
|
||||||
|
```go
|
||||||
|
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence
|
||||||
|
```
|
||||||
|
|
||||||
|
Cette fonction :
|
||||||
|
1. Récupère les paramètres k et m depuis les options CLI
|
||||||
|
2. Valide les paramètres (m < k, k <= 31, etc.)
|
||||||
|
3. Crée un worker utilisant `obikmer.SuperKmerWorker(k, m)`
|
||||||
|
4. Applique le worker en parallèle sur l'itérateur de séquences
|
||||||
|
5. Retourne un itérateur de super k-mers
|
||||||
|
|
||||||
|
### 2. Exécutable `cmd/obitools/obisuperkmer/main.go`
|
||||||
|
|
||||||
|
L'exécutable suit le pattern standard minimal :
|
||||||
|
|
||||||
|
```go
|
||||||
|
func main() {
|
||||||
|
// 1. Génération du parser d'options
|
||||||
|
optionParser := obioptions.GenerateOptionParser(
|
||||||
|
"obisuperkmer",
|
||||||
|
"extract super k-mers from sequence files",
|
||||||
|
obisuperkmer.OptionSet)
|
||||||
|
|
||||||
|
// 2. Parsing des arguments
|
||||||
|
_, args := optionParser(os.Args)
|
||||||
|
|
||||||
|
// 3. Lecture des séquences
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||||
|
|
||||||
|
// 4. Extraction des super k-mers
|
||||||
|
superkmers := obisuperkmer.CLIExtractSuperKmers(sequences)
|
||||||
|
|
||||||
|
// 5. Écriture des résultats
|
||||||
|
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||||
|
|
||||||
|
// 6. Attente de la fin du pipeline
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Utilisation du package `obikmer`
|
||||||
|
|
||||||
|
L'implémentation s'appuie sur le package `obikmer` qui fournit :
|
||||||
|
|
||||||
|
### `SuperKmerWorker(k int, m int) obiseq.SeqWorker`
|
||||||
|
|
||||||
|
Crée un worker qui :
|
||||||
|
- Extrait les super k-mers d'une BioSequence
|
||||||
|
- Retourne une slice de BioSequence, une par super k-mer
|
||||||
|
- Chaque super k-mer contient les attributs suivants :
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Métadonnées ajoutées à chaque super k-mer :
|
||||||
|
{
|
||||||
|
"minimizer_value": uint64, // Valeur canonique du minimiseur
|
||||||
|
"minimizer_seq": string, // Séquence ADN du minimiseur
|
||||||
|
"k": int, // Taille des k-mers utilisée
|
||||||
|
"m": int, // Taille des minimiseurs utilisée
|
||||||
|
"start": int, // Position de début (0-indexé)
|
||||||
|
"end": int, // Position de fin (exclusif)
|
||||||
|
"parent_id": string, // ID de la séquence parente
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Algorithme sous-jacent
|
||||||
|
|
||||||
|
Le package `obikmer` utilise :
|
||||||
|
- `IterSuperKmers(seq []byte, k int, m int)` : itérateur sur les super k-mers
|
||||||
|
- Une deque monotone pour suivre les minimiseurs dans une fenêtre glissante
|
||||||
|
- Complexité temporelle : O(n) où n est la longueur de la séquence
|
||||||
|
- Complexité spatiale : O(k-m+1) pour la deque
|
||||||
|
|
||||||
|
## Exemple d'utilisation
|
||||||
|
|
||||||
|
### Ligne de commande
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Extraction avec paramètres par défaut (k=21, m=11)
|
||||||
|
obisuperkmer sequences.fasta > superkmers.fasta
|
||||||
|
|
||||||
|
# Spécifier les tailles de k-mers et minimiseurs
|
||||||
|
obisuperkmer -k 25 -m 13 sequences.fasta -o superkmers.fasta
|
||||||
|
|
||||||
|
# Avec plusieurs fichiers d'entrée
|
||||||
|
obisuperkmer --kmer-size 31 --minimizer-size 15 file1.fasta file2.fasta > output.fasta
|
||||||
|
|
||||||
|
# Format FASTQ en entrée, FASTA en sortie
|
||||||
|
obisuperkmer sequences.fastq --fasta-output -o superkmers.fasta
|
||||||
|
|
||||||
|
# Avec compression
|
||||||
|
obisuperkmer sequences.fasta -o superkmers.fasta.gz --compress
|
||||||
|
```
|
||||||
|
|
||||||
|
### Exemple de sortie
|
||||||
|
|
||||||
|
Pour une séquence d'entrée :
|
||||||
|
```
|
||||||
|
>seq1
|
||||||
|
ACGTACGTACGTACGTACGTACGT
|
||||||
|
```
|
||||||
|
|
||||||
|
La sortie contiendra plusieurs super k-mers :
|
||||||
|
```
|
||||||
|
>seq1_superkmer_0_15 {"minimizer_value":123456,"minimizer_seq":"acgtacgt","k":21,"m":11,"start":0,"end":15,"parent_id":"seq1"}
|
||||||
|
ACGTACGTACGTACG
|
||||||
|
>seq1_superkmer_8_24 {"minimizer_value":789012,"minimizer_seq":"gtacgtac","k":21,"m":11,"start":8,"end":24,"parent_id":"seq1"}
|
||||||
|
TACGTACGTACGTACGT
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options héritées de `obiconvert`
|
||||||
|
|
||||||
|
La commande hérite de toutes les options standard d'OBITools :
|
||||||
|
|
||||||
|
### Options d'entrée
|
||||||
|
- `--fasta` : forcer le format FASTA
|
||||||
|
- `--fastq` : forcer le format FASTQ
|
||||||
|
- `--ecopcr` : format ecoPCR
|
||||||
|
- `--embl` : format EMBL
|
||||||
|
- `--genbank` : format GenBank
|
||||||
|
- `--input-json-header` : en-têtes JSON
|
||||||
|
- `--input-OBI-header` : en-têtes OBI
|
||||||
|
|
||||||
|
### Options de sortie
|
||||||
|
- `--out` / `-o` : fichier de sortie (défaut : stdout)
|
||||||
|
- `--fasta-output` : sortie en format FASTA
|
||||||
|
- `--fastq-output` : sortie en format FASTQ
|
||||||
|
- `--json-output` : sortie en format JSON
|
||||||
|
- `--output-json-header` : en-têtes JSON en sortie
|
||||||
|
- `--output-OBI-header` / `-O` : en-têtes OBI en sortie
|
||||||
|
- `--compress` / `-Z` : compression gzip
|
||||||
|
- `--skip-empty` : ignorer les séquences vides
|
||||||
|
- `--no-progressbar` : désactiver la barre de progression
|
||||||
|
|
||||||
|
## Compilation
|
||||||
|
|
||||||
|
Pour compiler la commande :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /chemin/vers/obitools4
|
||||||
|
go build -o bin/obisuperkmer ./cmd/obitools/obisuperkmer/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tests
|
||||||
|
|
||||||
|
Pour tester la commande :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Créer un fichier de test
|
||||||
|
echo -e ">test\nACGTACGTACGTACGTACGTACGTACGTACGT" > test.fasta
|
||||||
|
|
||||||
|
# Exécuter obisuperkmer
|
||||||
|
obisuperkmer test.fasta
|
||||||
|
|
||||||
|
# Vérifier avec des paramètres différents
|
||||||
|
obisuperkmer -k 15 -m 7 test.fasta
|
||||||
|
```
|
||||||
|
|
||||||
|
## Validation des paramètres
|
||||||
|
|
||||||
|
La commande valide automatiquement :
|
||||||
|
- `1 <= m < k` : le minimiseur doit être plus petit que le k-mer
|
||||||
|
- `2 <= k <= 31` : contrainte du codage sur 64 bits
|
||||||
|
- `len(sequence) >= k` : la séquence doit être assez longue
|
||||||
|
|
||||||
|
En cas de paramètres invalides, la commande affiche une erreur explicite et s'arrête.
|
||||||
|
|
||||||
|
## Intégration avec le pipeline OBITools
|
||||||
|
|
||||||
|
La commande s'intègre naturellement dans les pipelines OBITools :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pipeline complet d'analyse
|
||||||
|
obiconvert sequences.fastq --fasta-output | \
|
||||||
|
obisuperkmer -k 21 -m 11 | \
|
||||||
|
obiuniq | \
|
||||||
|
obigrep -p "minimizer_value>1000" > filtered_superkmers.fasta
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parallélisation
|
||||||
|
|
||||||
|
La commande utilise automatiquement :
|
||||||
|
- `obidefault.ParallelWorkers()` pour le traitement parallèle
|
||||||
|
- Les workers sont distribués sur les séquences d'entrée
|
||||||
|
- La parallélisation est transparente pour l'utilisateur
|
||||||
|
|
||||||
|
## Conformité avec l'architecture OBITools
|
||||||
|
|
||||||
|
L'implémentation respecte tous les principes de l'architecture :
|
||||||
|
|
||||||
|
✅ Séparation des responsabilités (package + commande)
|
||||||
|
✅ Convention de nommage cohérente (CLI*, Set*, _variables)
|
||||||
|
✅ Réutilisation de `obiconvert` pour l'I/O
|
||||||
|
✅ Options standard partagées
|
||||||
|
✅ Pattern Worker pour le traitement
|
||||||
|
✅ Validation des paramètres
|
||||||
|
✅ Logging avec `logrus`
|
||||||
|
✅ Gestion d'erreurs cohérente
|
||||||
|
✅ Documentation complète
|
||||||
|
|
||||||
|
## Fichiers créés
|
||||||
|
|
||||||
|
```
|
||||||
|
pkg/obitools/obisuperkmer/
|
||||||
|
├── obisuperkmer.go # Documentation du package
|
||||||
|
├── options.go # Définition des options CLI
|
||||||
|
└── superkmer.go # Implémentation du traitement
|
||||||
|
|
||||||
|
cmd/obitools/obisuperkmer/
|
||||||
|
└── main.go # Point d'entrée de la commande
|
||||||
|
```
|
||||||
|
|
||||||
|
## Prochaines étapes
|
||||||
|
|
||||||
|
1. **Compilation** : Compiler la commande avec `go build`
|
||||||
|
2. **Tests unitaires** : Créer des tests dans `pkg/obitools/obisuperkmer/superkmer_test.go`
|
||||||
|
3. **Documentation utilisateur** : Ajouter la documentation de la commande
|
||||||
|
4. **Intégration CI/CD** : Ajouter aux tests d'intégration
|
||||||
|
5. **Benchmarks** : Mesurer les performances sur différents jeux de données
|
||||||
|
|
||||||
|
## Références
|
||||||
|
|
||||||
|
- Architecture des commandes OBITools : `architecture-commande-obitools.md`
|
||||||
|
- Package `obikmer` : `pkg/obikmer/`
|
||||||
|
- Tests du package : `pkg/obikmer/superkmer_iter_test.go`
|
||||||
440
blackboard/architechture/obisuperkmer-tests.md
Normal file
440
blackboard/architechture/obisuperkmer-tests.md
Normal file
@@ -0,0 +1,440 @@
|
|||||||
|
# Tests automatisés pour obisuperkmer
|
||||||
|
|
||||||
|
## Vue d'ensemble
|
||||||
|
|
||||||
|
Des tests automatisés ont été créés pour la commande `obisuperkmer` dans le répertoire `obitests/obitools/obisuperkmer/`. Ces tests suivent le pattern standard utilisé par toutes les commandes OBITools et sont conçus pour être exécutés dans un environnement CI/CD.
|
||||||
|
|
||||||
|
## Fichiers créés
|
||||||
|
|
||||||
|
```
|
||||||
|
obitests/obitools/obisuperkmer/
|
||||||
|
├── test.sh # Script de test principal (6.7 KB)
|
||||||
|
├── test_sequences.fasta # Données de test (117 bytes)
|
||||||
|
└── README.md # Documentation (4.1 KB)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Taille totale : ~11 KB
|
||||||
|
|
||||||
|
Cette taille minimale est idéale pour un dépôt Git et des tests CI/CD rapides.
|
||||||
|
|
||||||
|
## Jeu de données de test
|
||||||
|
|
||||||
|
### Fichier : `test_sequences.fasta` (117 bytes)
|
||||||
|
|
||||||
|
Le fichier contient 3 séquences de 32 nucléotides chacune :
|
||||||
|
|
||||||
|
```fasta
|
||||||
|
>seq1
|
||||||
|
ACGTACGTACGTACGTACGTACGTACGTACGT
|
||||||
|
>seq2
|
||||||
|
AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
|
||||||
|
>seq3
|
||||||
|
ATCGATCGATCGATCGATCGATCGATCGATCG
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Justification du choix
|
||||||
|
|
||||||
|
1. **seq1** : Motif répétitif simple (ACGT)
|
||||||
|
- Teste l'extraction de super k-mers sur une séquence avec faible complexité
|
||||||
|
- Les minimiseurs devraient être assez réguliers
|
||||||
|
|
||||||
|
2. **seq2** : Blocs homopolymères
|
||||||
|
- Teste le comportement avec des régions de très faible complexité
|
||||||
|
- Les minimiseurs varieront entre les blocs A, C, G et T
|
||||||
|
|
||||||
|
3. **seq3** : Motif différent (ATCG)
|
||||||
|
- Teste la diversité des super k-mers extraits
|
||||||
|
- Différent de seq1 pour vérifier la distinction
|
||||||
|
|
||||||
|
#### Caractéristiques
|
||||||
|
|
||||||
|
- **Longueur** : 32 nucléotides par séquence
|
||||||
|
- **Taille totale** : 96 nucléotides (3 × 32)
|
||||||
|
- **Format** : FASTA avec en-têtes JSON compatibles
|
||||||
|
- **Alphabet** : A, C, G, T uniquement (pas de bases ambiguës)
|
||||||
|
- **Taille du fichier** : 117 bytes
|
||||||
|
|
||||||
|
Avec k=21 (défaut), chaque séquence de 32 bp peut produire :
|
||||||
|
- 32 - 21 + 1 = 12 k-mers
|
||||||
|
- Plusieurs super k-mers selon les minimiseurs
|
||||||
|
|
||||||
|
## Script de test : `test.sh`
|
||||||
|
|
||||||
|
### Structure
|
||||||
|
|
||||||
|
Le script suit le pattern standard OBITools :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TEST_NAME=obisuperkmer
|
||||||
|
CMD=obisuperkmer
|
||||||
|
|
||||||
|
# Variables et fonctions standard
|
||||||
|
TEST_DIR="..."
|
||||||
|
OBITOOLS_DIR="..."
|
||||||
|
TMPDIR="$(mktemp -d)"
|
||||||
|
ntest=0
|
||||||
|
success=0
|
||||||
|
failed=0
|
||||||
|
|
||||||
|
cleanup() { ... }
|
||||||
|
log() { ... }
|
||||||
|
|
||||||
|
# Tests (12 au total)
|
||||||
|
# ...
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tests implémentés
|
||||||
|
|
||||||
|
#### 1. Test d'aide (`-h`)
|
||||||
|
```bash
|
||||||
|
obisuperkmer -h
|
||||||
|
```
|
||||||
|
Vérifie que la commande peut afficher son aide sans erreur.
|
||||||
|
|
||||||
|
#### 2. Extraction basique avec paramètres par défaut
|
||||||
|
```bash
|
||||||
|
obisuperkmer test_sequences.fasta > output_default.fasta
|
||||||
|
```
|
||||||
|
Teste l'exécution avec k=21, m=11 (défaut).
|
||||||
|
|
||||||
|
#### 3. Vérification de sortie non vide
|
||||||
|
```bash
|
||||||
|
[ -s output_default.fasta ]
|
||||||
|
```
|
||||||
|
S'assure que la commande produit un résultat.
|
||||||
|
|
||||||
|
#### 4. Comptage des super k-mers
|
||||||
|
```bash
|
||||||
|
grep -c "^>" output_default.fasta
|
||||||
|
```
|
||||||
|
Vérifie qu'au moins un super k-mer a été extrait.
|
||||||
|
|
||||||
|
#### 5. Présence des métadonnées
|
||||||
|
```bash
|
||||||
|
grep -q "minimizer_value" output_default.fasta
|
||||||
|
grep -q "minimizer_seq" output_default.fasta
|
||||||
|
grep -q "parent_id" output_default.fasta
|
||||||
|
```
|
||||||
|
Vérifie que les attributs requis sont présents.
|
||||||
|
|
||||||
|
#### 6. Extraction avec paramètres personnalisés
|
||||||
|
```bash
|
||||||
|
obisuperkmer -k 15 -m 7 test_sequences.fasta > output_k15_m7.fasta
|
||||||
|
```
|
||||||
|
Teste la configuration de k et m.
|
||||||
|
|
||||||
|
#### 7. Validation des paramètres personnalisés
|
||||||
|
```bash
|
||||||
|
grep -q '"k":15' output_k15_m7.fasta
|
||||||
|
grep -q '"m":7' output_k15_m7.fasta
|
||||||
|
```
|
||||||
|
Vérifie que les paramètres sont correctement enregistrés.
|
||||||
|
|
||||||
|
#### 8. Format de sortie FASTA
|
||||||
|
```bash
|
||||||
|
obisuperkmer --fasta-output test_sequences.fasta > output_fasta.fasta
|
||||||
|
```
|
||||||
|
Teste l'option de format explicite.
|
||||||
|
|
||||||
|
#### 9. Vérification des IDs
|
||||||
|
```bash
|
||||||
|
grep "^>" output_default.fasta | grep -q "superkmer"
|
||||||
|
```
|
||||||
|
S'assure que les IDs contiennent "superkmer".
|
||||||
|
|
||||||
|
#### 10. Préservation des IDs parents
|
||||||
|
```bash
|
||||||
|
grep -q "seq1" output_default.fasta
|
||||||
|
grep -q "seq2" output_default.fasta
|
||||||
|
grep -q "seq3" output_default.fasta
|
||||||
|
```
|
||||||
|
Vérifie que les IDs des séquences parentes sont préservés.
|
||||||
|
|
||||||
|
#### 11. Option de fichier de sortie (`-o`)
|
||||||
|
```bash
|
||||||
|
obisuperkmer -o output_file.fasta test_sequences.fasta
|
||||||
|
```
|
||||||
|
Teste la redirection vers un fichier.
|
||||||
|
|
||||||
|
#### 12. Vérification de création du fichier
|
||||||
|
```bash
|
||||||
|
[ -s output_file.fasta ]
|
||||||
|
```
|
||||||
|
S'assure que le fichier a été créé.
|
||||||
|
|
||||||
|
#### 13. Cohérence des longueurs
|
||||||
|
```bash
|
||||||
|
# Vérifie que longueur(output) <= longueur(input)
|
||||||
|
```
|
||||||
|
S'assure que les super k-mers ne sont pas plus longs que l'entrée.
|
||||||
|
|
||||||
|
### Compteurs
|
||||||
|
|
||||||
|
- **ntest** : Nombre de tests exécutés
|
||||||
|
- **success** : Nombre de tests réussis
|
||||||
|
- **failed** : Nombre de tests échoués
|
||||||
|
|
||||||
|
### Sortie du script
|
||||||
|
|
||||||
|
#### En cas de succès
|
||||||
|
```
|
||||||
|
========================================
|
||||||
|
## Results of the obisuperkmer tests:
|
||||||
|
|
||||||
|
- 12 tests run
|
||||||
|
- 12 successfully completed
|
||||||
|
- 0 failed tests
|
||||||
|
|
||||||
|
Cleaning up the temporary directory...
|
||||||
|
|
||||||
|
========================================
|
||||||
|
```
|
||||||
|
|
||||||
|
Exit code : **0**
|
||||||
|
|
||||||
|
#### En cas d'échec
|
||||||
|
```
|
||||||
|
========================================
|
||||||
|
## Results of the obisuperkmer tests:
|
||||||
|
|
||||||
|
- 12 tests run
|
||||||
|
- 10 successfully completed
|
||||||
|
- 2 failed tests
|
||||||
|
|
||||||
|
Cleaning up the temporary directory...
|
||||||
|
|
||||||
|
========================================
|
||||||
|
```
|
||||||
|
|
||||||
|
Exit code : **1**
|
||||||
|
|
||||||
|
## Intégration CI/CD
|
||||||
|
|
||||||
|
### Exécution automatique
|
||||||
|
|
||||||
|
Le script est conçu pour être exécuté automatiquement dans un pipeline CI/CD :
|
||||||
|
|
||||||
|
1. Le build produit l'exécutable dans `build/obisuperkmer`
|
||||||
|
2. Le script de test ajoute `build/` au PATH
|
||||||
|
3. Les tests s'exécutent
|
||||||
|
4. Le code de retour indique le succès (0) ou l'échec (1)
|
||||||
|
|
||||||
|
### Exemple de configuration CI/CD
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/test.yml ou équivalent
|
||||||
|
test-obisuperkmer:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- name: Build obitools
|
||||||
|
run: make build
|
||||||
|
- name: Test obisuperkmer
|
||||||
|
run: ./obitests/obitools/obisuperkmer/test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Avantages
|
||||||
|
|
||||||
|
✅ **Rapidité** : Données de test minimales (117 bytes)
|
||||||
|
✅ **Fiabilité** : Tests reproductibles
|
||||||
|
✅ **Isolation** : Utilisation d'un répertoire temporaire
|
||||||
|
✅ **Nettoyage automatique** : Pas de fichiers résiduels
|
||||||
|
✅ **Logging** : Messages horodatés et détaillés
|
||||||
|
✅ **Compatibilité** : Pattern standard OBITools
|
||||||
|
|
||||||
|
## Exécution locale
|
||||||
|
|
||||||
|
### Prérequis
|
||||||
|
|
||||||
|
1. Compiler obisuperkmer :
|
||||||
|
```bash
|
||||||
|
cd /chemin/vers/obitools4
|
||||||
|
go build -o build/obisuperkmer ./cmd/obitools/obisuperkmer/
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Se placer dans le répertoire de test :
|
||||||
|
```bash
|
||||||
|
cd obitests/obitools/obisuperkmer
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Exécuter le script :
|
||||||
|
```bash
|
||||||
|
./test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Exemple de sortie
|
||||||
|
|
||||||
|
```
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] Testing obisuperkmer...
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] Test directory is /path/to/obitests/obitools/obisuperkmer
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] obitools directory is /path/to/build
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] Temporary directory is /tmp/tmp.abc123
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] files: README.md test.sh test_sequences.fasta
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:01 CET 2026] OBISuperkmer: printing help OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: basic extraction with default parameters OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: output file is not empty OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: extracted 8 super k-mers OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: super k-mers contain required metadata OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: extraction with custom k=15, m=7 OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: custom parameters correctly set in metadata OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: FASTA output format OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: super k-mer IDs contain 'superkmer' OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: parent sequence IDs preserved OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:04 CET 2026] OBISuperkmer: output to file with -o option OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:04 CET 2026] OBISuperkmer: output file created with -o option OK
|
||||||
|
[obisuperkmer @ Fri Feb 7 13:00:04 CET 2026] OBISuperkmer: super k-mer total length <= input length OK
|
||||||
|
========================================
|
||||||
|
## Results of the obisuperkmer tests:
|
||||||
|
|
||||||
|
- 12 tests run
|
||||||
|
- 12 successfully completed
|
||||||
|
- 0 failed tests
|
||||||
|
|
||||||
|
Cleaning up the temporary directory...
|
||||||
|
|
||||||
|
========================================
|
||||||
|
```
|
||||||
|
|
||||||
|
## Debugging des tests
|
||||||
|
|
||||||
|
### Conserver les fichiers temporaires
|
||||||
|
|
||||||
|
Modifier temporairement la fonction `cleanup()` :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cleanup() {
|
||||||
|
echo "Temporary directory: $TMPDIR" 1>&2
|
||||||
|
# Commenter cette ligne pour conserver les fichiers
|
||||||
|
# rm -rf "$TMPDIR"
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Activer le mode verbose
|
||||||
|
|
||||||
|
Ajouter au début du script :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
set -x # Active l'affichage de toutes les commandes
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tester une seule commande
|
||||||
|
|
||||||
|
Extraire et exécuter manuellement :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export TEST_DIR=/chemin/vers/obitests/obitools/obisuperkmer
|
||||||
|
export TMPDIR=$(mktemp -d)
|
||||||
|
obisuperkmer "${TEST_DIR}/test_sequences.fasta" > "${TMPDIR}/output.fasta"
|
||||||
|
cat "${TMPDIR}/output.fasta"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Ajout de nouveaux tests
|
||||||
|
|
||||||
|
Pour ajouter un test supplémentaire :
|
||||||
|
|
||||||
|
1. Incrémenter le compteur `ntest`
|
||||||
|
2. Écrire la condition de test
|
||||||
|
3. Logger le succès ou l'échec
|
||||||
|
4. Incrémenter le bon compteur
|
||||||
|
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if ma_nouvelle_commande_de_test
|
||||||
|
then
|
||||||
|
log "Description du test: OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "Description du test: failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Comparaison avec d'autres tests
|
||||||
|
|
||||||
|
### Taille des données de test
|
||||||
|
|
||||||
|
| Commande | Taille des données | Nombre de fichiers |
|
||||||
|
|----------|-------------------|-------------------|
|
||||||
|
| obiconvert | 925 KB | 1 fichier |
|
||||||
|
| obiuniq | ~600 bytes | 4 fichiers |
|
||||||
|
| obimicrosat | 0 bytes | 0 fichiers (génère à la volée) |
|
||||||
|
| **obisuperkmer** | **117 bytes** | **1 fichier** |
|
||||||
|
|
||||||
|
Notre test `obisuperkmer` est parmi les plus légers, ce qui est optimal pour CI/CD.
|
||||||
|
|
||||||
|
### Nombre de tests
|
||||||
|
|
||||||
|
| Commande | Nombre de tests |
|
||||||
|
|----------|----------------|
|
||||||
|
| obiconvert | 3 tests |
|
||||||
|
| obiuniq | 7 tests |
|
||||||
|
| obimicrosat | 1 test |
|
||||||
|
| **obisuperkmer** | **12 tests** |
|
||||||
|
|
||||||
|
Notre test `obisuperkmer` offre une couverture complète avec 12 tests différents.
|
||||||
|
|
||||||
|
## Couverture de test
|
||||||
|
|
||||||
|
Les tests couvrent :
|
||||||
|
|
||||||
|
✅ Affichage de l'aide
|
||||||
|
✅ Exécution basique
|
||||||
|
✅ Paramètres par défaut (k=21, m=11)
|
||||||
|
✅ Paramètres personnalisés (k=15, m=7)
|
||||||
|
✅ Formats de sortie (FASTA)
|
||||||
|
✅ Redirection vers fichier (`-o`)
|
||||||
|
✅ Présence des métadonnées
|
||||||
|
✅ Validation des IDs
|
||||||
|
✅ Préservation des IDs parents
|
||||||
|
✅ Cohérence des longueurs
|
||||||
|
✅ Production de résultats non vides
|
||||||
|
|
||||||
|
## Maintenance
|
||||||
|
|
||||||
|
### Mise à jour des tests
|
||||||
|
|
||||||
|
Si l'implémentation de `obisuperkmer` change :
|
||||||
|
|
||||||
|
1. Vérifier que les tests existants passent toujours
|
||||||
|
2. Ajouter de nouveaux tests pour les nouvelles fonctionnalités
|
||||||
|
3. Mettre à jour `README.md` si nécessaire
|
||||||
|
4. Documenter les changements
|
||||||
|
|
||||||
|
### Vérification régulière
|
||||||
|
|
||||||
|
Exécuter périodiquement :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd obitests/obitools/obisuperkmer
|
||||||
|
./test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Ou via l'ensemble des tests :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd obitests
|
||||||
|
for dir in obitools/*/; do
|
||||||
|
if [ -f "$dir/test.sh" ]; then
|
||||||
|
echo "Testing $(basename $dir)..."
|
||||||
|
(cd "$dir" && ./test.sh) || echo "FAILED: $(basename $dir)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Les tests pour `obisuperkmer` sont :
|
||||||
|
|
||||||
|
- ✅ **Complets** : 12 tests couvrant toutes les fonctionnalités principales
|
||||||
|
- ✅ **Légers** : 117 bytes de données de test
|
||||||
|
- ✅ **Rapides** : Exécution en quelques secondes
|
||||||
|
- ✅ **Fiables** : Pattern éprouvé utilisé par toutes les commandes OBITools
|
||||||
|
- ✅ **Maintenables** : Structure claire et documentée
|
||||||
|
- ✅ **CI/CD ready** : Code de retour approprié et nettoyage automatique
|
||||||
|
|
||||||
|
Ils garantissent que la commande fonctionne correctement à chaque commit et facilitent la détection précoce des régressions.
|
||||||
34
cmd/obitools/obisuperkmer/main.go
Normal file
34
cmd/obitools/obisuperkmer/main.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisuperkmer"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
// Generate option parser
|
||||||
|
optionParser := obioptions.GenerateOptionParser(
|
||||||
|
"obisuperkmer",
|
||||||
|
"extract super k-mers from sequence files",
|
||||||
|
obisuperkmer.OptionSet)
|
||||||
|
|
||||||
|
// Parse command-line arguments
|
||||||
|
_, args := optionParser(os.Args)
|
||||||
|
|
||||||
|
// Read input sequences
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||||
|
|
||||||
|
// Extract super k-mers
|
||||||
|
superkmers := obisuperkmer.CLIExtractSuperKmers(sequences)
|
||||||
|
|
||||||
|
// Write output sequences
|
||||||
|
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||||
|
|
||||||
|
// Wait for pipeline completion
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
}
|
||||||
148
obitests/obitools/obisuperkmer/README.md
Normal file
148
obitests/obitools/obisuperkmer/README.md
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
# Tests pour obisuperkmer
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
Ce répertoire contient les tests automatisés pour la commande `obisuperkmer`.
|
||||||
|
|
||||||
|
## Fichiers
|
||||||
|
|
||||||
|
- `test.sh` : Script de test principal (exécutable)
|
||||||
|
- `test_sequences.fasta` : Jeu de données de test minimal (3 séquences courtes)
|
||||||
|
- `README.md` : Ce fichier
|
||||||
|
|
||||||
|
## Jeu de données de test
|
||||||
|
|
||||||
|
Le fichier `test_sequences.fasta` contient 3 séquences de 32 nucléotides chacune :
|
||||||
|
|
||||||
|
1. **seq1** : Répétition du motif ACGT (séquence régulière)
|
||||||
|
2. **seq2** : Alternance de blocs homopolymères (AAAA, CCCC, GGGG, TTTT)
|
||||||
|
3. **seq3** : Répétition du motif ATCG (différent de seq1)
|
||||||
|
|
||||||
|
Ces séquences sont volontairement courtes pour :
|
||||||
|
- Minimiser la taille du dépôt Git
|
||||||
|
- Accélérer l'exécution des tests en CI/CD
|
||||||
|
- Tester différents cas d'extraction de super k-mers
|
||||||
|
|
||||||
|
## Tests effectués
|
||||||
|
|
||||||
|
Le script `test.sh` effectue 12 tests :
|
||||||
|
|
||||||
|
### Test 1 : Affichage de l'aide
|
||||||
|
Vérifie que `obisuperkmer -h` s'exécute correctement.
|
||||||
|
|
||||||
|
### Test 2 : Extraction basique avec paramètres par défaut
|
||||||
|
Exécute `obisuperkmer` avec k=21, m=11 (valeurs par défaut).
|
||||||
|
|
||||||
|
### Test 3 : Vérification du fichier de sortie non vide
|
||||||
|
S'assure que la commande produit une sortie.
|
||||||
|
|
||||||
|
### Test 4 : Comptage des super k-mers extraits
|
||||||
|
Vérifie qu'au moins un super k-mer a été extrait.
|
||||||
|
|
||||||
|
### Test 5 : Présence des métadonnées requises
|
||||||
|
Vérifie que chaque super k-mer contient :
|
||||||
|
- `minimizer_value`
|
||||||
|
- `minimizer_seq`
|
||||||
|
- `parent_id`
|
||||||
|
|
||||||
|
### Test 6 : Extraction avec paramètres personnalisés
|
||||||
|
Teste avec k=15 et m=7.
|
||||||
|
|
||||||
|
### Test 7 : Vérification des paramètres dans les métadonnées
|
||||||
|
S'assure que les valeurs k=15 et m=7 sont présentes dans la sortie.
|
||||||
|
|
||||||
|
### Test 8 : Format de sortie FASTA explicite
|
||||||
|
Teste l'option `--fasta-output`.
|
||||||
|
|
||||||
|
### Test 9 : Vérification des IDs des super k-mers
|
||||||
|
S'assure que tous les IDs contiennent "superkmer".
|
||||||
|
|
||||||
|
### Test 10 : Préservation des IDs parents
|
||||||
|
Vérifie que seq1, seq2 et seq3 apparaissent dans la sortie.
|
||||||
|
|
||||||
|
### Test 11 : Option -o pour fichier de sortie
|
||||||
|
Teste la redirection vers un fichier avec `-o`.
|
||||||
|
|
||||||
|
### Test 12 : Vérification de la création du fichier avec -o
|
||||||
|
S'assure que le fichier de sortie a été créé.
|
||||||
|
|
||||||
|
### Test 13 : Cohérence des longueurs
|
||||||
|
Vérifie que la somme des longueurs des super k-mers est inférieure ou égale à la longueur totale des séquences d'entrée.
|
||||||
|
|
||||||
|
## Exécution des tests
|
||||||
|
|
||||||
|
### Localement
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /chemin/vers/obitools4/obitests/obitools/obisuperkmer
|
||||||
|
./test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### En CI/CD
|
||||||
|
|
||||||
|
Les tests sont automatiquement exécutés lors de chaque commit via le système CI/CD configuré pour le projet.
|
||||||
|
|
||||||
|
### Prérequis
|
||||||
|
|
||||||
|
- La commande `obisuperkmer` doit être compilée et disponible dans `../../build/`
|
||||||
|
- Les dépendances système : bash, grep, etc.
|
||||||
|
|
||||||
|
## Structure du script de test
|
||||||
|
|
||||||
|
Le script suit le pattern standard utilisé par tous les tests OBITools :
|
||||||
|
|
||||||
|
1. **En-tête** : Définition du nom du test et de la commande
|
||||||
|
2. **Variables** : Configuration des chemins et compteurs
|
||||||
|
3. **Fonction cleanup()** : Affiche les résultats et nettoie le répertoire temporaire
|
||||||
|
4. **Fonction log()** : Affiche les messages horodatés
|
||||||
|
5. **Tests** : Série de tests avec incrémentation des compteurs
|
||||||
|
6. **Appel cleanup()** : Nettoyage et sortie avec code de retour approprié
|
||||||
|
|
||||||
|
## Format de sortie
|
||||||
|
|
||||||
|
Chaque test affiche :
|
||||||
|
```
|
||||||
|
[obisuperkmer @ date] message
|
||||||
|
```
|
||||||
|
|
||||||
|
En fin d'exécution :
|
||||||
|
```
|
||||||
|
========================================
|
||||||
|
## Results of the obisuperkmer tests:
|
||||||
|
|
||||||
|
- 12 tests run
|
||||||
|
- 12 successfully completed
|
||||||
|
- 0 failed tests
|
||||||
|
|
||||||
|
Cleaning up the temporary directory...
|
||||||
|
|
||||||
|
========================================
|
||||||
|
```
|
||||||
|
|
||||||
|
## Codes de retour
|
||||||
|
|
||||||
|
- **0** : Tous les tests ont réussi
|
||||||
|
- **1** : Au moins un test a échoué
|
||||||
|
|
||||||
|
## Ajout de nouveaux tests
|
||||||
|
|
||||||
|
Pour ajouter un nouveau test, suivre le pattern :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
((ntest++))
|
||||||
|
if commande_test arguments
|
||||||
|
then
|
||||||
|
log "Description: OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "Description: failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Les fichiers temporaires sont créés dans `$TMPDIR` (créé par mktemp)
|
||||||
|
- Les fichiers de données sont dans `$TEST_DIR`
|
||||||
|
- La commande testée doit être dans `$OBITOOLS_DIR` (../../build/)
|
||||||
|
- Le répertoire temporaire est automatiquement nettoyé à la fin
|
||||||
256
obitests/obitools/obisuperkmer/test.sh
Executable file
256
obitests/obitools/obisuperkmer/test.sh
Executable file
@@ -0,0 +1,256 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
#
|
||||||
|
# Here give the name of the test serie
|
||||||
|
#
|
||||||
|
|
||||||
|
TEST_NAME=obisuperkmer
|
||||||
|
CMD=obisuperkmer
|
||||||
|
|
||||||
|
######
|
||||||
|
#
|
||||||
|
# Some variable and function definitions: please don't change them
|
||||||
|
#
|
||||||
|
######
|
||||||
|
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||||
|
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||||
|
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||||
|
|
||||||
|
MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
|
||||||
|
|
||||||
|
TMPDIR="$(mktemp -d)"
|
||||||
|
ntest=0
|
||||||
|
success=0
|
||||||
|
failed=0
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
echo "========================================" 1>&2
|
||||||
|
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||||
|
|
||||||
|
echo 1>&2
|
||||||
|
echo "- $ntest tests run" 1>&2
|
||||||
|
echo "- $success successfully completed" 1>&2
|
||||||
|
echo "- $failed failed tests" 1>&2
|
||||||
|
echo 1>&2
|
||||||
|
echo "Cleaning up the temporary directory..." 1>&2
|
||||||
|
echo 1>&2
|
||||||
|
echo "========================================" 1>&2
|
||||||
|
|
||||||
|
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||||
|
|
||||||
|
if [ $failed -gt 0 ]; then
|
||||||
|
log "$TEST_NAME tests failed"
|
||||||
|
log
|
||||||
|
log
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log
|
||||||
|
log
|
||||||
|
|
||||||
|
exit 0
|
||||||
|
}
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||||
|
}
|
||||||
|
|
||||||
|
log "Testing $TEST_NAME..."
|
||||||
|
log "Test directory is $TEST_DIR"
|
||||||
|
log "obitools directory is $OBITOOLS_DIR"
|
||||||
|
log "Temporary directory is $TMPDIR"
|
||||||
|
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
####
|
||||||
|
#### Below are the tests
|
||||||
|
####
|
||||||
|
#### Before each test :
|
||||||
|
#### - increment the variable ntest
|
||||||
|
####
|
||||||
|
#### Run the command as the condition of an if / then /else
|
||||||
|
#### - The command must return 0 on success
|
||||||
|
#### - The command must return an exit code different from 0 on failure
|
||||||
|
#### - The datafiles are stored in the same directory than the test script
|
||||||
|
#### - The test script directory is stored in the TEST_DIR variable
|
||||||
|
#### - If result files have to be produced they must be stored
|
||||||
|
#### in the temporary directory (TMPDIR variable)
|
||||||
|
####
|
||||||
|
#### then clause is executed on success of the command
|
||||||
|
#### - Write a success message using the log function
|
||||||
|
#### - increment the variable success
|
||||||
|
####
|
||||||
|
#### else clause is executed on failure of the command
|
||||||
|
#### - Write a failure message using the log function
|
||||||
|
#### - increment the variable failed
|
||||||
|
####
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
((ntest++))
|
||||||
|
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: printing help OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: printing help failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 1: Basic super k-mer extraction with default parameters
|
||||||
|
((ntest++))
|
||||||
|
if obisuperkmer "${TEST_DIR}/test_sequences.fasta" \
|
||||||
|
> "${TMPDIR}/output_default.fasta" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: basic extraction with default parameters OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: basic extraction with default parameters failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 2: Verify output is not empty
|
||||||
|
((ntest++))
|
||||||
|
if [ -s "${TMPDIR}/output_default.fasta" ]
|
||||||
|
then
|
||||||
|
log "$MCMD: output file is not empty OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: output file is empty - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 3: Count number of super k-mers extracted (should be > 0)
|
||||||
|
((ntest++))
|
||||||
|
num_sequences=$(grep -c "^>" "${TMPDIR}/output_default.fasta")
|
||||||
|
if [ "$num_sequences" -gt 0 ]
|
||||||
|
then
|
||||||
|
log "$MCMD: extracted $num_sequences super k-mers OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: no super k-mers extracted - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 4: Verify super k-mers have required metadata attributes
|
||||||
|
((ntest++))
|
||||||
|
if grep -q "minimizer_value" "${TMPDIR}/output_default.fasta" && \
|
||||||
|
grep -q "minimizer_seq" "${TMPDIR}/output_default.fasta" && \
|
||||||
|
grep -q "parent_id" "${TMPDIR}/output_default.fasta"
|
||||||
|
then
|
||||||
|
log "$MCMD: super k-mers contain required metadata OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: super k-mers missing metadata - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 5: Extract super k-mers with custom k and m parameters
|
||||||
|
((ntest++))
|
||||||
|
if obisuperkmer -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
|
||||||
|
> "${TMPDIR}/output_k15_m7.fasta" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: extraction with custom k=15, m=7 OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: extraction with custom k=15, m=7 failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 6: Verify custom parameters in output metadata
|
||||||
|
((ntest++))
|
||||||
|
if grep -q '"k":15' "${TMPDIR}/output_k15_m7.fasta" && \
|
||||||
|
grep -q '"m":7' "${TMPDIR}/output_k15_m7.fasta"
|
||||||
|
then
|
||||||
|
log "$MCMD: custom parameters correctly set in metadata OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: custom parameters not in metadata - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 7: Test with different output format (FASTA output explicitly)
|
||||||
|
((ntest++))
|
||||||
|
if obisuperkmer --fasta-output -k 21 -m 11 \
|
||||||
|
"${TEST_DIR}/test_sequences.fasta" \
|
||||||
|
> "${TMPDIR}/output_fasta.fasta" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: FASTA output format OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: FASTA output format failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 8: Verify all super k-mers have superkmer in their ID
|
||||||
|
((ntest++))
|
||||||
|
if grep "^>" "${TMPDIR}/output_default.fasta" | grep -q "superkmer"
|
||||||
|
then
|
||||||
|
log "$MCMD: super k-mer IDs contain 'superkmer' OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: super k-mer IDs missing 'superkmer' - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 9: Verify parent sequence IDs are preserved
|
||||||
|
((ntest++))
|
||||||
|
if grep -q "seq1" "${TMPDIR}/output_default.fasta" && \
|
||||||
|
grep -q "seq2" "${TMPDIR}/output_default.fasta" && \
|
||||||
|
grep -q "seq3" "${TMPDIR}/output_default.fasta"
|
||||||
|
then
|
||||||
|
log "$MCMD: parent sequence IDs preserved OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: parent sequence IDs not preserved - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 10: Test with output file option
|
||||||
|
((ntest++))
|
||||||
|
if obisuperkmer -o "${TMPDIR}/output_file.fasta" \
|
||||||
|
"${TEST_DIR}/test_sequences.fasta" 2>&1
|
||||||
|
then
|
||||||
|
log "$MCMD: output to file with -o option OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: output to file with -o option failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 11: Verify output file was created with -o option
|
||||||
|
((ntest++))
|
||||||
|
if [ -s "${TMPDIR}/output_file.fasta" ]
|
||||||
|
then
|
||||||
|
log "$MCMD: output file created with -o option OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: output file not created with -o option - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 12: Verify super k-mers are shorter than or equal to parent sequences
|
||||||
|
((ntest++))
|
||||||
|
# Count nucleotides in input sequences (excluding headers)
|
||||||
|
input_bases=$(grep -v "^>" "${TEST_DIR}/test_sequences.fasta" | tr -d '\n' | wc -c)
|
||||||
|
# Count nucleotides in output sequences (excluding headers)
|
||||||
|
output_bases=$(grep -v "^>" "${TMPDIR}/output_default.fasta" | tr -d '\n' | wc -c)
|
||||||
|
|
||||||
|
if [ "$output_bases" -le "$input_bases" ]
|
||||||
|
then
|
||||||
|
log "$MCMD: super k-mer total length <= input length OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD: super k-mer total length > input length - failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
#
|
||||||
|
# At the end of the tests
|
||||||
|
# the cleanup function is called
|
||||||
|
#
|
||||||
|
#########################################
|
||||||
|
|
||||||
|
cleanup
|
||||||
6
obitests/obitools/obisuperkmer/test_sequences.fasta
Normal file
6
obitests/obitools/obisuperkmer/test_sequences.fasta
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
>seq1
|
||||||
|
ACGTACGTACGTACGTACGTACGTACGTACGT
|
||||||
|
>seq2
|
||||||
|
AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
|
||||||
|
>seq3
|
||||||
|
ATCGATCGATCGATCGATCGATCGATCGATCG
|
||||||
@@ -195,6 +195,59 @@ else
|
|||||||
((failed++))
|
((failed++))
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
##
|
||||||
|
## Test merge attributes consistency between in-memory and on-disk paths
|
||||||
|
## This test catches the bug where the shared classifier in the on-disk
|
||||||
|
## dereplication path caused incorrect merged attributes.
|
||||||
|
##
|
||||||
|
|
||||||
|
((ntest++))
|
||||||
|
if obiuniq -m a -m b --in-memory \
|
||||||
|
"${TEST_DIR}/touniq.fasta" \
|
||||||
|
> "${TMPDIR}/touniq_u_merge_mem.fasta" 2>/dev/null
|
||||||
|
then
|
||||||
|
log "OBIUniq merge in-memory: running OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "OBIUniq merge in-memory: running failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
((ntest++))
|
||||||
|
if obiuniq -m a -m b --chunk-count 4 \
|
||||||
|
"${TEST_DIR}/touniq.fasta" \
|
||||||
|
> "${TMPDIR}/touniq_u_merge_disk.fasta" 2>/dev/null
|
||||||
|
then
|
||||||
|
log "OBIUniq merge on-disk: running OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "OBIUniq merge on-disk: running failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extract sorted annotations (JSON attributes) from both outputs
|
||||||
|
# to compare merge results independently of sequence ordering
|
||||||
|
grep '^>' "${TMPDIR}/touniq_u_merge_mem.fasta" \
|
||||||
|
| sed 's/^>seq[0-9]* //' \
|
||||||
|
| sort \
|
||||||
|
> "${TMPDIR}/touniq_u_merge_mem.json"
|
||||||
|
|
||||||
|
grep '^>' "${TMPDIR}/touniq_u_merge_disk.fasta" \
|
||||||
|
| sed 's/^>seq[0-9]* //' \
|
||||||
|
| sort \
|
||||||
|
> "${TMPDIR}/touniq_u_merge_disk.json"
|
||||||
|
|
||||||
|
((ntest++))
|
||||||
|
if diff "${TMPDIR}/touniq_u_merge_mem.json" \
|
||||||
|
"${TMPDIR}/touniq_u_merge_disk.json" > /dev/null
|
||||||
|
then
|
||||||
|
log "OBIUniq merge on-disk vs in-memory: result OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "OBIUniq merge on-disk vs in-memory: result failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
#########################################
|
#########################################
|
||||||
#
|
#
|
||||||
# At the end of the tests
|
# At the end of the tests
|
||||||
|
|||||||
@@ -110,6 +110,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
|||||||
log.Infof("Data splitted over %d batches", nbatch)
|
log.Infof("Data splitted over %d batches", nbatch)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
|
localClassifier := uniqueClassifier.Clone()
|
||||||
|
|
||||||
for order, file := range fileNames {
|
for order, file := range fileNames {
|
||||||
iseq, err := obiformats.ReadSequencesFromFile(file)
|
iseq, err := obiformats.ReadSequencesFromFile(file)
|
||||||
@@ -121,7 +122,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
|||||||
if dereplicate {
|
if dereplicate {
|
||||||
u := make(map[string]*obiseq.BioSequence)
|
u := make(map[string]*obiseq.BioSequence)
|
||||||
var source string
|
var source string
|
||||||
uniqueClassifier.Reset()
|
localClassifier.Reset()
|
||||||
|
|
||||||
for iseq.Next() {
|
for iseq.Next() {
|
||||||
batch := iseq.Get()
|
batch := iseq.Get()
|
||||||
@@ -129,8 +130,8 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
|||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
// Use composite key: sequence + categories
|
// Use composite key: sequence + categories
|
||||||
code := uniqueClassifier.Code(seq)
|
code := localClassifier.Code(seq)
|
||||||
key := uniqueClassifier.Value(code)
|
key := localClassifier.Value(code)
|
||||||
prev, ok := u[key]
|
prev, ok := u[key]
|
||||||
if ok {
|
if ok {
|
||||||
prev.Merge(seq, na, true, statsOn)
|
prev.Merge(seq, na, true, statsOn)
|
||||||
|
|||||||
@@ -27,22 +27,26 @@ func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
|
|||||||
npar := min(obidefault.ParallelWorkers(), nmatcher)
|
npar := min(obidefault.ParallelWorkers(), nmatcher)
|
||||||
mutex.Add(npar)
|
mutex.Add(npar)
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowCount(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("Building AhoCorasick matcher..."),
|
progressbar.OptionShowCount(),
|
||||||
)
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("Building AhoCorasick matcher..."),
|
||||||
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(nmatcher, pbopt...)
|
bar = progressbar.NewOptions(nmatcher, pbopt...)
|
||||||
bar.Add(0)
|
}
|
||||||
|
|
||||||
builder := func() {
|
builder := func() {
|
||||||
for i := range ieme {
|
for i := range ieme {
|
||||||
matchers[i] = ahocorasick.CompileStrings(patterns[i*sizebatch:min((i+1)*sizebatch,len(patterns))])
|
matchers[i] = ahocorasick.CompileStrings(patterns[i*sizebatch:min((i+1)*sizebatch,len(patterns))])
|
||||||
bar.Add(1)
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
mutex.Done()
|
mutex.Done()
|
||||||
}
|
}
|
||||||
|
|||||||
19
pkg/obidefault/progressbar.go
Normal file
19
pkg/obidefault/progressbar.go
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
package obidefault
|
||||||
|
|
||||||
|
var __no_progress_bar__ = false
|
||||||
|
|
||||||
|
func ProgressBar() bool {
|
||||||
|
return !__no_progress_bar__
|
||||||
|
}
|
||||||
|
|
||||||
|
func NoProgressBar() bool {
|
||||||
|
return __no_progress_bar__
|
||||||
|
}
|
||||||
|
|
||||||
|
func SetNoProgressBar(b bool) {
|
||||||
|
__no_progress_bar__ = b
|
||||||
|
}
|
||||||
|
|
||||||
|
func NoProgressBarPtr() *bool {
|
||||||
|
return &__no_progress_bar__
|
||||||
|
}
|
||||||
@@ -5,18 +5,30 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"github.com/schollz/progressbar/v3"
|
"github.com/schollz/progressbar/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (iterator IBioSequence) Speed(message string, size ...int) IBioSequence {
|
func (iterator IBioSequence) Speed(message string, size ...int) IBioSequence {
|
||||||
|
|
||||||
// If the STDERR is redicted and doesn't end up to a terminal
|
// If the progress bar is disabled via --no-progressbar option
|
||||||
|
if !obidefault.ProgressBar() {
|
||||||
|
return iterator
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the STDERR is redirected and doesn't end up to a terminal
|
||||||
// No progress bar is printed.
|
// No progress bar is printed.
|
||||||
o, _ := os.Stderr.Stat()
|
o, _ := os.Stderr.Stat()
|
||||||
if (o.Mode() & os.ModeCharDevice) != os.ModeCharDevice {
|
if (o.Mode() & os.ModeCharDevice) != os.ModeCharDevice {
|
||||||
return iterator
|
return iterator
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If stdout is piped, no progress bar is printed.
|
||||||
|
oo, _ := os.Stdout.Stat()
|
||||||
|
if (oo.Mode() & os.ModeNamedPipe) == os.ModeNamedPipe {
|
||||||
|
return iterator
|
||||||
|
}
|
||||||
|
|
||||||
newIter := MakeIBioSequence()
|
newIter := MakeIBioSequence()
|
||||||
|
|
||||||
newIter.Add(1)
|
newIter.Add(1)
|
||||||
|
|||||||
@@ -447,141 +447,6 @@ func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
|
||||||
// share the same minimizer. A minimizer is the smallest canonical m-mer
|
|
||||||
// among the (k-m+1) m-mers contained in a k-mer.
|
|
||||||
type SuperKmer struct {
|
|
||||||
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
|
||||||
Start int // Starting position in the original sequence (0-indexed)
|
|
||||||
End int // Ending position (exclusive, like Go slice notation)
|
|
||||||
Sequence []byte // The actual DNA subsequence [Start:End]
|
|
||||||
}
|
|
||||||
|
|
||||||
// dequeItem represents an element in the monotone deque used for
|
|
||||||
// tracking minimizers in a sliding window.
|
|
||||||
type dequeItem struct {
|
|
||||||
position int // Position of the m-mer in the sequence
|
|
||||||
canonical uint64 // Canonical (normalized) m-mer value
|
|
||||||
}
|
|
||||||
|
|
||||||
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
|
||||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
|
||||||
// share the same minimizer. The minimizer of a k-mer is the smallest
|
|
||||||
// canonical m-mer among its (k-m+1) constituent m-mers.
|
|
||||||
//
|
|
||||||
// The algorithm uses:
|
|
||||||
// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
|
|
||||||
// - Monotone deque for O(1) amortized minimizer tracking per position
|
|
||||||
//
|
|
||||||
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
|
||||||
// available for error markers if needed.
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
||||||
// - k: k-mer size (must be between m+1 and 31)
|
|
||||||
// - m: minimizer size (must be between 1 and k-1)
|
|
||||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - slice of SuperKmer structs representing maximal subsequences
|
|
||||||
// - nil if parameters are invalid or sequence is too short
|
|
||||||
//
|
|
||||||
// Time complexity: O(n) where n is the sequence length
|
|
||||||
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
|
||||||
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
|
||||||
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var result []SuperKmer
|
|
||||||
if buffer == nil {
|
|
||||||
estimatedSize := len(seq) / k
|
|
||||||
if estimatedSize < 1 {
|
|
||||||
estimatedSize = 1
|
|
||||||
}
|
|
||||||
result = make([]SuperKmer, 0, estimatedSize)
|
|
||||||
} else {
|
|
||||||
result = (*buffer)[:0]
|
|
||||||
}
|
|
||||||
|
|
||||||
deque := make([]dequeItem, 0, k-m+1)
|
|
||||||
|
|
||||||
mMask := uint64(1)<<(m*2) - 1
|
|
||||||
rcShift := uint((m - 1) * 2)
|
|
||||||
|
|
||||||
var fwdMmer, rvcMmer uint64
|
|
||||||
for i := 0; i < m-1 && i < len(seq); i++ {
|
|
||||||
code := uint64(__single_base_code__[seq[i]&31])
|
|
||||||
fwdMmer = (fwdMmer << 2) | code
|
|
||||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
|
||||||
}
|
|
||||||
|
|
||||||
superKmerStart := 0
|
|
||||||
var currentMinimizer uint64
|
|
||||||
firstKmer := true
|
|
||||||
|
|
||||||
for pos := m - 1; pos < len(seq); pos++ {
|
|
||||||
code := uint64(__single_base_code__[seq[pos]&31])
|
|
||||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
|
||||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
|
||||||
|
|
||||||
canonical := fwdMmer
|
|
||||||
if rvcMmer < fwdMmer {
|
|
||||||
canonical = rvcMmer
|
|
||||||
}
|
|
||||||
|
|
||||||
mmerPos := pos - m + 1
|
|
||||||
|
|
||||||
if pos >= k-1 {
|
|
||||||
windowStart := pos - k + 1
|
|
||||||
for len(deque) > 0 && deque[0].position < windowStart {
|
|
||||||
deque = deque[1:]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
|
||||||
deque = deque[:len(deque)-1]
|
|
||||||
}
|
|
||||||
|
|
||||||
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
|
||||||
|
|
||||||
if pos >= k-1 {
|
|
||||||
newMinimizer := deque[0].canonical
|
|
||||||
kmerStart := pos - k + 1
|
|
||||||
|
|
||||||
if firstKmer {
|
|
||||||
currentMinimizer = newMinimizer
|
|
||||||
firstKmer = false
|
|
||||||
} else if newMinimizer != currentMinimizer {
|
|
||||||
endPos := kmerStart + k - 1
|
|
||||||
superKmer := SuperKmer{
|
|
||||||
Minimizer: currentMinimizer,
|
|
||||||
Start: superKmerStart,
|
|
||||||
End: endPos,
|
|
||||||
Sequence: seq[superKmerStart:endPos],
|
|
||||||
}
|
|
||||||
result = append(result, superKmer)
|
|
||||||
|
|
||||||
superKmerStart = kmerStart
|
|
||||||
currentMinimizer = newMinimizer
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !firstKmer {
|
|
||||||
superKmer := SuperKmer{
|
|
||||||
Minimizer: currentMinimizer,
|
|
||||||
Start: superKmerStart,
|
|
||||||
End: len(seq),
|
|
||||||
Sequence: seq[superKmerStart:],
|
|
||||||
}
|
|
||||||
result = append(result, superKmer)
|
|
||||||
}
|
|
||||||
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
||||||
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
||||||
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
@@ -267,20 +268,23 @@ func NewKmerMap[T obifp.FPUint[T]](
|
|||||||
}
|
}
|
||||||
|
|
||||||
n := len(sequences)
|
n := len(sequences)
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowCount(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("Indexing kmers"),
|
progressbar.OptionShowCount(),
|
||||||
)
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("Indexing kmers"),
|
||||||
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(n, pbopt...)
|
bar = progressbar.NewOptions(n, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
for i, sequence := range sequences {
|
for i, sequence := range sequences {
|
||||||
kmap.Push(sequence, maxoccurs)
|
kmap.Push(sequence, maxoccurs)
|
||||||
if i%100 == 0 {
|
if bar != nil && i%100 == 0 {
|
||||||
bar.Add(100)
|
bar.Add(100)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
59
pkg/obikmer/superkmer.go
Normal file
59
pkg/obikmer/superkmer.go
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
||||||
|
// share the same minimizer.
|
||||||
|
type SuperKmer struct {
|
||||||
|
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
||||||
|
Start int // Starting position in the original sequence (0-indexed)
|
||||||
|
End int // Ending position (exclusive, like Go slice notation)
|
||||||
|
Sequence []byte // The actual DNA subsequence [Start:End]
|
||||||
|
}
|
||||||
|
|
||||||
|
// dequeItem represents an element in the monotone deque used for
|
||||||
|
// tracking minimizers in a sliding window.
|
||||||
|
type dequeItem struct {
|
||||||
|
position int // Position of the m-mer in the sequence
|
||||||
|
canonical uint64 // Canonical (normalized) m-mer value
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
||||||
|
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||||
|
// share the same minimizer. The minimizer of a k-mer is the smallest
|
||||||
|
// canonical m-mer among its (k-m+1) constituent m-mers.
|
||||||
|
//
|
||||||
|
// This function uses IterSuperKmers internally and collects results into a slice.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
|
// - k: k-mer size (must be between m+1 and 31)
|
||||||
|
// - m: minimizer size (must be between 1 and k-1)
|
||||||
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - slice of SuperKmer structs representing maximal subsequences
|
||||||
|
// - nil if parameters are invalid or sequence is too short
|
||||||
|
//
|
||||||
|
// Time complexity: O(n) where n is the sequence length
|
||||||
|
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
||||||
|
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
||||||
|
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var result []SuperKmer
|
||||||
|
if buffer == nil {
|
||||||
|
estimatedSize := len(seq) / k
|
||||||
|
if estimatedSize < 1 {
|
||||||
|
estimatedSize = 1
|
||||||
|
}
|
||||||
|
result = make([]SuperKmer, 0, estimatedSize)
|
||||||
|
} else {
|
||||||
|
result = (*buffer)[:0]
|
||||||
|
}
|
||||||
|
|
||||||
|
for sk := range IterSuperKmers(seq, k, m) {
|
||||||
|
result = append(result, sk)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
215
pkg/obikmer/superkmer_iter.go
Normal file
215
pkg/obikmer/superkmer_iter.go
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"iter"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// IterSuperKmers returns an iterator over super k-mers extracted from a DNA sequence.
|
||||||
|
// It uses the same algorithm as ExtractSuperKmers but yields super k-mers one at a time.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
|
// - k: k-mer size (must be between m+1 and 31)
|
||||||
|
// - m: minimizer size (must be between 1 and k-1)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - An iterator that yields SuperKmer structs
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// for sk := range IterSuperKmers(sequence, 21, 11) {
|
||||||
|
// fmt.Printf("SuperKmer at %d-%d with minimizer %d\n", sk.Start, sk.End, sk.Minimizer)
|
||||||
|
// }
|
||||||
|
func IterSuperKmers(seq []byte, k int, m int) iter.Seq[SuperKmer] {
|
||||||
|
return func(yield func(SuperKmer) bool) {
|
||||||
|
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
deque := make([]dequeItem, 0, k-m+1)
|
||||||
|
|
||||||
|
mMask := uint64(1)<<(m*2) - 1
|
||||||
|
rcShift := uint((m - 1) * 2)
|
||||||
|
|
||||||
|
var fwdMmer, rvcMmer uint64
|
||||||
|
for i := 0; i < m-1 && i < len(seq); i++ {
|
||||||
|
code := uint64(__single_base_code__[seq[i]&31])
|
||||||
|
fwdMmer = (fwdMmer << 2) | code
|
||||||
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||||
|
}
|
||||||
|
|
||||||
|
superKmerStart := 0
|
||||||
|
var currentMinimizer uint64
|
||||||
|
firstKmer := true
|
||||||
|
|
||||||
|
for pos := m - 1; pos < len(seq); pos++ {
|
||||||
|
code := uint64(__single_base_code__[seq[pos]&31])
|
||||||
|
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||||
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||||
|
|
||||||
|
canonical := fwdMmer
|
||||||
|
if rvcMmer < fwdMmer {
|
||||||
|
canonical = rvcMmer
|
||||||
|
}
|
||||||
|
|
||||||
|
mmerPos := pos - m + 1
|
||||||
|
|
||||||
|
if pos >= k-1 {
|
||||||
|
windowStart := pos - k + 1
|
||||||
|
for len(deque) > 0 && deque[0].position < windowStart {
|
||||||
|
deque = deque[1:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
||||||
|
deque = deque[:len(deque)-1]
|
||||||
|
}
|
||||||
|
|
||||||
|
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
||||||
|
|
||||||
|
if pos >= k-1 {
|
||||||
|
newMinimizer := deque[0].canonical
|
||||||
|
kmerStart := pos - k + 1
|
||||||
|
|
||||||
|
if firstKmer {
|
||||||
|
currentMinimizer = newMinimizer
|
||||||
|
firstKmer = false
|
||||||
|
} else if newMinimizer != currentMinimizer {
|
||||||
|
endPos := kmerStart + k - 1
|
||||||
|
superKmer := SuperKmer{
|
||||||
|
Minimizer: currentMinimizer,
|
||||||
|
Start: superKmerStart,
|
||||||
|
End: endPos,
|
||||||
|
Sequence: seq[superKmerStart:endPos],
|
||||||
|
}
|
||||||
|
if !yield(superKmer) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
superKmerStart = kmerStart
|
||||||
|
currentMinimizer = newMinimizer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !firstKmer && len(seq[superKmerStart:]) >= k {
|
||||||
|
superKmer := SuperKmer{
|
||||||
|
Minimizer: currentMinimizer,
|
||||||
|
Start: superKmerStart,
|
||||||
|
End: len(seq),
|
||||||
|
Sequence: seq[superKmerStart:],
|
||||||
|
}
|
||||||
|
yield(superKmer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToBioSequence converts a SuperKmer to a BioSequence with metadata.
|
||||||
|
//
|
||||||
|
// The resulting BioSequence contains:
|
||||||
|
// - ID: "{parentID}_superkmer_{start}_{end}"
|
||||||
|
// - Sequence: the actual DNA subsequence
|
||||||
|
// - Attributes:
|
||||||
|
// - "minimizer_value" (uint64): the canonical minimizer value
|
||||||
|
// - "minimizer_seq" (string): the DNA sequence of the minimizer
|
||||||
|
// - "k" (int): the k-mer size
|
||||||
|
// - "m" (int): the minimizer size
|
||||||
|
// - "start" (int): starting position in original sequence
|
||||||
|
// - "end" (int): ending position in original sequence
|
||||||
|
// - "parent_id" (string): ID of the parent sequence
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: k-mer size used for extraction
|
||||||
|
// - m: minimizer size used for extraction
|
||||||
|
// - parentID: ID of the parent sequence
|
||||||
|
// - parentSource: source field from the parent sequence
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - *obiseq.BioSequence: A new BioSequence representing this super k-mer
|
||||||
|
func (sk *SuperKmer) ToBioSequence(k int, m int, parentID string, parentSource string) *obiseq.BioSequence {
|
||||||
|
// Create ID for the super-kmer
|
||||||
|
var id string
|
||||||
|
if parentID != "" {
|
||||||
|
id = fmt.Sprintf("%s_superkmer_%d_%d", parentID, sk.Start, sk.End)
|
||||||
|
} else {
|
||||||
|
id = fmt.Sprintf("superkmer_%d_%d", sk.Start, sk.End)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the BioSequence
|
||||||
|
seq := obiseq.NewBioSequence(id, sk.Sequence, "")
|
||||||
|
|
||||||
|
// Copy source from parent
|
||||||
|
if parentSource != "" {
|
||||||
|
seq.SetSource(parentSource)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set attributes
|
||||||
|
seq.SetAttribute("minimizer_value", sk.Minimizer)
|
||||||
|
|
||||||
|
// Decode the minimizer to get its DNA sequence
|
||||||
|
minimizerSeq := DecodeKmer(sk.Minimizer, m, nil)
|
||||||
|
seq.SetAttribute("minimizer_seq", string(minimizerSeq))
|
||||||
|
|
||||||
|
seq.SetAttribute("k", k)
|
||||||
|
seq.SetAttribute("m", m)
|
||||||
|
seq.SetAttribute("start", sk.Start)
|
||||||
|
seq.SetAttribute("end", sk.End)
|
||||||
|
|
||||||
|
if parentID != "" {
|
||||||
|
seq.SetAttribute("parent_id", parentID)
|
||||||
|
}
|
||||||
|
|
||||||
|
return seq
|
||||||
|
}
|
||||||
|
|
||||||
|
// SuperKmerWorker creates a SeqWorker that extracts super k-mers from a BioSequence
|
||||||
|
// and returns them as a slice of BioSequence objects.
|
||||||
|
//
|
||||||
|
// The worker copies the source field from the parent sequence to all extracted super k-mers.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: k-mer size (must be between m+1 and 31)
|
||||||
|
// - m: minimizer size (must be between 1 and k-1)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - SeqWorker: A worker function that can be used in obiiter pipelines
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// worker := SuperKmerWorker(21, 11)
|
||||||
|
// iterator := iterator.MakeIWorker(worker, false)
|
||||||
|
func SuperKmerWorker(k int, m int) obiseq.SeqWorker {
|
||||||
|
return func(seq *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||||
|
if seq == nil {
|
||||||
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate parameters
|
||||||
|
if m < 1 || m >= k || k < 2 || k > 31 {
|
||||||
|
return obiseq.BioSequenceSlice{}, fmt.Errorf(
|
||||||
|
"invalid parameters: k=%d, m=%d (need 1 <= m < k <= 31)",
|
||||||
|
k, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence := seq.Sequence()
|
||||||
|
if len(sequence) < k {
|
||||||
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
parentID := seq.Id()
|
||||||
|
parentSource := seq.Source()
|
||||||
|
|
||||||
|
// Extract super k-mers and convert to BioSequences
|
||||||
|
result := make(obiseq.BioSequenceSlice, 0)
|
||||||
|
|
||||||
|
for sk := range IterSuperKmers(sequence, k, m) {
|
||||||
|
bioSeq := sk.ToBioSequence(k, m, parentID, parentSource)
|
||||||
|
result = append(result, bioSeq)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
198
pkg/obikmer/superkmer_iter_test.go
Normal file
198
pkg/obikmer/superkmer_iter_test.go
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIterSuperKmers(t *testing.T) {
|
||||||
|
seq := []byte("ACGTACGTGGGGAAAA")
|
||||||
|
k := 5
|
||||||
|
m := 3
|
||||||
|
|
||||||
|
count := 0
|
||||||
|
for sk := range IterSuperKmers(seq, k, m) {
|
||||||
|
count++
|
||||||
|
t.Logf("SuperKmer %d: Minimizer=%d, Start=%d, End=%d, Seq=%s",
|
||||||
|
count, sk.Minimizer, sk.Start, sk.End, string(sk.Sequence))
|
||||||
|
|
||||||
|
// Verify sequence boundaries
|
||||||
|
if sk.Start < 0 || sk.End > len(seq) {
|
||||||
|
t.Errorf("Invalid boundaries: Start=%d, End=%d, seqLen=%d",
|
||||||
|
sk.Start, sk.End, len(seq))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify sequence content
|
||||||
|
if string(sk.Sequence) != string(seq[sk.Start:sk.End]) {
|
||||||
|
t.Errorf("Sequence mismatch: expected %s, got %s",
|
||||||
|
string(seq[sk.Start:sk.End]), string(sk.Sequence))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if count == 0 {
|
||||||
|
t.Error("No super k-mers extracted")
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("Total super k-mers extracted: %d", count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIterSuperKmersVsSlice(t *testing.T) {
|
||||||
|
seq := []byte("ACGTACGTGGGGAAAAACGTACGT")
|
||||||
|
k := 7
|
||||||
|
m := 4
|
||||||
|
|
||||||
|
// Extract using slice version
|
||||||
|
sliceResult := ExtractSuperKmers(seq, k, m, nil)
|
||||||
|
|
||||||
|
// Extract using iterator version
|
||||||
|
var iterResult []SuperKmer
|
||||||
|
for sk := range IterSuperKmers(seq, k, m) {
|
||||||
|
iterResult = append(iterResult, sk)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare counts
|
||||||
|
if len(sliceResult) != len(iterResult) {
|
||||||
|
t.Errorf("Different number of super k-mers: slice=%d, iter=%d",
|
||||||
|
len(sliceResult), len(iterResult))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare each super k-mer
|
||||||
|
for i := 0; i < len(sliceResult) && i < len(iterResult); i++ {
|
||||||
|
slice := sliceResult[i]
|
||||||
|
iter := iterResult[i]
|
||||||
|
|
||||||
|
if slice.Minimizer != iter.Minimizer {
|
||||||
|
t.Errorf("SuperKmer %d: different minimizers: slice=%d, iter=%d",
|
||||||
|
i, slice.Minimizer, iter.Minimizer)
|
||||||
|
}
|
||||||
|
|
||||||
|
if slice.Start != iter.Start || slice.End != iter.End {
|
||||||
|
t.Errorf("SuperKmer %d: different boundaries: slice=[%d:%d], iter=[%d:%d]",
|
||||||
|
i, slice.Start, slice.End, iter.Start, iter.End)
|
||||||
|
}
|
||||||
|
|
||||||
|
if string(slice.Sequence) != string(iter.Sequence) {
|
||||||
|
t.Errorf("SuperKmer %d: different sequences: slice=%s, iter=%s",
|
||||||
|
i, string(slice.Sequence), string(iter.Sequence))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSuperKmerMinimizerBijection validates the intrinsic property that
|
||||||
|
// a super k-mer sequence has one and only one minimizer (bijection property).
|
||||||
|
// This test ensures that:
|
||||||
|
// 1. All k-mers in a super k-mer share the same minimizer
|
||||||
|
// 2. Two identical super k-mer sequences must have the same minimizer
|
||||||
|
func TestSuperKmerMinimizerBijection(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
name string
|
||||||
|
seq []byte
|
||||||
|
k int
|
||||||
|
m int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "simple sequence",
|
||||||
|
seq: []byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
||||||
|
k: 21,
|
||||||
|
m: 11,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "homopolymer blocks",
|
||||||
|
seq: []byte("AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT"),
|
||||||
|
k: 21,
|
||||||
|
m: 11,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "complex sequence",
|
||||||
|
seq: []byte("ATCGATCGATCGATCGATCGATCGATCGATCG"),
|
||||||
|
k: 15,
|
||||||
|
m: 7,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "longer sequence",
|
||||||
|
seq: []byte("ACGTACGTGGGGAAAAACGTACGTTTTTCCCCACGTACGT"),
|
||||||
|
k: 13,
|
||||||
|
m: 7,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
// Map to track sequence -> minimizer
|
||||||
|
seqToMinimizer := make(map[string]uint64)
|
||||||
|
|
||||||
|
for sk := range IterSuperKmers(tc.seq, tc.k, tc.m) {
|
||||||
|
seqStr := string(sk.Sequence)
|
||||||
|
|
||||||
|
// Check if we've seen this sequence before
|
||||||
|
if prevMinimizer, exists := seqToMinimizer[seqStr]; exists {
|
||||||
|
if prevMinimizer != sk.Minimizer {
|
||||||
|
t.Errorf("BIJECTION VIOLATION: sequence %s has two different minimizers:\n"+
|
||||||
|
" First: %d\n"+
|
||||||
|
" Second: %d\n"+
|
||||||
|
" This violates the super k-mer definition!",
|
||||||
|
seqStr, prevMinimizer, sk.Minimizer)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
seqToMinimizer[seqStr] = sk.Minimizer
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify all k-mers in this super k-mer have the same minimizer
|
||||||
|
if len(sk.Sequence) >= tc.k {
|
||||||
|
for i := 0; i <= len(sk.Sequence)-tc.k; i++ {
|
||||||
|
kmerSeq := sk.Sequence[i : i+tc.k]
|
||||||
|
minimizer := findMinimizer(kmerSeq, tc.k, tc.m)
|
||||||
|
if minimizer != sk.Minimizer {
|
||||||
|
t.Errorf("K-mer at position %d in super k-mer has different minimizer:\n"+
|
||||||
|
" K-mer: %s\n"+
|
||||||
|
" Expected minimizer: %d\n"+
|
||||||
|
" Actual minimizer: %d\n"+
|
||||||
|
" Super k-mer: %s",
|
||||||
|
i, string(kmerSeq), sk.Minimizer, minimizer, seqStr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// findMinimizer computes the minimizer of a k-mer for testing purposes
|
||||||
|
func findMinimizer(kmer []byte, k int, m int) uint64 {
|
||||||
|
if len(kmer) != k {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
mMask := uint64(1)<<(m*2) - 1
|
||||||
|
rcShift := uint((m - 1) * 2)
|
||||||
|
|
||||||
|
minMinimizer := uint64(^uint64(0)) // max uint64
|
||||||
|
|
||||||
|
// Scan all m-mers in the k-mer
|
||||||
|
var fwdMmer, rvcMmer uint64
|
||||||
|
for i := 0; i < m-1 && i < len(kmer); i++ {
|
||||||
|
code := uint64(__single_base_code__[kmer[i]&31])
|
||||||
|
fwdMmer = (fwdMmer << 2) | code
|
||||||
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := m - 1; i < len(kmer); i++ {
|
||||||
|
code := uint64(__single_base_code__[kmer[i]&31])
|
||||||
|
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||||
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||||
|
|
||||||
|
canonical := fwdMmer
|
||||||
|
if rvcMmer < fwdMmer {
|
||||||
|
canonical = rvcMmer
|
||||||
|
}
|
||||||
|
|
||||||
|
if canonical < minMinimizer {
|
||||||
|
minMinimizer = canonical
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return minMinimizer
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate
|
||||||
|
// integration test package to avoid circular dependencies between
|
||||||
|
// obikmer and obiseq packages.
|
||||||
@@ -3,7 +3,7 @@ package obioptions
|
|||||||
// Version is automatically updated by the Makefile from version.txt
|
// Version is automatically updated by the Makefile from version.txt
|
||||||
// The patch number (third digit) is incremented on each push to the repository
|
// The patch number (third digit) is incremented on each push to the repository
|
||||||
|
|
||||||
var _Version = "Release 4.4.11"
|
var _Version = "Release 4.4.12"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import (
|
|||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
"github.com/schollz/progressbar/v3"
|
"github.com/schollz/progressbar/v3"
|
||||||
)
|
)
|
||||||
@@ -69,16 +70,18 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
|||||||
}
|
}
|
||||||
defer destfile.Close()
|
defer destfile.Close()
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionSetPredictTime(true),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
|
progressbar.OptionShowIts(),
|
||||||
)
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
|
||||||
bar := progressbar.NewOptions(len(data), pbopt...)
|
)
|
||||||
|
bar = progressbar.NewOptions(len(data), pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
|
fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
|
||||||
for code, dist := range data {
|
for code, dist := range data {
|
||||||
@@ -101,7 +104,9 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
|||||||
ratio.T,
|
ratio.T,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
bar.Add(1)
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -181,16 +186,18 @@ func SaveGMLGraphs(dirname string,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionSetPredictTime(true),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("[Save GML Graph files]"),
|
progressbar.OptionShowIts(),
|
||||||
)
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Save GML Graph files]"),
|
||||||
bar := progressbar.NewOptions(len(samples), pbopt...)
|
)
|
||||||
|
bar = progressbar.NewOptions(len(samples), pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
for name, seqs := range samples {
|
for name, seqs := range samples {
|
||||||
|
|
||||||
@@ -204,7 +211,9 @@ func SaveGMLGraphs(dirname string,
|
|||||||
file.WriteString(Gml(seqs, name, statThreshold))
|
file.WriteString(Gml(seqs, name, statThreshold))
|
||||||
file.Close()
|
file.Close()
|
||||||
|
|
||||||
bar.Add(1)
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -495,37 +504,44 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
|||||||
npairs += nseq * (nseq - 1) / 2
|
npairs += nseq * (nseq - 1) / 2
|
||||||
}
|
}
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
|
||||||
progressbar.OptionShowIts(),
|
|
||||||
progressbar.OptionSetPredictTime(true),
|
|
||||||
progressbar.OptionSetDescription("[One error graph]"),
|
|
||||||
)
|
|
||||||
|
|
||||||
bar := progressbar.NewOptions(npairs, pbopt...)
|
|
||||||
for _, seqs := range samples {
|
|
||||||
np := buildSamplePairs(seqs, workers)
|
|
||||||
|
|
||||||
bar.Add(np)
|
|
||||||
}
|
|
||||||
|
|
||||||
if maxError > 1 {
|
|
||||||
pbopt = make([]progressbar.Option, 0, 5)
|
|
||||||
pbopt = append(pbopt,
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionSetWidth(15),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionShowIts(),
|
||||||
progressbar.OptionSetPredictTime(true),
|
progressbar.OptionSetPredictTime(true),
|
||||||
progressbar.OptionSetDescription("[Adds multiple errors]"),
|
progressbar.OptionSetDescription("[One error graph]"),
|
||||||
)
|
)
|
||||||
|
|
||||||
bar = progressbar.NewOptions(npairs, pbopt...)
|
bar = progressbar.NewOptions(npairs, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
for _, seqs := range samples {
|
for _, seqs := range samples {
|
||||||
np := extendSimilarityGraph(seqs, maxError, workers)
|
np := buildSamplePairs(seqs, workers)
|
||||||
|
if bar != nil {
|
||||||
bar.Add(np)
|
bar.Add(np)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if maxError > 1 {
|
||||||
|
if obidefault.ProgressBar() {
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Adds multiple errors]"),
|
||||||
|
)
|
||||||
|
bar = progressbar.NewOptions(npairs, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, seqs := range samples {
|
||||||
|
np := extendSimilarityGraph(seqs, maxError, workers)
|
||||||
|
if bar != nil {
|
||||||
|
bar.Add(np)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,7 +31,6 @@ var __output_in_json__ = false
|
|||||||
var __output_fastjson_format__ = false
|
var __output_fastjson_format__ = false
|
||||||
var __output_fastobi_format__ = false
|
var __output_fastobi_format__ = false
|
||||||
|
|
||||||
var __no_progress_bar__ = false
|
|
||||||
var __skip_empty__ = false
|
var __skip_empty__ = false
|
||||||
var __skip_on_error__ = false
|
var __skip_on_error__ = false
|
||||||
|
|
||||||
@@ -82,7 +81,7 @@ func InputOptionSet(options *getoptions.GetOpt) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func OutputModeOptionSet(options *getoptions.GetOpt, compressed bool) {
|
func OutputModeOptionSet(options *getoptions.GetOpt, compressed bool) {
|
||||||
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
|
options.BoolVar(obidefault.NoProgressBarPtr(), "no-progressbar", obidefault.NoProgressBar(),
|
||||||
options.Description("Disable the progress bar printing"))
|
options.Description("Disable the progress bar printing"))
|
||||||
|
|
||||||
if compressed {
|
if compressed {
|
||||||
@@ -224,13 +223,16 @@ func CLIAnalyzeOnly() int {
|
|||||||
|
|
||||||
func CLIProgressBar() bool {
|
func CLIProgressBar() bool {
|
||||||
// If the output is not a terminal, then we do not display the progress bar
|
// If the output is not a terminal, then we do not display the progress bar
|
||||||
o, _ := os.Stderr.Stat()
|
oe, _ := os.Stderr.Stat()
|
||||||
onTerminal := (o.Mode() & os.ModeCharDevice) == os.ModeCharDevice
|
onTerminal := (oe.Mode() & os.ModeCharDevice) == os.ModeCharDevice
|
||||||
if !onTerminal {
|
if !onTerminal {
|
||||||
log.Info("Stderr is redirected, progress bar disabled")
|
log.Info("Stderr is redirected, progress bar disabled")
|
||||||
}
|
}
|
||||||
|
|
||||||
return onTerminal && !__no_progress_bar__
|
oo, _ := os.Stdout.Stat()
|
||||||
|
toPipe := (oo.Mode() & os.ModeNamedPipe) == os.ModeNamedPipe
|
||||||
|
|
||||||
|
return onTerminal && !toPipe && obidefault.ProgressBar()
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLIOutPutFileName() string {
|
func CLIOutPutFileName() string {
|
||||||
|
|||||||
@@ -210,9 +210,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if CLIProgressBar() {
|
iterator = iterator.Speed("Reading sequences")
|
||||||
iterator = iterator.Speed("Reading sequences")
|
|
||||||
}
|
|
||||||
|
|
||||||
return iterator, nil
|
return iterator, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,9 +12,7 @@ import (
|
|||||||
func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
|
func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
|
||||||
terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {
|
terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {
|
||||||
|
|
||||||
if obiconvert.CLIProgressBar() {
|
iterator = iterator.Speed("Writing CSV")
|
||||||
iterator = iterator.Speed("Writing CSV")
|
|
||||||
}
|
|
||||||
|
|
||||||
opts := make([]WithOption, 0, 10)
|
opts := make([]WithOption, 0, 10)
|
||||||
|
|
||||||
|
|||||||
@@ -42,16 +42,19 @@ func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int,
|
|||||||
|
|
||||||
seqworld := obiutils.Make2DArray[float64](library_size, n_landmark)
|
seqworld := obiutils.Make2DArray[float64](library_size, n_landmark)
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowCount(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("[Sequence mapping]"),
|
progressbar.OptionShowCount(),
|
||||||
)
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("[Sequence mapping]"),
|
||||||
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(library_size, pbopt...)
|
bar = progressbar.NewOptions(library_size, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
waiting := sync.WaitGroup{}
|
waiting := sync.WaitGroup{}
|
||||||
waiting.Add(nworkers)
|
waiting.Add(nworkers)
|
||||||
@@ -66,7 +69,9 @@ func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int,
|
|||||||
match, lalign := obialign.FastLCSScore(landmark, seq, -1, &buffer)
|
match, lalign := obialign.FastLCSScore(landmark, seq, -1, &buffer)
|
||||||
coord[j] = float64(lalign - match)
|
coord[j] = float64(lalign - match)
|
||||||
}
|
}
|
||||||
bar.Add(1)
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
waiting.Done()
|
waiting.Done()
|
||||||
}
|
}
|
||||||
@@ -170,23 +175,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
|||||||
taxa.Set(i, taxon)
|
taxa.Set(i, taxon)
|
||||||
}
|
}
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar2 *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowCount(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("[Sequence Indexing]"),
|
progressbar.OptionShowCount(),
|
||||||
)
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("[Sequence Indexing]"),
|
||||||
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(len(library), pbopt...)
|
bar2 = progressbar.NewOptions(len(library), pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
for i, seq := range library {
|
for i, seq := range library {
|
||||||
idx := obirefidx.GeomIndexSesquence(i, library, taxa, taxo)
|
idx := obirefidx.GeomIndexSesquence(i, library, taxa, taxo)
|
||||||
seq.SetOBITagGeomRefIndex(idx)
|
seq.SetOBITagGeomRefIndex(idx)
|
||||||
|
|
||||||
if i%10 == 0 {
|
if bar2 != nil && i%10 == 0 {
|
||||||
bar.Add(10)
|
bar2.Add(10)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,8 +8,6 @@ import (
|
|||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
||||||
log "github.com/sirupsen/logrus"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// MaskingMode defines how to handle low-complexity regions
|
// MaskingMode defines how to handle low-complexity regions
|
||||||
@@ -37,62 +35,81 @@ const (
|
|||||||
// Returns: a SeqWorker function that can be applied to each sequence
|
// Returns: a SeqWorker function that can be applied to each sequence
|
||||||
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
||||||
|
|
||||||
// ========================================================================
|
nLogN := make([]float64, kmer_size+1)
|
||||||
// FUNCTION 1: emax - Calculate theoretical maximum entropy
|
for i := 1; i <= kmer_size; i++ {
|
||||||
// ========================================================================
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
// Computes the maximum entropy of a k-mer of length lseq containing words of size word_size.
|
}
|
||||||
//
|
|
||||||
// Maximum entropy depends on the theoretical optimal word distribution:
|
normTables := make([][]int, level_max+1)
|
||||||
// - If we have more positions (nw) than possible canonical words (na),
|
for ws := 1; ws <= level_max; ws++ {
|
||||||
// some words will appear multiple times
|
size := 1 << (ws * 2)
|
||||||
// - We calculate the entropy of a distribution where all words appear
|
normTables[ws] = make([]int, size)
|
||||||
// cov or cov+1 times (most uniform distribution possible)
|
for code := 0; code < size; code++ {
|
||||||
//
|
normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
|
||||||
// IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words
|
}
|
||||||
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
|
}
|
||||||
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
|
|
||||||
emax := func(lseq, word_size int) float64 {
|
type pair struct {
|
||||||
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
|
index int
|
||||||
na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization
|
value float64
|
||||||
|
}
|
||||||
// Case 1: Fewer positions than possible words
|
|
||||||
// Maximum entropy is simply log(nw) since we can have at most nw different words
|
slidingMin := func(data []float64, window int) {
|
||||||
if nw < na {
|
if len(data) == 0 || window <= 0 {
|
||||||
return math.Log(float64(nw))
|
return
|
||||||
}
|
}
|
||||||
|
if window >= len(data) {
|
||||||
// Case 2: More positions than possible words
|
minVal := data[0]
|
||||||
// Some words must appear multiple times
|
for i := 1; i < len(data); i++ {
|
||||||
cov := nw / na // Average coverage (average number of occurrences per word)
|
if data[i] < minVal {
|
||||||
remains := nw - (na * cov) // Number of words that will have one additional occurrence
|
minVal = data[i]
|
||||||
|
}
|
||||||
// Calculate frequencies in the optimal distribution:
|
}
|
||||||
// - (na - remains) words appear cov times → frequency f1 = cov/nw
|
for i := range data {
|
||||||
// - remains words appear (cov+1) times → frequency f2 = (cov+1)/nw
|
data[i] = minVal
|
||||||
f1 := float64(cov) / float64(nw)
|
}
|
||||||
f2 := float64(cov+1) / float64(nw)
|
return
|
||||||
|
}
|
||||||
// Shannon entropy: H = -Σ p(i) * log(p(i))
|
|
||||||
// where p(i) is the probability of observing word i
|
deque := make([]pair, 0, window)
|
||||||
return -(float64(na-remains)*f1*math.Log(f1) +
|
|
||||||
float64(remains)*f2*math.Log(f2))
|
for i, v := range data {
|
||||||
|
for len(deque) > 0 && deque[0].index <= i-window {
|
||||||
|
deque = deque[1:]
|
||||||
|
}
|
||||||
|
|
||||||
|
for len(deque) > 0 && deque[len(deque)-1].value >= v {
|
||||||
|
deque = deque[:len(deque)-1]
|
||||||
|
}
|
||||||
|
|
||||||
|
deque = append(deque, pair{index: i, value: v})
|
||||||
|
|
||||||
|
data[i] = deque[0].value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
emaxValues := make([]float64, level_max+1)
|
||||||
|
logNwords := make([]float64, level_max+1)
|
||||||
|
for ws := 1; ws <= level_max; ws++ {
|
||||||
|
nw := kmer_size - ws + 1
|
||||||
|
na := obikmer.CanonicalCircularKmerCount(ws)
|
||||||
|
if nw < na {
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = math.Log(float64(nw))
|
||||||
|
} else {
|
||||||
|
cov := nw / na
|
||||||
|
remains := nw - (na * cov)
|
||||||
|
f1 := float64(cov) / float64(nw)
|
||||||
|
f2 := float64(cov+1) / float64(nw)
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities
|
|
||||||
// ========================================================================
|
|
||||||
// Identifies positions with ambiguous nucleotides (N, Y, R, etc.) and marks
|
|
||||||
// all k-mers that contain them.
|
|
||||||
//
|
|
||||||
// Returns: a slice where maskPositions[i] = -1 if position i is part of a
|
|
||||||
// k-mer containing an ambiguity, 0 otherwise
|
|
||||||
maskAmbiguities := func(sequence []byte) []int {
|
maskAmbiguities := func(sequence []byte) []int {
|
||||||
maskPositions := make([]int, len(sequence))
|
maskPositions := make([]int, len(sequence))
|
||||||
for i, nuc := range sequence {
|
for i, nuc := range sequence {
|
||||||
// If nucleotide is not a, c, g or t (lowercase), it's an ambiguity
|
|
||||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||||
// Mark all positions of k-mers that contain this nucleotide
|
|
||||||
// A k-mer starting at position (i - kmer_size + 1) will contain position i
|
|
||||||
end := max(0, i-kmer_size+1)
|
end := max(0, i-kmer_size+1)
|
||||||
for j := i; j >= end; j-- {
|
for j := i; j >= end; j-- {
|
||||||
maskPositions[j] = -1
|
maskPositions[j] = -1
|
||||||
@@ -102,182 +119,87 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
return maskPositions
|
return maskPositions
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// FUNCTION 3: cleanTable - Reset a frequency table to zero
|
|
||||||
// ========================================================================
|
|
||||||
cleanTable := func(table []int, over int) {
|
cleanTable := func(table []int, over int) {
|
||||||
for i := 0; i < over; i++ {
|
for i := 0; i < over; i++ {
|
||||||
table[i] = 0
|
table[i] = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// FUNCTION 4: slidingMin - Calculate sliding minimum over a window
|
|
||||||
// ========================================================================
|
|
||||||
// Applies a sliding window of size window over data and replaces each
|
|
||||||
// value with the minimum in the window centered on that position.
|
|
||||||
//
|
|
||||||
// Uses a MinMultiset to efficiently maintain the minimum in the window.
|
|
||||||
slidingMin := func(data []float64, window int) {
|
|
||||||
minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })
|
|
||||||
ldata := len(data)
|
|
||||||
mem := make([]float64, window) // Circular buffer to store window values
|
|
||||||
|
|
||||||
// Initialize buffer with sentinel value
|
|
||||||
for i := range mem {
|
|
||||||
mem[i] = 10000
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, v := range data {
|
|
||||||
// Get the old value leaving the window
|
|
||||||
m := mem[i%window]
|
|
||||||
mem[i%window] = v
|
|
||||||
|
|
||||||
// Remove old value from multiset if it was valid
|
|
||||||
if m < 10000 {
|
|
||||||
minimier.RemoveOne(m)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add new value if full window is ahead of us
|
|
||||||
if (ldata - i) >= window {
|
|
||||||
minimier.Add(v)
|
|
||||||
}
|
|
||||||
|
|
||||||
// log.Warnf("taille du minimier %d @ %d", minimier.Len(), i)
|
|
||||||
|
|
||||||
// Retrieve and store current minimum
|
|
||||||
var ok bool
|
|
||||||
if data[i], ok = minimier.Min(); !ok {
|
|
||||||
log.Error("problem with minimum entropy")
|
|
||||||
data[i] = 0.0
|
|
||||||
}
|
|
||||||
|
|
||||||
//xx, _ := minimier.Min()
|
|
||||||
//log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// FUNCTION 5: computeEntropies - Calculate normalized entropy for each position
|
|
||||||
// ========================================================================
|
|
||||||
// This is the central function that calculates the entropy of each k-mer in the sequence
|
|
||||||
// at a given scale (wordSize).
|
|
||||||
//
|
|
||||||
// Algorithm:
|
|
||||||
// 1. Encode the sequence into words (subsequences of size wordSize)
|
|
||||||
// 2. For each k-mer, count the frequencies of words it contains
|
|
||||||
// 3. Calculate normalized entropy = observed_entropy / maximum_entropy
|
|
||||||
// 4. Apply a sliding min filter to smooth results
|
|
||||||
//
|
|
||||||
// IMPORTANT: Line 147 uses NormalizeInt for circular normalization of words!
|
|
||||||
// This means "atg", "tga", and "gat" are considered the same word.
|
|
||||||
computeEntropies := func(sequence []byte,
|
computeEntropies := func(sequence []byte,
|
||||||
maskPositions []int, // Positions of ambiguities
|
maskPositions []int,
|
||||||
entropies []float64, // Output: normalized entropies for each position
|
entropies []float64,
|
||||||
table []int, // Frequency table for words (reused between calls)
|
table []int,
|
||||||
words []int, // Buffer to store encoded words (reused)
|
words []int,
|
||||||
wordSize int) { // Word size (scale of analysis)
|
wordSize int,
|
||||||
|
normTable []int) {
|
||||||
|
|
||||||
lseq := len(sequence) // Sequence length
|
lseq := len(sequence)
|
||||||
tableSize := 1 << (wordSize * 2) // Actual table size (must fit all codes 0 to 4^wordSize-1)
|
tableSize := 1 << (wordSize * 2)
|
||||||
nwords := kmer_size - wordSize + 1 // Number of words in a k-mer
|
nwords := kmer_size - wordSize + 1
|
||||||
float_nwords := float64(nwords)
|
float_nwords := float64(nwords)
|
||||||
log_nwords := math.Log(float_nwords) // log(nwords) used in entropy calculation
|
log_nwords := logNwords[wordSize]
|
||||||
entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally)
|
entropyMax := emaxValues[wordSize]
|
||||||
|
|
||||||
// Reset frequency table (must clear entire table, not just nalpha entries)
|
|
||||||
cleanTable(table, tableSize)
|
cleanTable(table, tableSize)
|
||||||
|
|
||||||
for i := 1; i < lseq; i++ {
|
for i := 1; i < lseq; i++ {
|
||||||
entropies[i] = 6
|
entropies[i] = 6
|
||||||
}
|
}
|
||||||
end := lseq - wordSize + 1 // Last position where a word can start
|
end := lseq - wordSize + 1
|
||||||
|
|
||||||
// ========================================================================
|
mask := (1 << (wordSize * 2)) - 1
|
||||||
// STEP 1: Encode all words in the sequence
|
|
||||||
// ========================================================================
|
|
||||||
// Uses left-shift encoding: each nucleotide is encoded on 2 bits
|
|
||||||
// a=00, c=01, g=10, t=11
|
|
||||||
|
|
||||||
mask := (1 << (wordSize * 2)) - 1 // Mask to keep only last wordSize*2 bits
|
|
||||||
|
|
||||||
// Initialize first word (all nucleotides except the last one)
|
|
||||||
word_index := 0
|
word_index := 0
|
||||||
for i := 0; i < wordSize-1; i++ {
|
for i := 0; i < wordSize-1; i++ {
|
||||||
word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
|
word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Encode all words with sliding window
|
|
||||||
for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
|
for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
|
||||||
// Shift left by 2 bits, mask, and add new nucleotide
|
|
||||||
word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
|
word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
|
||||||
|
words[i] = normTable[word_index]
|
||||||
// *** CIRCULAR NORMALIZATION ***
|
|
||||||
// Convert word to its canonical form (smallest by circular rotation)
|
|
||||||
// This is where "atg", "tga", "gat" all become "atg"
|
|
||||||
// Now using uint64-based NormalizeCircular for better performance
|
|
||||||
words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
s := 0
|
||||||
// STEP 2: Calculate entropy for each k-mer with sliding window
|
sum_n_logn := 0.0
|
||||||
// ========================================================================
|
entropy := 1.0
|
||||||
s := 0 // Number of words processed in current k-mer
|
cleaned := true
|
||||||
sum_n_logn := 0.0 // Sum of n*log(n) for entropy calculation
|
|
||||||
entropy := 1.0 // Current normalized entropy
|
|
||||||
cleaned := true // Flag indicating if table has been cleaned
|
|
||||||
|
|
||||||
for i := range end {
|
for i := range end {
|
||||||
s++
|
s++
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
// CASE 1: Filling phase (fewer than nwords words collected)
|
|
||||||
case s < nwords:
|
case s < nwords:
|
||||||
cleaned = false
|
cleaned = false
|
||||||
table[words[i]]++ // Increment word frequency
|
table[words[i]]++
|
||||||
|
|
||||||
// CASE 2: Position contains an ambiguity
|
|
||||||
case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
|
case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
|
||||||
entropies[i-nwords+1] = 4.0 // Mark entropy as invalid
|
entropies[i-nwords+1] = 4.0
|
||||||
if !cleaned {
|
if !cleaned {
|
||||||
cleanTable(table, tableSize) // Reset table
|
cleanTable(table, tableSize)
|
||||||
}
|
}
|
||||||
cleaned = true
|
cleaned = true
|
||||||
s = 0
|
s = 0
|
||||||
sum_n_logn = 0.0
|
sum_n_logn = 0.0
|
||||||
|
|
||||||
// CASE 3: First complete k-mer (s == nwords)
|
|
||||||
case s == nwords:
|
case s == nwords:
|
||||||
cleaned = false
|
cleaned = false
|
||||||
table[words[i]]++
|
table[words[i]]++
|
||||||
|
|
||||||
// Calculate Shannon entropy: H = -Σ p(i)*log(p(i))
|
|
||||||
// = log(N) - (1/N)*Σ n(i)*log(n(i))
|
|
||||||
// where N = nwords, n(i) = frequency of word i
|
|
||||||
//
|
|
||||||
// NOTE: We iterate over entire table (tableSize = 4^wordSize) to count all frequencies.
|
|
||||||
// Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15})
|
|
||||||
// so we must scan the full table even though only ~10 entries will be non-zero
|
|
||||||
sum_n_logn = 0
|
sum_n_logn = 0
|
||||||
for j := range tableSize {
|
for j := range tableSize {
|
||||||
n := float64(table[j])
|
n := float64(table[j])
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
sum_n_logn += n * math.Log(n)
|
sum_n_logn += nLogN[int(n)]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Normalized entropy = observed entropy / maximum entropy
|
|
||||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||||
|
|
||||||
// CASE 4: Sliding window (s > nwords)
|
|
||||||
// Incremental update of entropy by adding a new word
|
|
||||||
// and removing the old one
|
|
||||||
case s > nwords:
|
case s > nwords:
|
||||||
cleaned = false
|
cleaned = false
|
||||||
|
|
||||||
new_word := words[i]
|
new_word := words[i]
|
||||||
old_word := words[i-nwords]
|
old_word := words[i-nwords]
|
||||||
|
|
||||||
// Optimization: only recalculate if word changes
|
|
||||||
if old_word != new_word {
|
if old_word != new_word {
|
||||||
table[new_word]++
|
table[new_word]++
|
||||||
table[old_word]--
|
table[old_word]--
|
||||||
@@ -285,59 +207,39 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
n_old := float64(table[old_word])
|
n_old := float64(table[old_word])
|
||||||
n_new := float64(table[new_word])
|
n_new := float64(table[new_word])
|
||||||
|
|
||||||
// Incremental update of sum_n_logn
|
sum_n_logn -= nLogN[int(n_old+1)]
|
||||||
// Remove contribution of old word (before decrement)
|
|
||||||
sum_n_logn -= (n_old + 1) * math.Log(n_old+1)
|
|
||||||
// Add contribution of old word (after decrement)
|
|
||||||
if n_old > 0 {
|
if n_old > 0 {
|
||||||
sum_n_logn += n_old * math.Log(n_old)
|
sum_n_logn += nLogN[int(n_old)]
|
||||||
}
|
}
|
||||||
// Add contribution of new word (after increment)
|
|
||||||
if n_new > 0 {
|
if n_new > 0 {
|
||||||
sum_n_logn += n_new * math.Log(n_new)
|
sum_n_logn += nLogN[int(n_new)]
|
||||||
}
|
}
|
||||||
// Remove contribution of new word (before increment)
|
|
||||||
if n_new > 1 {
|
if n_new > 1 {
|
||||||
sum_n_logn -= (n_new - 1) * math.Log(n_new-1)
|
sum_n_logn -= nLogN[int(n_new-1)]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store entropy for position corresponding to start of k-mer
|
|
||||||
if s >= nwords && maskPositions[i-nwords+1] >= 0 {
|
if s >= nwords && maskPositions[i-nwords+1] >= 0 {
|
||||||
if entropy < 0 {
|
if entropy < 0 {
|
||||||
entropy = 0
|
entropy = 0
|
||||||
|
|
||||||
}
|
}
|
||||||
entropy = math.Round(entropy*10000) / 10000
|
entropy = math.Round(entropy*10000) / 10000
|
||||||
entropies[i-nwords+1] = entropy
|
entropies[i-nwords+1] = entropy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// STEP 3: Apply sliding min filter
|
|
||||||
// ========================================================================
|
|
||||||
// Replace each entropy with minimum in window of size kmer_size
|
|
||||||
// This allows robust detection of low-complexity regions
|
|
||||||
slidingMin(entropies, kmer_size)
|
slidingMin(entropies, kmer_size)
|
||||||
// log.Warnf("%v\n%v", e, entropies)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// FUNCTION 6: applyMaskMode - Apply masking to sequence
|
|
||||||
// ========================================================================
|
|
||||||
applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
|
applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
|
||||||
// Create copy to avoid modifying original
|
|
||||||
seqCopy := sequence.Copy()
|
seqCopy := sequence.Copy()
|
||||||
sequenceBytes := seqCopy.Sequence()
|
sequenceBytes := seqCopy.Sequence()
|
||||||
|
|
||||||
// Mask identified positions
|
|
||||||
for i := range sequenceBytes {
|
for i := range sequenceBytes {
|
||||||
if maskPositions[i] {
|
if maskPositions[i] {
|
||||||
// Operation &^ 32 converts to UPPERCASE (clears bit 5)
|
|
||||||
// sequenceBytes[i] = sequenceBytes[i] &^ 32
|
|
||||||
sequenceBytes[i] = mask
|
sequenceBytes[i] = mask
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -368,7 +270,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle the case where we end in a masked region
|
|
||||||
if inlow && fromlow >= 0 {
|
if inlow && fromlow >= 0 {
|
||||||
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -403,7 +304,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle the case where we end in an unmasked region
|
|
||||||
if inhigh && fromhigh >= 0 {
|
if inhigh && fromhigh >= 0 {
|
||||||
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -415,11 +315,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
return *rep, nil
|
return *rep, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// FUNCTION 7: masking - Main masking function
|
|
||||||
// ========================================================================
|
|
||||||
// Calculates entropies at all scales and masks positions
|
|
||||||
// whose minimum entropy is below the threshold.
|
|
||||||
masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||||
if sequence.Len() < kmer_size {
|
if sequence.Len() < kmer_size {
|
||||||
sequence.SetAttribute("obilowmask_error", "Sequence too short")
|
sequence.SetAttribute("obilowmask_error", "Sequence too short")
|
||||||
@@ -432,45 +327,27 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
|
|
||||||
bseq := sequence.Sequence()
|
bseq := sequence.Sequence()
|
||||||
|
|
||||||
// Identify ambiguities
|
|
||||||
maskPositions := maskAmbiguities(bseq)
|
maskPositions := maskAmbiguities(bseq)
|
||||||
|
|
||||||
// Initialize data structures
|
mask := make([]int, len(bseq))
|
||||||
mask := make([]int, len(bseq)) // Stores scale detecting minimum entropy
|
entropies := make([]float64, len(bseq))
|
||||||
entropies := make([]float64, len(bseq)) // Minimum entropy at each position
|
|
||||||
for i := range entropies {
|
for i := range entropies {
|
||||||
entropies[i] = 4.0 // Very high initial value
|
entropies[i] = 4.0
|
||||||
}
|
}
|
||||||
|
|
||||||
freqs := make([]int, 1<<(2*level_max)) // Frequency table (max size)
|
freqs := make([]int, 1<<(2*level_max))
|
||||||
words := make([]int, len(bseq)) // Buffer for encoded words
|
words := make([]int, len(bseq))
|
||||||
|
entropies2 := make([]float64, len(bseq))
|
||||||
|
|
||||||
// ========================================================================
|
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
|
||||||
// Calculate entropy at maximum scale (level_max)
|
|
||||||
// ========================================================================
|
|
||||||
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max)
|
|
||||||
|
|
||||||
// Initialize mask with level_max everywhere (except ambiguities)
|
|
||||||
for i := range bseq {
|
for i := range bseq {
|
||||||
v := level_max
|
v := level_max
|
||||||
// if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
|
||||||
// v = 0
|
|
||||||
// }
|
|
||||||
mask[i] = v
|
mask[i] = v
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// Calculate entropy at lower scales
|
|
||||||
// ========================================================================
|
|
||||||
entropies2 := make([]float64, len(bseq))
|
|
||||||
|
|
||||||
for ws := level_max - 1; ws > 0; ws-- {
|
for ws := level_max - 1; ws > 0; ws-- {
|
||||||
// *** WARNING: POTENTIAL BUG ***
|
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
|
||||||
// The parameter passed is level_max instead of ws!
|
|
||||||
// This means we always recalculate with the same scale
|
|
||||||
// Should be: computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
|
|
||||||
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
|
|
||||||
// Keep minimum entropy and corresponding scale
|
|
||||||
for i, e2 := range entropies2 {
|
for i, e2 := range entropies2 {
|
||||||
if e2 < entropies[i] {
|
if e2 < entropies[i] {
|
||||||
entropies[i] = e2
|
entropies[i] = e2
|
||||||
@@ -479,22 +356,17 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Force entropy to 0 for ambiguous positions
|
|
||||||
for i, nuc := range bseq {
|
for i, nuc := range bseq {
|
||||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||||
entropies[i] = 0
|
entropies[i] = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
|
||||||
// Identify positions to mask
|
|
||||||
// ========================================================================
|
|
||||||
remove := make([]bool, len(entropies))
|
remove := make([]bool, len(entropies))
|
||||||
for i, e := range entropies {
|
for i, e := range entropies {
|
||||||
remove[i] = e <= threshold
|
remove[i] = e <= threshold
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save metadata in sequence attributes
|
|
||||||
sequence.SetAttribute("mask", mask)
|
sequence.SetAttribute("mask", mask)
|
||||||
sequence.SetAttribute("Entropies", entropies)
|
sequence.SetAttribute("Entropies", entropies)
|
||||||
|
|
||||||
@@ -527,9 +399,7 @@ func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequenc
|
|||||||
CLIMaskingChar(),
|
CLIMaskingChar(),
|
||||||
)
|
)
|
||||||
|
|
||||||
// Apply worker in parallel
|
|
||||||
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
|
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
|
||||||
|
|
||||||
// Filter resulting empty sequences
|
|
||||||
return newIter.FilterEmpty()
|
return newIter.FilterEmpty()
|
||||||
}
|
}
|
||||||
|
|||||||
40
pkg/obitools/obilowmask/obilowmask_test.go
Normal file
40
pkg/obitools/obilowmask/obilowmask_test.go
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
package obilowmask
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLowMaskWorker(t *testing.T) {
|
||||||
|
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||||
|
result, err := worker(seq)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Worker failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.Len() != 1 {
|
||||||
|
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||||
|
}
|
||||||
|
|
||||||
|
resultSeq := result[0]
|
||||||
|
if resultSeq.Len() != 32 {
|
||||||
|
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
|
||||||
|
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||||
|
result, err := worker(seq)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Worker failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.Len() != 1 {
|
||||||
|
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -207,16 +207,19 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
|
|
||||||
log.Infof("Done. Found %d clusters", clusters.Len())
|
log.Infof("Done. Found %d clusters", clusters.Len())
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowCount(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("Cluster indexing"),
|
progressbar.OptionShowCount(),
|
||||||
)
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("Cluster indexing"),
|
||||||
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(len(clusters), pbopt...)
|
bar = progressbar.NewOptions(len(clusters), pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
limits := make(chan [2]int)
|
limits := make(chan [2]int)
|
||||||
waiting := sync.WaitGroup{}
|
waiting := sync.WaitGroup{}
|
||||||
@@ -233,7 +236,9 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
for i := l[0]; i < l[1]; i++ {
|
for i := l[0]; i < l[1]; i++ {
|
||||||
idx := IndexSequence(i, clusters, &kcluster, taxa, taxonomy)
|
idx := IndexSequence(i, clusters, &kcluster, taxa, taxonomy)
|
||||||
clusters[i].SetOBITagRefIndex(idx)
|
clusters[i].SetOBITagRefIndex(idx)
|
||||||
bar.Add(1)
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -239,16 +239,19 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
|
|
||||||
log.Info("done")
|
log.Info("done")
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
var bar *progressbar.ProgressBar
|
||||||
pbopt = append(pbopt,
|
if obidefault.ProgressBar() {
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
progressbar.OptionSetWidth(15),
|
pbopt = append(pbopt,
|
||||||
progressbar.OptionShowCount(),
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
progressbar.OptionShowIts(),
|
progressbar.OptionSetWidth(15),
|
||||||
progressbar.OptionSetDescription("[Sequence Processing]"),
|
progressbar.OptionShowCount(),
|
||||||
)
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("[Sequence Processing]"),
|
||||||
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(len(references), pbopt...)
|
bar = progressbar.NewOptions(len(references), pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
limits := make(chan [2]int)
|
limits := make(chan [2]int)
|
||||||
indexed := obiiter.MakeIBioSequence()
|
indexed := obiiter.MakeIBioSequence()
|
||||||
@@ -267,7 +270,9 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
iref := references[i].Copy()
|
iref := references[i].Copy()
|
||||||
iref.SetOBITagRefIndex(idx)
|
iref.SetOBITagRefIndex(idx)
|
||||||
sl = append(sl, iref)
|
sl = append(sl, iref)
|
||||||
bar.Add(1)
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
indexed.Push(obiiter.MakeBioSequenceBatch(source, l[0]/10, sl))
|
indexed.Push(obiiter.MakeBioSequenceBatch(source, l[0]/10, sl))
|
||||||
}
|
}
|
||||||
|
|||||||
10
pkg/obitools/obisuperkmer/obisuperkmer.go
Normal file
10
pkg/obitools/obisuperkmer/obisuperkmer.go
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
// obisuperkmer function utility package.
|
||||||
|
//
|
||||||
|
// The obitools/obisuperkmer package contains every
|
||||||
|
// function specifically required by the obisuperkmer utility.
|
||||||
|
//
|
||||||
|
// The obisuperkmer command extracts super k-mers from DNA sequences.
|
||||||
|
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||||
|
// share the same minimizer. This decomposition is useful for efficient
|
||||||
|
// k-mer indexing and analysis in bioinformatics applications.
|
||||||
|
package obisuperkmer
|
||||||
69
pkg/obitools/obisuperkmer/options.go
Normal file
69
pkg/obitools/obisuperkmer/options.go
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
package obisuperkmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Private variables for storing option values
|
||||||
|
var _KmerSize = 31
|
||||||
|
var _MinimizerSize = 13
|
||||||
|
|
||||||
|
// SuperKmerOptionSet defines every option related to super k-mer extraction.
|
||||||
|
//
|
||||||
|
// The function adds to a CLI every option proposed to the user
|
||||||
|
// to tune the parameters of the super k-mer extraction algorithm.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - options: is a pointer to a getoptions.GetOpt instance normally
|
||||||
|
// produced by the obioptions.GenerateOptionParser function.
|
||||||
|
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
|
||||||
|
options.Alias("k"),
|
||||||
|
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
||||||
|
|
||||||
|
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
|
||||||
|
options.Alias("m"),
|
||||||
|
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// OptionSet adds to the basic option set every option declared for
|
||||||
|
// the obisuperkmer command.
|
||||||
|
//
|
||||||
|
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.OptionSet(false)(options)
|
||||||
|
SuperKmerOptionSet(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
|
||||||
|
//
|
||||||
|
// It does not take any parameters.
|
||||||
|
// It returns an integer representing the k-mer size.
|
||||||
|
func CLIKmerSize() int {
|
||||||
|
return _KmerSize
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetKmerSize sets the k-mer size for super k-mer extraction.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: the k-mer size (must be between m+1 and 31).
|
||||||
|
func SetKmerSize(k int) {
|
||||||
|
_KmerSize = k
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
|
||||||
|
//
|
||||||
|
// It does not take any parameters.
|
||||||
|
// It returns an integer representing the minimizer size.
|
||||||
|
func CLIMinimizerSize() int {
|
||||||
|
return _MinimizerSize
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetMinimizerSize sets the minimizer size for super k-mer extraction.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - m: the minimizer size (must be between 1 and k-1).
|
||||||
|
func SetMinimizerSize(m int) {
|
||||||
|
_MinimizerSize = m
|
||||||
|
}
|
||||||
59
pkg/obitools/obisuperkmer/superkmer.go
Normal file
59
pkg/obitools/obisuperkmer/superkmer.go
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
package obisuperkmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
|
||||||
|
//
|
||||||
|
// This function takes an iterator of BioSequence objects, extracts super k-mers
|
||||||
|
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
|
||||||
|
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
|
||||||
|
//
|
||||||
|
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
|
||||||
|
// the same minimizer. The resulting BioSequences contain metadata including:
|
||||||
|
// - minimizer_value: the canonical minimizer value
|
||||||
|
// - minimizer_seq: the DNA sequence of the minimizer
|
||||||
|
// - k: the k-mer size used
|
||||||
|
// - m: the minimizer size used
|
||||||
|
// - start: starting position in the original sequence
|
||||||
|
// - end: ending position in the original sequence
|
||||||
|
// - parent_id: ID of the parent sequence
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - iterator: an iterator yielding BioSequence objects to process.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - An iterator yielding BioSequence objects representing super k-mers.
|
||||||
|
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
|
// Get k-mer and minimizer sizes from CLI options
|
||||||
|
k := CLIKmerSize()
|
||||||
|
m := CLIMinimizerSize()
|
||||||
|
|
||||||
|
// Validate parameters
|
||||||
|
if m < 1 || m >= k {
|
||||||
|
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||||
|
}
|
||||||
|
|
||||||
|
if k < 2 || k > 31 {
|
||||||
|
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||||
|
|
||||||
|
// Create the worker for super k-mer extraction
|
||||||
|
worker := obikmer.SuperKmerWorker(k, m)
|
||||||
|
|
||||||
|
// Apply the worker to the iterator with parallel processing
|
||||||
|
newIter := iterator.MakeIWorker(
|
||||||
|
worker,
|
||||||
|
false, // don't merge results
|
||||||
|
obidefault.ParallelWorkers(),
|
||||||
|
)
|
||||||
|
|
||||||
|
return newIter
|
||||||
|
}
|
||||||
149
pkg/obitools/obisuperkmer/superkmer_test.go
Normal file
149
pkg/obitools/obisuperkmer/superkmer_test.go
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
package obisuperkmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCLIExtractSuperKmers(t *testing.T) {
|
||||||
|
// Create a test sequence
|
||||||
|
testSeq := obiseq.NewBioSequence(
|
||||||
|
"test_seq",
|
||||||
|
[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
|
||||||
|
// Create a batch with the test sequence
|
||||||
|
batch := obiseq.NewBioSequenceBatch()
|
||||||
|
batch.Add(testSeq)
|
||||||
|
|
||||||
|
// Create an iterator from the batch
|
||||||
|
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
||||||
|
go func() {
|
||||||
|
iterator.Push(batch)
|
||||||
|
iterator.Close()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Set test parameters
|
||||||
|
SetKmerSize(15)
|
||||||
|
SetMinimizerSize(7)
|
||||||
|
|
||||||
|
// Extract super k-mers
|
||||||
|
result := CLIExtractSuperKmers(iterator)
|
||||||
|
|
||||||
|
// Count the number of super k-mers
|
||||||
|
count := 0
|
||||||
|
for result.Next() {
|
||||||
|
batch := result.Get()
|
||||||
|
for _, sk := range batch.Slice() {
|
||||||
|
count++
|
||||||
|
|
||||||
|
// Verify that the super k-mer has the expected attributes
|
||||||
|
if !sk.HasAttribute("minimizer_value") {
|
||||||
|
t.Error("Super k-mer missing 'minimizer_value' attribute")
|
||||||
|
}
|
||||||
|
if !sk.HasAttribute("minimizer_seq") {
|
||||||
|
t.Error("Super k-mer missing 'minimizer_seq' attribute")
|
||||||
|
}
|
||||||
|
if !sk.HasAttribute("k") {
|
||||||
|
t.Error("Super k-mer missing 'k' attribute")
|
||||||
|
}
|
||||||
|
if !sk.HasAttribute("m") {
|
||||||
|
t.Error("Super k-mer missing 'm' attribute")
|
||||||
|
}
|
||||||
|
if !sk.HasAttribute("start") {
|
||||||
|
t.Error("Super k-mer missing 'start' attribute")
|
||||||
|
}
|
||||||
|
if !sk.HasAttribute("end") {
|
||||||
|
t.Error("Super k-mer missing 'end' attribute")
|
||||||
|
}
|
||||||
|
if !sk.HasAttribute("parent_id") {
|
||||||
|
t.Error("Super k-mer missing 'parent_id' attribute")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify attribute values
|
||||||
|
k, _ := sk.GetIntAttribute("k")
|
||||||
|
m, _ := sk.GetIntAttribute("m")
|
||||||
|
|
||||||
|
if k != 15 {
|
||||||
|
t.Errorf("Expected k=15, got k=%d", k)
|
||||||
|
}
|
||||||
|
if m != 7 {
|
||||||
|
t.Errorf("Expected m=7, got m=%d", m)
|
||||||
|
}
|
||||||
|
|
||||||
|
parentID, _ := sk.GetStringAttribute("parent_id")
|
||||||
|
if parentID != "test_seq" {
|
||||||
|
t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if count == 0 {
|
||||||
|
t.Error("No super k-mers were extracted")
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("Extracted %d super k-mers from test sequence", count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestOptionGettersAndSetters(t *testing.T) {
|
||||||
|
// Test initial values
|
||||||
|
if CLIKmerSize() != 21 {
|
||||||
|
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
|
||||||
|
}
|
||||||
|
if CLIMinimizerSize() != 11 {
|
||||||
|
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test setters
|
||||||
|
SetKmerSize(25)
|
||||||
|
SetMinimizerSize(13)
|
||||||
|
|
||||||
|
if CLIKmerSize() != 25 {
|
||||||
|
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
|
||||||
|
}
|
||||||
|
if CLIMinimizerSize() != 13 {
|
||||||
|
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset to defaults
|
||||||
|
SetKmerSize(21)
|
||||||
|
SetMinimizerSize(11)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
|
||||||
|
// Create a longer test sequence
|
||||||
|
longSeq := make([]byte, 1000)
|
||||||
|
bases := []byte{'A', 'C', 'G', 'T'}
|
||||||
|
for i := range longSeq {
|
||||||
|
longSeq[i] = bases[i%4]
|
||||||
|
}
|
||||||
|
|
||||||
|
testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")
|
||||||
|
|
||||||
|
// Set parameters
|
||||||
|
SetKmerSize(21)
|
||||||
|
SetMinimizerSize(11)
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
batch := obiseq.NewBioSequenceBatch()
|
||||||
|
batch.Add(testSeq)
|
||||||
|
|
||||||
|
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
||||||
|
go func() {
|
||||||
|
iterator.Push(batch)
|
||||||
|
iterator.Close()
|
||||||
|
}()
|
||||||
|
|
||||||
|
result := CLIExtractSuperKmers(iterator)
|
||||||
|
|
||||||
|
// Consume the iterator
|
||||||
|
for result.Next() {
|
||||||
|
result.Get()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1 +1 @@
|
|||||||
4.4.11
|
4.4.12
|
||||||
|
|||||||
Reference in New Issue
Block a user