mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Compare commits
20 Commits
Release_4.
...
Release_4.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
09d437d10f | ||
|
|
d00ab6f83a | ||
|
|
8037860518 | ||
|
|
43d6cbe56a | ||
|
|
6dadee9371 | ||
|
|
99a8e69d10 | ||
|
|
c0ae49ef92 | ||
|
|
08490420a2 | ||
|
|
1a28d5ed64 | ||
|
|
b2d16721f0 | ||
|
|
7c12b1ee83 | ||
|
|
db98ddb241 | ||
|
|
7a979ba77f | ||
|
|
00c8be6b48 | ||
|
|
4ae331db36 | ||
|
|
f1e2846d2d | ||
|
|
cd5562fb30 | ||
|
|
f79b018430 | ||
|
|
da8d851d4d | ||
|
|
9823bcb41b |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -31,3 +31,5 @@ LLM/**
|
||||
*_files
|
||||
|
||||
entropy.html
|
||||
bug_id.txt
|
||||
obilowmask_ref
|
||||
|
||||
29
Makefile
29
Makefile
@@ -133,14 +133,33 @@ jjpush:
|
||||
@jj auto-describe
|
||||
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
||||
@jj new
|
||||
@$(MAKE) bump-version
|
||||
@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
|
||||
@jj auto-describe
|
||||
@version=$$(cat version.txt); \
|
||||
@previous_version=$$(cat version.txt); \
|
||||
$(MAKE) bump-version; \
|
||||
version=$$(cat version.txt); \
|
||||
tag_name="Release_$$version"; \
|
||||
previous_tag="Release_$$previous_version"; \
|
||||
echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
|
||||
jj auto-describe; \
|
||||
echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
|
||||
if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
|
||||
release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
|
||||
orla agent -m ollama:qwen3-coder-next:latest \
|
||||
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
|
||||
release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
|
||||
release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
|
||||
release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
|
||||
if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
|
||||
release_message="$$release_title"$$'\n\n'"$$release_body"; \
|
||||
else \
|
||||
echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
|
||||
release_message="Release $$version"$$'\n\n'"$$release_json"; \
|
||||
fi; \
|
||||
else \
|
||||
release_message="Release $$version"; \
|
||||
fi; \
|
||||
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
|
||||
jj git push --change @; \
|
||||
git tag -a "$$tag_name" -m "Release $$version" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
||||
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
||||
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
|
||||
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
|
||||
|
||||
|
||||
735
blackboard/architechture/architecture-commande-obitools.md
Normal file
735
blackboard/architechture/architecture-commande-obitools.md
Normal file
@@ -0,0 +1,735 @@
|
||||
# Architecture d'une commande OBITools
|
||||
|
||||
## Vue d'ensemble
|
||||
|
||||
Une commande OBITools suit une architecture modulaire et standardisée qui sépare clairement les responsabilités entre :
|
||||
- Le package de la commande dans `pkg/obitools/<nom_commande>/`
|
||||
- L'exécutable dans `cmd/obitools/<nom_commande>/`
|
||||
|
||||
Cette architecture favorise la réutilisabilité du code, la testabilité et la cohérence entre les différentes commandes de la suite OBITools.
|
||||
|
||||
## Structure du projet
|
||||
|
||||
```
|
||||
obitools4/
|
||||
├── pkg/obitools/
|
||||
│ ├── obiconvert/ # Commande de conversion (base pour toutes)
|
||||
│ │ ├── obiconvert.go # Fonctions vides (pas d'implémentation)
|
||||
│ │ ├── options.go # Définition des options CLI
|
||||
│ │ ├── sequence_reader.go # Lecture des séquences
|
||||
│ │ └── sequence_writer.go # Écriture des séquences
|
||||
│ ├── obiuniq/ # Commande de déréplication
|
||||
│ │ ├── obiuniq.go # (fichier vide)
|
||||
│ │ ├── options.go # Options spécifiques à obiuniq
|
||||
│ │ └── unique.go # Implémentation du traitement
|
||||
│ ├── obipairing/ # Assemblage de lectures paired-end
|
||||
│ ├── obisummary/ # Résumé de fichiers de séquences
|
||||
│ └── obimicrosat/ # Détection de microsatellites
|
||||
└── cmd/obitools/
|
||||
├── obiconvert/
|
||||
│ └── main.go # Point d'entrée de la commande
|
||||
├── obiuniq/
|
||||
│ └── main.go
|
||||
├── obipairing/
|
||||
│ └── main.go
|
||||
├── obisummary/
|
||||
│ └── main.go
|
||||
└── obimicrosat/
|
||||
└── main.go
|
||||
```
|
||||
|
||||
## Composants de l'architecture
|
||||
|
||||
### 1. Package `pkg/obitools/<commande>/`
|
||||
|
||||
Chaque commande possède son propre package dans `pkg/obitools/` qui contient l'implémentation complète de la logique métier. Ce package est structuré en plusieurs fichiers :
|
||||
|
||||
#### a) `options.go` - Gestion des options CLI
|
||||
|
||||
Ce fichier définit :
|
||||
- Les **variables globales** privées (préfixées par `_`) stockant les valeurs des options
|
||||
- La fonction **`OptionSet()`** qui configure toutes les options pour la commande
|
||||
- Les fonctions **`CLI*()`** qui retournent les valeurs des options (getters)
|
||||
- Les fonctions **`Set*()`** qui permettent de définir les options programmatiquement (setters)
|
||||
|
||||
**Exemple (obiuniq/options.go) :**
|
||||
|
||||
```go
|
||||
package obiuniq
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Variables globales privées pour stocker les options
|
||||
var _StatsOn = make([]string, 0, 10)
|
||||
var _Keys = make([]string, 0, 10)
|
||||
var _InMemory = false
|
||||
var _chunks = 100
|
||||
|
||||
// Configuration des options spécifiques à la commande
|
||||
func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringSliceVar(&_StatsOn, "merge", 1, 1,
|
||||
options.Alias("m"),
|
||||
options.ArgName("KEY"),
|
||||
options.Description("Adds a merged attribute..."))
|
||||
|
||||
options.BoolVar(&_InMemory, "in-memory", _InMemory,
|
||||
options.Description("Use memory instead of disk..."))
|
||||
|
||||
options.IntVar(&_chunks, "chunk-count", _chunks,
|
||||
options.Description("In how many chunks..."))
|
||||
}
|
||||
|
||||
// OptionSet combine les options de base + les options spécifiques
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(false)(options) // Options de base
|
||||
UniqueOptionSet(options) // Options spécifiques
|
||||
}
|
||||
|
||||
// Getters pour accéder aux valeurs des options
|
||||
func CLIStatsOn() []string {
|
||||
return _StatsOn
|
||||
}
|
||||
|
||||
func CLIUniqueInMemory() bool {
|
||||
return _InMemory
|
||||
}
|
||||
|
||||
// Setters pour définir les options programmatiquement
|
||||
func SetUniqueInMemory(inMemory bool) {
|
||||
_InMemory = inMemory
|
||||
}
|
||||
```
|
||||
|
||||
**Convention de nommage :**
|
||||
- Variables privées : `_NomOption` (underscore préfixe)
|
||||
- Getters : `CLINomOption()` (préfixe CLI)
|
||||
- Setters : `SetNomOption()` (préfixe Set)
|
||||
|
||||
#### b) Fichier(s) d'implémentation
|
||||
|
||||
Un ou plusieurs fichiers contenant la logique métier de la commande :
|
||||
|
||||
**Exemple (obiuniq/unique.go) :**
|
||||
|
||||
```go
|
||||
package obiuniq
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obichunk"
|
||||
)
|
||||
|
||||
// Fonction CLI principale qui orchestre le traitement
|
||||
func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
// Récupération des options via les getters CLI*()
|
||||
options := make([]obichunk.WithOption, 0, 30)
|
||||
|
||||
options = append(options,
|
||||
obichunk.OptionBatchCount(CLINumberOfChunks()),
|
||||
)
|
||||
|
||||
if CLIUniqueInMemory() {
|
||||
options = append(options, obichunk.OptionSortOnMemory())
|
||||
} else {
|
||||
options = append(options, obichunk.OptionSortOnDisk())
|
||||
}
|
||||
|
||||
// Appel de la fonction de traitement réelle
|
||||
iUnique, err := obichunk.IUniqueSequence(sequences, options...)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
return iUnique
|
||||
}
|
||||
```
|
||||
|
||||
**Autres exemples d'implémentation :**
|
||||
|
||||
- **obimicrosat/microsat.go** : Contient `MakeMicrosatWorker()` et `CLIAnnotateMicrosat()`
|
||||
- **obisummary/obisummary.go** : Contient `ISummary()` et les structures de données
|
||||
|
||||
#### c) Fichiers utilitaires (optionnel)
|
||||
|
||||
Certaines commandes ont des fichiers additionnels pour des fonctionnalités spécifiques.
|
||||
|
||||
**Exemple (obipairing/options.go) :**
|
||||
|
||||
```go
|
||||
// Fonction spéciale pour créer un itérateur de séquences pairées
|
||||
func CLIPairedSequence() (obiiter.IBioSequence, error) {
|
||||
forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
paired := forward.PairTo(reverse)
|
||||
return paired, nil
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Package `obiconvert` - La base commune
|
||||
|
||||
Le package `obiconvert` est spécial car il fournit les fonctionnalités de base utilisées par toutes les autres commandes :
|
||||
|
||||
#### Fonctionnalités fournies :
|
||||
|
||||
1. **Lecture de séquences** (`sequence_reader.go`)
|
||||
- `CLIReadBioSequences()` : lecture depuis fichiers ou stdin
|
||||
- Support de multiples formats (FASTA, FASTQ, EMBL, GenBank, etc.)
|
||||
- Gestion des fichiers multiples
|
||||
- Barre de progression optionnelle
|
||||
|
||||
2. **Écriture de séquences** (`sequence_writer.go`)
|
||||
- `CLIWriteBioSequences()` : écriture vers fichiers ou stdout
|
||||
- Support de multiples formats
|
||||
- Gestion des lectures pairées
|
||||
- Compression optionnelle
|
||||
|
||||
3. **Options communes** (`options.go`)
|
||||
- Options d'entrée (format, skip, etc.)
|
||||
- Options de sortie (format, fichier, compression)
|
||||
- Options de mode (barre de progression, etc.)
|
||||
|
||||
#### Utilisation par les autres commandes :
|
||||
|
||||
Toutes les commandes incluent les options de `obiconvert` via :
|
||||
|
||||
```go
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(false)(options) // false = pas de fichiers pairés
|
||||
MaCommandeOptionSet(options) // Options spécifiques
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Exécutable `cmd/obitools/<commande>/main.go`
|
||||
|
||||
Le fichier `main.go` de chaque commande est volontairement **minimaliste** et suit toujours le même pattern :
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/macommande"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// 1. Configuration optionnelle de paramètres par défaut
|
||||
obidefault.SetBatchSize(10)
|
||||
|
||||
// 2. Génération du parser d'options
|
||||
optionParser := obioptions.GenerateOptionParser(
|
||||
"macommande", // Nom de la commande
|
||||
"description de la commande", // Description
|
||||
macommande.OptionSet) // Fonction de configuration des options
|
||||
|
||||
// 3. Parsing des arguments
|
||||
_, args := optionParser(os.Args)
|
||||
|
||||
// 4. Lecture des séquences d'entrée
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||
|
||||
// 5. Traitement spécifique de la commande
|
||||
resultat := macommande.CLITraitement(sequences)
|
||||
|
||||
// 6. Écriture des résultats
|
||||
obiconvert.CLIWriteBioSequences(resultat, true)
|
||||
|
||||
// 7. Attente de la fin du pipeline
|
||||
obiutils.WaitForLastPipe()
|
||||
}
|
||||
```
|
||||
|
||||
## Patterns architecturaux
|
||||
|
||||
### Pattern 1 : Pipeline de traitement de séquences
|
||||
|
||||
La plupart des commandes suivent ce pattern :
|
||||
|
||||
```
|
||||
Lecture → Traitement → Écriture
|
||||
```
|
||||
|
||||
**Exemples :**
|
||||
- **obiconvert** : Lecture → Écriture (conversion de format)
|
||||
- **obiuniq** : Lecture → Déréplication → Écriture
|
||||
- **obimicrosat** : Lecture → Annotation → Filtrage → Écriture
|
||||
|
||||
### Pattern 2 : Traitement avec entrées multiples
|
||||
|
||||
Certaines commandes acceptent plusieurs fichiers d'entrée :
|
||||
|
||||
**obipairing** :
|
||||
```
|
||||
Lecture Forward + Lecture Reverse → Pairing → Assemblage → Écriture
|
||||
```
|
||||
|
||||
### Pattern 3 : Traitement sans écriture de séquences
|
||||
|
||||
**obisummary** : produit un résumé JSON/YAML au lieu de séquences
|
||||
|
||||
```go
|
||||
func main() {
|
||||
// ... parsing options et lecture ...
|
||||
|
||||
summary := obisummary.ISummary(fs, obisummary.CLIMapSummary())
|
||||
|
||||
// Formatage et affichage direct
|
||||
if obisummary.CLIOutFormat() == "json" {
|
||||
output, _ := json.MarshalIndent(summary, "", " ")
|
||||
fmt.Print(string(output))
|
||||
} else {
|
||||
output, _ := yaml.Marshal(summary)
|
||||
fmt.Print(string(output))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Pattern 4 : Utilisation de Workers
|
||||
|
||||
Les commandes qui transforment des séquences utilisent souvent le pattern Worker :
|
||||
|
||||
```go
|
||||
// Création d'un worker
|
||||
worker := MakeMicrosatWorker(
|
||||
CLIMinUnitLength(),
|
||||
CLIMaxUnitLength(),
|
||||
// ... autres paramètres
|
||||
)
|
||||
|
||||
// Application du worker sur l'itérateur
|
||||
newIter = iterator.MakeIWorker(
|
||||
worker,
|
||||
false, // merge results
|
||||
obidefault.ParallelWorkers() // parallélisation
|
||||
)
|
||||
```
|
||||
|
||||
## Étapes d'implémentation d'une nouvelle commande
|
||||
|
||||
### Étape 1 : Créer le package dans `pkg/obitools/`
|
||||
|
||||
```bash
|
||||
mkdir -p pkg/obitools/macommande
|
||||
```
|
||||
|
||||
### Étape 2 : Créer `options.go`
|
||||
|
||||
```go
|
||||
package macommande
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Variables privées pour les options
|
||||
var _MonOption = "valeur_par_defaut"
|
||||
|
||||
// Configuration des options spécifiques
|
||||
func MaCommandeOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_MonOption, "mon-option", _MonOption,
|
||||
options.Alias("o"),
|
||||
options.Description("Description de l'option"))
|
||||
}
|
||||
|
||||
// OptionSet combine options de base + spécifiques
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(false)(options) // false si pas de fichiers pairés
|
||||
MaCommandeOptionSet(options)
|
||||
}
|
||||
|
||||
// Getters
|
||||
func CLIMonOption() string {
|
||||
return _MonOption
|
||||
}
|
||||
|
||||
// Setters
|
||||
func SetMonOption(value string) {
|
||||
_MonOption = value
|
||||
}
|
||||
```
|
||||
|
||||
### Étape 3 : Créer le fichier d'implémentation
|
||||
|
||||
Créer `macommande.go` (ou un nom plus descriptif) :
|
||||
|
||||
```go
|
||||
package macommande
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// Fonction de traitement principale
|
||||
func CLIMaCommande(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
// Récupération des options
|
||||
option := CLIMonOption()
|
||||
|
||||
// Implémentation du traitement
|
||||
// ...
|
||||
|
||||
return resultat
|
||||
}
|
||||
```
|
||||
|
||||
### Étape 4 : Créer l'exécutable dans `cmd/obitools/`
|
||||
|
||||
```bash
|
||||
mkdir -p cmd/obitools/macommande
|
||||
```
|
||||
|
||||
Créer `main.go` :
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/macommande"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Parser d'options
|
||||
optionParser := obioptions.GenerateOptionParser(
|
||||
"macommande",
|
||||
"Description courte de ma commande",
|
||||
macommande.OptionSet)
|
||||
|
||||
_, args := optionParser(os.Args)
|
||||
|
||||
// Lecture
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||
|
||||
// Traitement
|
||||
resultat := macommande.CLIMaCommande(sequences)
|
||||
|
||||
// Écriture
|
||||
obiconvert.CLIWriteBioSequences(resultat, true)
|
||||
|
||||
// Attente
|
||||
obiutils.WaitForLastPipe()
|
||||
}
|
||||
```
|
||||
|
||||
### Étape 5 : Configurations optionnelles
|
||||
|
||||
Dans `main.go`, avant le parsing des options, on peut configurer :
|
||||
|
||||
```go
|
||||
// Taille des batchs de séquences
|
||||
obidefault.SetBatchSize(10)
|
||||
|
||||
// Nombre de workers en lecture (strict)
|
||||
obidefault.SetStrictReadWorker(2)
|
||||
|
||||
// Nombre de workers en écriture
|
||||
obidefault.SetStrictWriteWorker(2)
|
||||
|
||||
// Désactiver la lecture des qualités
|
||||
obidefault.SetReadQualities(false)
|
||||
```
|
||||
|
||||
### Étape 6 : Gestion des erreurs
|
||||
|
||||
Utiliser les fonctions utilitaires pour les messages d'erreur cohérents :
|
||||
|
||||
```go
|
||||
// Pour les erreurs d'ouverture de fichiers
|
||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||
|
||||
// Pour les erreurs générales
|
||||
if err != nil {
|
||||
log.Errorf("Message d'erreur: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
```
|
||||
|
||||
### Étape 7 : Tests et debugging (optionnel)
|
||||
|
||||
Des commentaires dans le code montrent comment activer le profiling :
|
||||
|
||||
```go
|
||||
// go tool pprof -http=":8000" ./macommande ./cpu.pprof
|
||||
// f, err := os.Create("cpu.pprof")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// pprof.StartCPUProfile(f)
|
||||
// defer pprof.StopCPUProfile()
|
||||
|
||||
// go tool trace cpu.trace
|
||||
// ftrace, err := os.Create("cpu.trace")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// trace.Start(ftrace)
|
||||
// defer trace.Stop()
|
||||
```
|
||||
|
||||
## Bonnes pratiques observées
|
||||
|
||||
### 1. Séparation des responsabilités
|
||||
|
||||
- **`main.go`** : orchestration minimale
|
||||
- **`options.go`** : définition et gestion des options
|
||||
- **Fichiers d'implémentation** : logique métier
|
||||
|
||||
### 2. Convention de nommage cohérente
|
||||
|
||||
- Variables d'options : `_NomOption`
|
||||
- Getters CLI : `CLINomOption()`
|
||||
- Setters : `SetNomOption()`
|
||||
- Fonctions de traitement CLI : `CLITraitement()`
|
||||
|
||||
### 3. Réutilisation du code
|
||||
|
||||
- Toutes les commandes réutilisent `obiconvert` pour l'I/O
|
||||
- Les options communes sont partagées
|
||||
- Les fonctions utilitaires sont centralisées
|
||||
|
||||
### 4. Configuration par défaut
|
||||
|
||||
Les valeurs par défaut sont :
|
||||
- Définies lors de l'initialisation des variables
|
||||
- Modifiables via les options CLI
|
||||
- Modifiables programmatiquement via les setters
|
||||
|
||||
### 5. Gestion des formats
|
||||
|
||||
Support automatique de multiples formats :
|
||||
- FASTA / FASTQ (avec compression gzip)
|
||||
- EMBL / GenBank
|
||||
- ecoPCR
|
||||
- CSV
|
||||
- JSON (avec différents formats d'en-têtes)
|
||||
|
||||
### 6. Parallélisation
|
||||
|
||||
Les commandes utilisent les workers parallèles via :
|
||||
- `obidefault.ParallelWorkers()`
|
||||
- `obidefault.SetStrictReadWorker(n)`
|
||||
- `obidefault.SetStrictWriteWorker(n)`
|
||||
|
||||
### 7. Logging cohérent
|
||||
|
||||
Utilisation de `logrus` pour tous les logs :
|
||||
```go
|
||||
log.Printf("Message informatif")
|
||||
log.Errorf("Message d'erreur: %v", err)
|
||||
log.Fatal(err) // Arrêt du programme
|
||||
```
|
||||
|
||||
## Dépendances principales
|
||||
|
||||
### Packages internes OBITools
|
||||
|
||||
- `pkg/obidefault` : valeurs par défaut et configuration globale
|
||||
- `pkg/obioptions` : génération du parser d'options
|
||||
- `pkg/obiiter` : itérateurs de séquences biologiques
|
||||
- `pkg/obiseq` : structures et fonctions pour séquences biologiques
|
||||
- `pkg/obiformats` : lecture/écriture de différents formats
|
||||
- `pkg/obiutils` : fonctions utilitaires diverses
|
||||
- `pkg/obichunk` : traitement par chunks (pour dereplication, etc.)
|
||||
|
||||
### Packages externes
|
||||
|
||||
- `github.com/DavidGamba/go-getoptions` : parsing des options CLI
|
||||
- `github.com/sirupsen/logrus` : logging structuré
|
||||
- `gopkg.in/yaml.v3` : encodage/décodage YAML
|
||||
- `github.com/dlclark/regexp2` : expressions régulières avancées
|
||||
|
||||
## Cas spéciaux
|
||||
|
||||
### Commande avec fichiers pairés (obipairing)
|
||||
|
||||
```go
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OutputOptionSet(options)
|
||||
obiconvert.InputOptionSet(options)
|
||||
PairingOptionSet(options) // Options spécifiques au pairing
|
||||
}
|
||||
|
||||
func CLIPairedSequence() (obiiter.IBioSequence, error) {
|
||||
forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
|
||||
// ...
|
||||
reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
|
||||
// ...
|
||||
paired := forward.PairTo(reverse)
|
||||
return paired, nil
|
||||
}
|
||||
```
|
||||
|
||||
Dans `main.go` :
|
||||
```go
|
||||
pairs, err := obipairing.CLIPairedSequence() // Lecture spéciale
|
||||
if err != nil {
|
||||
log.Errorf("Cannot open file (%v)", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
paired := obipairing.IAssemblePESequencesBatch(
|
||||
pairs,
|
||||
obipairing.CLIGapPenality(),
|
||||
// ... autres paramètres
|
||||
)
|
||||
```
|
||||
|
||||
### Commande sans sortie de séquences (obisummary)
|
||||
|
||||
Au lieu de `obiconvert.CLIWriteBioSequences()`, affichage direct :
|
||||
|
||||
```go
|
||||
summary := obisummary.ISummary(fs, obisummary.CLIMapSummary())
|
||||
|
||||
if obisummary.CLIOutFormat() == "json" {
|
||||
output, _ := json.MarshalIndent(summary, "", " ")
|
||||
fmt.Print(string(output))
|
||||
} else {
|
||||
output, _ := yaml.Marshal(summary)
|
||||
fmt.Print(string(output))
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
```
|
||||
|
||||
### Commande avec Workers personnalisés (obimicrosat)
|
||||
|
||||
```go
|
||||
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
// Création du worker
|
||||
worker := MakeMicrosatWorker(
|
||||
CLIMinUnitLength(),
|
||||
CLIMaxUnitLength(),
|
||||
CLIMinUnitCount(),
|
||||
CLIMinLength(),
|
||||
CLIMinFlankLength(),
|
||||
CLIReoriented(),
|
||||
)
|
||||
|
||||
// Application du worker
|
||||
newIter := iterator.MakeIWorker(
|
||||
worker,
|
||||
false, // pas de merge
|
||||
obidefault.ParallelWorkers(), // parallélisation
|
||||
)
|
||||
|
||||
return newIter.FilterEmpty() // Filtrage des résultats vides
|
||||
}
|
||||
```
|
||||
|
||||
## Diagramme de flux d'exécution
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ cmd/obitools/macommande/main.go │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ 1. Génération du parser d'options │
|
||||
│ obioptions.GenerateOptionParser( │
|
||||
│ "macommande", │
|
||||
│ "description", │
|
||||
│ macommande.OptionSet) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ pkg/obitools/macommande/options.go │
|
||||
│ ┌─────────────────────────────────────────────────────┐ │
|
||||
│ │ func OptionSet(options *getoptions.GetOpt) │ │
|
||||
│ │ obiconvert.OptionSet(false)(options) ───────────┐ │ │
|
||||
│ │ MaCommandeOptionSet(options) │ │ │
|
||||
│ └───────────────────────────────────────────────────┼─┘ │
|
||||
└────────────────────────────────────────────────────────┼─────┘
|
||||
│ │
|
||||
│ │
|
||||
┌─────────────┘ │
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────────────────────┐ ┌───────────────────────────────┐
|
||||
│ 2. Parsing des arguments │ │ pkg/obitools/obiconvert/ │
|
||||
│ _, args := optionParser(...) │ │ options.go │
|
||||
└─────────────────────────────────┘ │ - InputOptionSet() │
|
||||
│ │ - OutputOptionSet() │
|
||||
▼ │ - PairedFilesOptionSet() │
|
||||
┌─────────────────────────────────┐ └───────────────────────────────┘
|
||||
│ 3. Lecture des séquences │
|
||||
│ CLIReadBioSequences(args) │
|
||||
└─────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ pkg/obitools/obiconvert/sequence_reader.go │
|
||||
│ - ExpandListOfFiles() │
|
||||
│ - ReadSequencesFromFile() / ReadSequencesFromStdin() │
|
||||
│ - Support: FASTA, FASTQ, EMBL, GenBank, ecoPCR, CSV │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼ obiiter.IBioSequence
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ 4. Traitement spécifique │
|
||||
│ macommande.CLITraitement(sequences) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ pkg/obitools/macommande/<implementation>.go │
|
||||
│ - Récupération des options via CLI*() getters │
|
||||
│ - Application de la logique métier │
|
||||
│ - Retour d'un nouvel iterator │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼ obiiter.IBioSequence
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ 5. Écriture des résultats │
|
||||
│ CLIWriteBioSequences(resultat, true) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ pkg/obitools/obiconvert/sequence_writer.go │
|
||||
│ - WriteSequencesToFile() / WriteSequencesToStdout() │
|
||||
│ - Support: FASTA, FASTQ, JSON │
|
||||
│ - Gestion des lectures pairées │
|
||||
│ - Compression optionnelle │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ 6. Attente de fin du pipeline │
|
||||
│ obiutils.WaitForLastPipe() │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Conclusion
|
||||
|
||||
L'architecture des commandes OBITools est conçue pour :
|
||||
|
||||
1. **Maximiser la réutilisation** : `obiconvert` fournit les fonctionnalités communes
|
||||
2. **Simplifier l'ajout de nouvelles commandes** : pattern standardisé et minimaliste
|
||||
3. **Faciliter la maintenance** : séparation claire des responsabilités
|
||||
4. **Garantir la cohérence** : conventions de nommage et structure uniforme
|
||||
5. **Optimiser les performances** : parallélisation intégrée et traitement par batch
|
||||
|
||||
Cette architecture modulaire permet de créer rapidement de nouvelles commandes tout en maintenant une qualité et une cohérence élevées dans toute la suite OBITools.
|
||||
99
blackboard/architechture/definition-superkmer.md
Normal file
99
blackboard/architechture/definition-superkmer.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# Définition du super k-mer
|
||||
|
||||
## Définition
|
||||
|
||||
Un **super k-mer** est une **sous-séquence MAXIMALE** d'une séquence dans laquelle **tous les k-mers consécutifs partagent le même minimiseur**.
|
||||
|
||||
### Termes
|
||||
|
||||
- **k-mer** : sous-séquence de longueur k
|
||||
- **minimiseur** : le plus petit m-mer canonique parmi tous les m-mers d'un k-mer
|
||||
- **k-mers consécutifs** : k-mers aux positions i et i+1 (chevauchement de k-1 nucléotides)
|
||||
- **MAXIMALE** : ne peut être étendue ni à gauche ni à droite
|
||||
|
||||
## RÈGLES ABSOLUES
|
||||
|
||||
### RÈGLE 1 : Longueur minimum = k
|
||||
|
||||
Un super k-mer contient au minimum k nucléotides.
|
||||
|
||||
```
|
||||
longueur(super-kmer) >= k
|
||||
```
|
||||
|
||||
### RÈGLE 2 : Chevauchement obligatoire = k-1
|
||||
|
||||
Deux super-kmers consécutifs se chevauchent d'EXACTEMENT k-1 nucléotides.
|
||||
|
||||
```
|
||||
SK1.End - SK2.Start = k - 1
|
||||
```
|
||||
|
||||
### RÈGLE 3 : Bijection séquence ↔ minimiseur
|
||||
|
||||
Une séquence de super k-mer a UN et UN SEUL minimiseur.
|
||||
|
||||
```
|
||||
Même séquence → Même minimiseur (TOUJOURS)
|
||||
```
|
||||
|
||||
**Si vous observez la même séquence avec deux minimiseurs différents, c'est un BUG.**
|
||||
|
||||
### RÈGLE 4 : Tous les k-mers partagent le minimiseur
|
||||
|
||||
TOUS les k-mers contenus dans un super k-mer ont le même minimiseur.
|
||||
|
||||
```
|
||||
∀ k-mer K dans SK : minimiseur(K) = SK.minimizer
|
||||
```
|
||||
|
||||
### RÈGLE 5 : Maximalité
|
||||
|
||||
Un super k-mer ne peut pas être étendu.
|
||||
|
||||
- Si on ajoute un nucléotide à gauche : le nouveau k-mer a un minimiseur différent
|
||||
- Si on ajoute un nucléotide à droite : le nouveau k-mer a un minimiseur différent
|
||||
|
||||
## VIOLATIONS INTERDITES
|
||||
|
||||
❌ **Super k-mer de longueur < k**
|
||||
❌ **Chevauchement ≠ k-1 entre consécutifs**
|
||||
❌ **Même séquence avec minimiseurs différents**
|
||||
❌ **K-mer dans le super k-mer avec minimiseur différent**
|
||||
❌ **Super k-mer extensible (non-maximal)**
|
||||
|
||||
## CONSÉQUENCES PRATIQUES
|
||||
|
||||
### Pour l'extraction
|
||||
|
||||
L'algorithme doit :
|
||||
1. Calculer le minimiseur de chaque k-mer
|
||||
2. Découper quand le minimiseur change
|
||||
3. Assigner au super k-mer le minimiseur commun à tous ses k-mers
|
||||
4. Garantir que chaque super k-mer contient au moins k nucléotides
|
||||
5. Garantir le chevauchement de k-1 entre consécutifs
|
||||
|
||||
### Pour la validation
|
||||
|
||||
Si après déduplication (obiuniq) on observe :
|
||||
```
|
||||
Séquence: ACGT...
|
||||
Minimiseurs: {M1, M2} // plusieurs minimiseurs
|
||||
```
|
||||
|
||||
C'est la PREUVE d'un bug : l'algorithme a produit cette séquence avec des minimiseurs différents, ce qui viole la RÈGLE 3.
|
||||
|
||||
## DIAGNOSTIC DU BUG
|
||||
|
||||
**Bug observé** : Même séquence avec minimiseurs différents après obiuniq
|
||||
|
||||
**Cause possible** : L'algorithme assigne le mauvais minimiseur OU découpe mal les super-kmers
|
||||
|
||||
**Ce que le bug NE PEUT PAS être** :
|
||||
- Un problème d'obiuniq (révèle le bug, ne le crée pas)
|
||||
- Un problème de chevauchement légitime (k-1 est correct)
|
||||
|
||||
**Ce que le bug DOIT être** :
|
||||
- Minimiseur mal calculé ou mal assigné
|
||||
- Découpage incorrect (mauvais endPos)
|
||||
- Copie incorrecte des données
|
||||
316
blackboard/architechture/guide-redaction-obitest.md
Normal file
316
blackboard/architechture/guide-redaction-obitest.md
Normal file
@@ -0,0 +1,316 @@
|
||||
# Guide de rédaction d'un obitest
|
||||
|
||||
## Règles essentielles
|
||||
|
||||
1. **Données < 1 KB** - Fichiers de test très petits
|
||||
2. **Exécution < 10 sec** - Tests rapides pour CI/CD
|
||||
3. **Auto-contenu** - Pas de dépendances externes
|
||||
4. **Auto-nettoyage** - Pas de fichiers résiduels
|
||||
|
||||
## Structure minimale
|
||||
|
||||
```
|
||||
obitests/obitools/<commande>/
|
||||
├── test.sh # Script exécutable
|
||||
└── data.fasta # Données minimales (optionnel)
|
||||
```
|
||||
|
||||
## Template de test.sh
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
TEST_NAME=<commande>
|
||||
CMD=<commande>
|
||||
|
||||
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() {
|
||||
echo "========================================" 1>&2
|
||||
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||
echo 1>&2
|
||||
echo "- $ntest tests run" 1>&2
|
||||
echo "- $success successfully completed" 1>&2
|
||||
echo "- $failed failed tests" 1>&2
|
||||
echo 1>&2
|
||||
echo "Cleaning up the temporary directory..." 1>&2
|
||||
echo 1>&2
|
||||
echo "========================================" 1>&2
|
||||
|
||||
rm -rf "$TMPDIR"
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
log "$TEST_NAME tests failed"
|
||||
log
|
||||
log
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log
|
||||
log
|
||||
exit 0
|
||||
}
|
||||
|
||||
log() {
|
||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||
}
|
||||
|
||||
log "Testing $TEST_NAME..."
|
||||
log "Test directory is $TEST_DIR"
|
||||
log "obitools directory is $OBITOOLS_DIR"
|
||||
log "Temporary directory is $TMPDIR"
|
||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
|
||||
########## TESTS ##########
|
||||
|
||||
# Test 1: Help (OBLIGATOIRE)
|
||||
((ntest++))
|
||||
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
||||
then
|
||||
log "$MCMD: printing help OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: printing help failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Ajoutez vos tests ici...
|
||||
|
||||
###########################
|
||||
|
||||
cleanup
|
||||
```
|
||||
|
||||
## Pattern de test
|
||||
|
||||
```bash
|
||||
((ntest++))
|
||||
if commande args > "${TMPDIR}/output.txt" 2>&1
|
||||
then
|
||||
log "$MCMD: description OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: description failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
## Tests courants
|
||||
|
||||
### Exécution basique
|
||||
```bash
|
||||
((ntest++))
|
||||
if $CMD "${TEST_DIR}/input.fasta" > "${TMPDIR}/output.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: basic execution OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: basic execution failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
### Sortie non vide
|
||||
```bash
|
||||
((ntest++))
|
||||
if [ -s "${TMPDIR}/output.fasta" ]
|
||||
then
|
||||
log "$MCMD: output not empty OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: output empty - failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
### Comptage
|
||||
```bash
|
||||
((ntest++))
|
||||
count=$(grep -c "^>" "${TMPDIR}/output.fasta")
|
||||
if [ "$count" -gt 0 ]
|
||||
then
|
||||
log "$MCMD: extracted $count sequences OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: no sequences - failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
### Présence de contenu
|
||||
```bash
|
||||
((ntest++))
|
||||
if grep -q "expected_string" "${TMPDIR}/output.fasta"
|
||||
then
|
||||
log "$MCMD: expected content found OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: content not found - failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
### Comparaison avec référence
|
||||
```bash
|
||||
((ntest++))
|
||||
if diff "${TEST_DIR}/expected.fasta" "${TMPDIR}/output.fasta" > /dev/null
|
||||
then
|
||||
log "$MCMD: matches reference OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: differs from reference - failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
### Test avec options
|
||||
```bash
|
||||
((ntest++))
|
||||
if $CMD --opt value "${TEST_DIR}/input.fasta" > "${TMPDIR}/out.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: with option OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: with option failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
## Variables importantes
|
||||
|
||||
- **TEST_DIR** - Répertoire du test (données d'entrée)
|
||||
- **TMPDIR** - Répertoire temporaire (sorties)
|
||||
- **CMD** - Nom de la commande
|
||||
- **MCMD** - Nom formaté pour les logs
|
||||
|
||||
## Règles d'or
|
||||
|
||||
✅ **Entrées** → `${TEST_DIR}/`
|
||||
✅ **Sorties** → `${TMPDIR}/`
|
||||
✅ **Toujours rediriger** → `> file 2>&1`
|
||||
✅ **Incrémenter ntest** → Avant chaque test
|
||||
✅ **Messages clairs** → Descriptions explicites
|
||||
|
||||
❌ **Pas de chemins en dur**
|
||||
❌ **Pas de /tmp direct**
|
||||
❌ **Pas de sortie vers TEST_DIR**
|
||||
❌ **Pas de commandes sans redirection**
|
||||
|
||||
## Données de test
|
||||
|
||||
Créer un fichier minimal (< 500 bytes) :
|
||||
|
||||
```fasta
|
||||
>seq1
|
||||
ACGTACGTACGTACGT
|
||||
>seq2
|
||||
AAAACCCCGGGGTTTT
|
||||
>seq3
|
||||
ATCGATCGATCGATCG
|
||||
```
|
||||
|
||||
## Création rapide
|
||||
|
||||
```bash
|
||||
# 1. Créer le répertoire
|
||||
mkdir -p obitests/obitools/<commande>
|
||||
cd obitests/obitools/<commande>
|
||||
|
||||
# 2. Créer les données de test
|
||||
cat > test_data.fasta << 'EOF'
|
||||
>seq1
|
||||
ACGTACGTACGTACGT
|
||||
>seq2
|
||||
AAAACCCCGGGGTTTT
|
||||
EOF
|
||||
|
||||
# 3. Copier le template dans test.sh
|
||||
# 4. Adapter le TEST_NAME et CMD
|
||||
# 5. Ajouter les tests
|
||||
# 6. Rendre exécutable
|
||||
chmod +x test.sh
|
||||
|
||||
# 7. Tester
|
||||
./test.sh
|
||||
```
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] `test.sh` exécutable (`chmod +x`)
|
||||
- [ ] Test d'aide inclus
|
||||
- [ ] Données < 1 KB
|
||||
- [ ] Sorties vers `${TMPDIR}/`
|
||||
- [ ] Entrées depuis `${TEST_DIR}/`
|
||||
- [ ] Redirections `2>&1`
|
||||
- [ ] Messages clairs
|
||||
- [ ] Testé localement
|
||||
- [ ] Exit code 0 si succès
|
||||
|
||||
## Debug
|
||||
|
||||
Conserver TMPDIR pour inspection :
|
||||
```bash
|
||||
cleanup() {
|
||||
echo "Temporary directory: $TMPDIR" 1>&2
|
||||
# rm -rf "$TMPDIR" # Commenté
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
Mode verbose :
|
||||
```bash
|
||||
set -x # Au début du script
|
||||
```
|
||||
|
||||
## Exemples
|
||||
|
||||
**Simple (1 test)** - obimicrosat
|
||||
```bash
|
||||
# Juste l'aide
|
||||
```
|
||||
|
||||
**Moyen (4-5 tests)** - obisuperkmer
|
||||
```bash
|
||||
# Aide + exécution + validation sortie + contenu
|
||||
```
|
||||
|
||||
**Complet (7+ tests)** - obiuniq
|
||||
```bash
|
||||
# Aide + exécution + comparaison CSV + options + multiples cas
|
||||
```
|
||||
|
||||
## Commandes utiles
|
||||
|
||||
```bash
|
||||
# Compter séquences
|
||||
grep -c "^>" file.fasta
|
||||
|
||||
# Fichier non vide
|
||||
[ -s file ]
|
||||
|
||||
# Comparer
|
||||
diff file1 file2 > /dev/null
|
||||
|
||||
# Comparer compressés
|
||||
zdiff file1.gz file2.gz
|
||||
|
||||
# Compter bases
|
||||
grep -v "^>" file | tr -d '\n' | wc -c
|
||||
```
|
||||
|
||||
## Ce qu'il faut retenir
|
||||
|
||||
Un bon test est **COURT**, **RAPIDE** et **SIMPLE** :
|
||||
- 3-10 tests maximum
|
||||
- Données < 1 KB
|
||||
- Exécution < 10 secondes
|
||||
- Pattern standard respecté
|
||||
268
blackboard/architechture/obisuperkmer-implementation.md
Normal file
268
blackboard/architechture/obisuperkmer-implementation.md
Normal file
@@ -0,0 +1,268 @@
|
||||
# Implémentation de la commande obisuperkmer
|
||||
|
||||
## Vue d'ensemble
|
||||
|
||||
La commande `obisuperkmer` a été implémentée en suivant l'architecture standard des commandes OBITools décrite dans `architecture-commande-obitools.md`. Cette commande permet d'extraire les super k-mers de fichiers de séquences biologiques.
|
||||
|
||||
## Qu'est-ce qu'un super k-mer ?
|
||||
|
||||
Un super k-mer est une sous-séquence maximale dans laquelle tous les k-mers consécutifs partagent le même minimiseur. Cette décomposition est utile pour :
|
||||
- L'indexation efficace de k-mers
|
||||
- La réduction de la redondance dans les analyses
|
||||
- L'optimisation de la mémoire pour les structures de données de k-mers
|
||||
|
||||
## Structure de l'implémentation
|
||||
|
||||
### 1. Package `pkg/obitools/obisuperkmer/`
|
||||
|
||||
Le package contient trois fichiers :
|
||||
|
||||
#### `obisuperkmer.go`
|
||||
Documentation du package avec une description de son rôle.
|
||||
|
||||
#### `options.go`
|
||||
Définit les options de ligne de commande :
|
||||
|
||||
```go
|
||||
var _KmerSize = 21 // Taille des k-mers (par défaut 21)
|
||||
var _MinimizerSize = 11 // Taille des minimiseurs (par défaut 11)
|
||||
```
|
||||
|
||||
**Options CLI disponibles :**
|
||||
- `--kmer-size` / `-k` : Taille des k-mers (entre m+1 et 31)
|
||||
- `--minimizer-size` / `-m` : Taille des minimiseurs (entre 1 et k-1)
|
||||
|
||||
**Fonctions d'accès :**
|
||||
- `CLIKmerSize()` : retourne la taille des k-mers
|
||||
- `CLIMinimizerSize()` : retourne la taille des minimiseurs
|
||||
- `SetKmerSize(k int)` : définit la taille des k-mers
|
||||
- `SetMinimizerSize(m int)` : définit la taille des minimiseurs
|
||||
|
||||
#### `superkmer.go`
|
||||
Implémente la logique de traitement :
|
||||
|
||||
```go
|
||||
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence
|
||||
```
|
||||
|
||||
Cette fonction :
|
||||
1. Récupère les paramètres k et m depuis les options CLI
|
||||
2. Valide les paramètres (m < k, k <= 31, etc.)
|
||||
3. Crée un worker utilisant `obikmer.SuperKmerWorker(k, m)`
|
||||
4. Applique le worker en parallèle sur l'itérateur de séquences
|
||||
5. Retourne un itérateur de super k-mers
|
||||
|
||||
### 2. Exécutable `cmd/obitools/obisuperkmer/main.go`
|
||||
|
||||
L'exécutable suit le pattern standard minimal :
|
||||
|
||||
```go
|
||||
func main() {
|
||||
// 1. Génération du parser d'options
|
||||
optionParser := obioptions.GenerateOptionParser(
|
||||
"obisuperkmer",
|
||||
"extract super k-mers from sequence files",
|
||||
obisuperkmer.OptionSet)
|
||||
|
||||
// 2. Parsing des arguments
|
||||
_, args := optionParser(os.Args)
|
||||
|
||||
// 3. Lecture des séquences
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||
|
||||
// 4. Extraction des super k-mers
|
||||
superkmers := obisuperkmer.CLIExtractSuperKmers(sequences)
|
||||
|
||||
// 5. Écriture des résultats
|
||||
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||
|
||||
// 6. Attente de la fin du pipeline
|
||||
obiutils.WaitForLastPipe()
|
||||
}
|
||||
```
|
||||
|
||||
## Utilisation du package `obikmer`
|
||||
|
||||
L'implémentation s'appuie sur le package `obikmer` qui fournit :
|
||||
|
||||
### `SuperKmerWorker(k int, m int) obiseq.SeqWorker`
|
||||
|
||||
Crée un worker qui :
|
||||
- Extrait les super k-mers d'une BioSequence
|
||||
- Retourne une slice de BioSequence, une par super k-mer
|
||||
- Chaque super k-mer contient les attributs suivants :
|
||||
|
||||
```go
|
||||
// Métadonnées ajoutées à chaque super k-mer :
|
||||
{
|
||||
"minimizer_value": uint64, // Valeur canonique du minimiseur
|
||||
"minimizer_seq": string, // Séquence ADN du minimiseur
|
||||
"k": int, // Taille des k-mers utilisée
|
||||
"m": int, // Taille des minimiseurs utilisée
|
||||
"start": int, // Position de début (0-indexé)
|
||||
"end": int, // Position de fin (exclusif)
|
||||
"parent_id": string, // ID de la séquence parente
|
||||
}
|
||||
```
|
||||
|
||||
### Algorithme sous-jacent
|
||||
|
||||
Le package `obikmer` utilise :
|
||||
- `IterSuperKmers(seq []byte, k int, m int)` : itérateur sur les super k-mers
|
||||
- Une deque monotone pour suivre les minimiseurs dans une fenêtre glissante
|
||||
- Complexité temporelle : O(n) où n est la longueur de la séquence
|
||||
- Complexité spatiale : O(k-m+1) pour la deque
|
||||
|
||||
## Exemple d'utilisation
|
||||
|
||||
### Ligne de commande
|
||||
|
||||
```bash
|
||||
# Extraction avec paramètres par défaut (k=21, m=11)
|
||||
obisuperkmer sequences.fasta > superkmers.fasta
|
||||
|
||||
# Spécifier les tailles de k-mers et minimiseurs
|
||||
obisuperkmer -k 25 -m 13 sequences.fasta -o superkmers.fasta
|
||||
|
||||
# Avec plusieurs fichiers d'entrée
|
||||
obisuperkmer --kmer-size 31 --minimizer-size 15 file1.fasta file2.fasta > output.fasta
|
||||
|
||||
# Format FASTQ en entrée, FASTA en sortie
|
||||
obisuperkmer sequences.fastq --fasta-output -o superkmers.fasta
|
||||
|
||||
# Avec compression
|
||||
obisuperkmer sequences.fasta -o superkmers.fasta.gz --compress
|
||||
```
|
||||
|
||||
### Exemple de sortie
|
||||
|
||||
Pour une séquence d'entrée :
|
||||
```
|
||||
>seq1
|
||||
ACGTACGTACGTACGTACGTACGT
|
||||
```
|
||||
|
||||
La sortie contiendra plusieurs super k-mers :
|
||||
```
|
||||
>seq1_superkmer_0_15 {"minimizer_value":123456,"minimizer_seq":"acgtacgt","k":21,"m":11,"start":0,"end":15,"parent_id":"seq1"}
|
||||
ACGTACGTACGTACG
|
||||
>seq1_superkmer_8_24 {"minimizer_value":789012,"minimizer_seq":"gtacgtac","k":21,"m":11,"start":8,"end":24,"parent_id":"seq1"}
|
||||
TACGTACGTACGTACGT
|
||||
```
|
||||
|
||||
## Options héritées de `obiconvert`
|
||||
|
||||
La commande hérite de toutes les options standard d'OBITools :
|
||||
|
||||
### Options d'entrée
|
||||
- `--fasta` : forcer le format FASTA
|
||||
- `--fastq` : forcer le format FASTQ
|
||||
- `--ecopcr` : format ecoPCR
|
||||
- `--embl` : format EMBL
|
||||
- `--genbank` : format GenBank
|
||||
- `--input-json-header` : en-têtes JSON
|
||||
- `--input-OBI-header` : en-têtes OBI
|
||||
|
||||
### Options de sortie
|
||||
- `--out` / `-o` : fichier de sortie (défaut : stdout)
|
||||
- `--fasta-output` : sortie en format FASTA
|
||||
- `--fastq-output` : sortie en format FASTQ
|
||||
- `--json-output` : sortie en format JSON
|
||||
- `--output-json-header` : en-têtes JSON en sortie
|
||||
- `--output-OBI-header` / `-O` : en-têtes OBI en sortie
|
||||
- `--compress` / `-Z` : compression gzip
|
||||
- `--skip-empty` : ignorer les séquences vides
|
||||
- `--no-progressbar` : désactiver la barre de progression
|
||||
|
||||
## Compilation
|
||||
|
||||
Pour compiler la commande :
|
||||
|
||||
```bash
|
||||
cd /chemin/vers/obitools4
|
||||
go build -o bin/obisuperkmer ./cmd/obitools/obisuperkmer/
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Pour tester la commande :
|
||||
|
||||
```bash
|
||||
# Créer un fichier de test
|
||||
echo -e ">test\nACGTACGTACGTACGTACGTACGTACGTACGT" > test.fasta
|
||||
|
||||
# Exécuter obisuperkmer
|
||||
obisuperkmer test.fasta
|
||||
|
||||
# Vérifier avec des paramètres différents
|
||||
obisuperkmer -k 15 -m 7 test.fasta
|
||||
```
|
||||
|
||||
## Validation des paramètres
|
||||
|
||||
La commande valide automatiquement :
|
||||
- `1 <= m < k` : le minimiseur doit être plus petit que le k-mer
|
||||
- `2 <= k <= 31` : contrainte du codage sur 64 bits
|
||||
- `len(sequence) >= k` : la séquence doit être assez longue
|
||||
|
||||
En cas de paramètres invalides, la commande affiche une erreur explicite et s'arrête.
|
||||
|
||||
## Intégration avec le pipeline OBITools
|
||||
|
||||
La commande s'intègre naturellement dans les pipelines OBITools :
|
||||
|
||||
```bash
|
||||
# Pipeline complet d'analyse
|
||||
obiconvert sequences.fastq --fasta-output | \
|
||||
obisuperkmer -k 21 -m 11 | \
|
||||
obiuniq | \
|
||||
obigrep -p "minimizer_value>1000" > filtered_superkmers.fasta
|
||||
```
|
||||
|
||||
## Parallélisation
|
||||
|
||||
La commande utilise automatiquement :
|
||||
- `obidefault.ParallelWorkers()` pour le traitement parallèle
|
||||
- Les workers sont distribués sur les séquences d'entrée
|
||||
- La parallélisation est transparente pour l'utilisateur
|
||||
|
||||
## Conformité avec l'architecture OBITools
|
||||
|
||||
L'implémentation respecte tous les principes de l'architecture :
|
||||
|
||||
✅ Séparation des responsabilités (package + commande)
|
||||
✅ Convention de nommage cohérente (CLI*, Set*, _variables)
|
||||
✅ Réutilisation de `obiconvert` pour l'I/O
|
||||
✅ Options standard partagées
|
||||
✅ Pattern Worker pour le traitement
|
||||
✅ Validation des paramètres
|
||||
✅ Logging avec `logrus`
|
||||
✅ Gestion d'erreurs cohérente
|
||||
✅ Documentation complète
|
||||
|
||||
## Fichiers créés
|
||||
|
||||
```
|
||||
pkg/obitools/obisuperkmer/
|
||||
├── obisuperkmer.go # Documentation du package
|
||||
├── options.go # Définition des options CLI
|
||||
└── superkmer.go # Implémentation du traitement
|
||||
|
||||
cmd/obitools/obisuperkmer/
|
||||
└── main.go # Point d'entrée de la commande
|
||||
```
|
||||
|
||||
## Prochaines étapes
|
||||
|
||||
1. **Compilation** : Compiler la commande avec `go build`
|
||||
2. **Tests unitaires** : Créer des tests dans `pkg/obitools/obisuperkmer/superkmer_test.go`
|
||||
3. **Documentation utilisateur** : Ajouter la documentation de la commande
|
||||
4. **Intégration CI/CD** : Ajouter aux tests d'intégration
|
||||
5. **Benchmarks** : Mesurer les performances sur différents jeux de données
|
||||
|
||||
## Références
|
||||
|
||||
- Architecture des commandes OBITools : `architecture-commande-obitools.md`
|
||||
- Package `obikmer` : `pkg/obikmer/`
|
||||
- Tests du package : `pkg/obikmer/superkmer_iter_test.go`
|
||||
440
blackboard/architechture/obisuperkmer-tests.md
Normal file
440
blackboard/architechture/obisuperkmer-tests.md
Normal file
@@ -0,0 +1,440 @@
|
||||
# Tests automatisés pour obisuperkmer
|
||||
|
||||
## Vue d'ensemble
|
||||
|
||||
Des tests automatisés ont été créés pour la commande `obisuperkmer` dans le répertoire `obitests/obitools/obisuperkmer/`. Ces tests suivent le pattern standard utilisé par toutes les commandes OBITools et sont conçus pour être exécutés dans un environnement CI/CD.
|
||||
|
||||
## Fichiers créés
|
||||
|
||||
```
|
||||
obitests/obitools/obisuperkmer/
|
||||
├── test.sh # Script de test principal (6.7 KB)
|
||||
├── test_sequences.fasta # Données de test (117 bytes)
|
||||
└── README.md # Documentation (4.1 KB)
|
||||
```
|
||||
|
||||
### Taille totale : ~11 KB
|
||||
|
||||
Cette taille minimale est idéale pour un dépôt Git et des tests CI/CD rapides.
|
||||
|
||||
## Jeu de données de test
|
||||
|
||||
### Fichier : `test_sequences.fasta` (117 bytes)
|
||||
|
||||
Le fichier contient 3 séquences de 32 nucléotides chacune :
|
||||
|
||||
```fasta
|
||||
>seq1
|
||||
ACGTACGTACGTACGTACGTACGTACGTACGT
|
||||
>seq2
|
||||
AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
|
||||
>seq3
|
||||
ATCGATCGATCGATCGATCGATCGATCGATCG
|
||||
```
|
||||
|
||||
#### Justification du choix
|
||||
|
||||
1. **seq1** : Motif répétitif simple (ACGT)
|
||||
- Teste l'extraction de super k-mers sur une séquence avec faible complexité
|
||||
- Les minimiseurs devraient être assez réguliers
|
||||
|
||||
2. **seq2** : Blocs homopolymères
|
||||
- Teste le comportement avec des régions de très faible complexité
|
||||
- Les minimiseurs varieront entre les blocs A, C, G et T
|
||||
|
||||
3. **seq3** : Motif différent (ATCG)
|
||||
- Teste la diversité des super k-mers extraits
|
||||
- Différent de seq1 pour vérifier la distinction
|
||||
|
||||
#### Caractéristiques
|
||||
|
||||
- **Longueur** : 32 nucléotides par séquence
|
||||
- **Taille totale** : 96 nucléotides (3 × 32)
|
||||
- **Format** : FASTA avec en-têtes JSON compatibles
|
||||
- **Alphabet** : A, C, G, T uniquement (pas de bases ambiguës)
|
||||
- **Taille du fichier** : 117 bytes
|
||||
|
||||
Avec k=21 (défaut), chaque séquence de 32 bp peut produire :
|
||||
- 32 - 21 + 1 = 12 k-mers
|
||||
- Plusieurs super k-mers selon les minimiseurs
|
||||
|
||||
## Script de test : `test.sh`
|
||||
|
||||
### Structure
|
||||
|
||||
Le script suit le pattern standard OBITools :
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
TEST_NAME=obisuperkmer
|
||||
CMD=obisuperkmer
|
||||
|
||||
# Variables et fonctions standard
|
||||
TEST_DIR="..."
|
||||
OBITOOLS_DIR="..."
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() { ... }
|
||||
log() { ... }
|
||||
|
||||
# Tests (12 au total)
|
||||
# ...
|
||||
|
||||
cleanup
|
||||
```
|
||||
|
||||
### Tests implémentés
|
||||
|
||||
#### 1. Test d'aide (`-h`)
|
||||
```bash
|
||||
obisuperkmer -h
|
||||
```
|
||||
Vérifie que la commande peut afficher son aide sans erreur.
|
||||
|
||||
#### 2. Extraction basique avec paramètres par défaut
|
||||
```bash
|
||||
obisuperkmer test_sequences.fasta > output_default.fasta
|
||||
```
|
||||
Teste l'exécution avec k=21, m=11 (défaut).
|
||||
|
||||
#### 3. Vérification de sortie non vide
|
||||
```bash
|
||||
[ -s output_default.fasta ]
|
||||
```
|
||||
S'assure que la commande produit un résultat.
|
||||
|
||||
#### 4. Comptage des super k-mers
|
||||
```bash
|
||||
grep -c "^>" output_default.fasta
|
||||
```
|
||||
Vérifie qu'au moins un super k-mer a été extrait.
|
||||
|
||||
#### 5. Présence des métadonnées
|
||||
```bash
|
||||
grep -q "minimizer_value" output_default.fasta
|
||||
grep -q "minimizer_seq" output_default.fasta
|
||||
grep -q "parent_id" output_default.fasta
|
||||
```
|
||||
Vérifie que les attributs requis sont présents.
|
||||
|
||||
#### 6. Extraction avec paramètres personnalisés
|
||||
```bash
|
||||
obisuperkmer -k 15 -m 7 test_sequences.fasta > output_k15_m7.fasta
|
||||
```
|
||||
Teste la configuration de k et m.
|
||||
|
||||
#### 7. Validation des paramètres personnalisés
|
||||
```bash
|
||||
grep -q '"k":15' output_k15_m7.fasta
|
||||
grep -q '"m":7' output_k15_m7.fasta
|
||||
```
|
||||
Vérifie que les paramètres sont correctement enregistrés.
|
||||
|
||||
#### 8. Format de sortie FASTA
|
||||
```bash
|
||||
obisuperkmer --fasta-output test_sequences.fasta > output_fasta.fasta
|
||||
```
|
||||
Teste l'option de format explicite.
|
||||
|
||||
#### 9. Vérification des IDs
|
||||
```bash
|
||||
grep "^>" output_default.fasta | grep -q "superkmer"
|
||||
```
|
||||
S'assure que les IDs contiennent "superkmer".
|
||||
|
||||
#### 10. Préservation des IDs parents
|
||||
```bash
|
||||
grep -q "seq1" output_default.fasta
|
||||
grep -q "seq2" output_default.fasta
|
||||
grep -q "seq3" output_default.fasta
|
||||
```
|
||||
Vérifie que les IDs des séquences parentes sont préservés.
|
||||
|
||||
#### 11. Option de fichier de sortie (`-o`)
|
||||
```bash
|
||||
obisuperkmer -o output_file.fasta test_sequences.fasta
|
||||
```
|
||||
Teste la redirection vers un fichier.
|
||||
|
||||
#### 12. Vérification de création du fichier
|
||||
```bash
|
||||
[ -s output_file.fasta ]
|
||||
```
|
||||
S'assure que le fichier a été créé.
|
||||
|
||||
#### 13. Cohérence des longueurs
|
||||
```bash
|
||||
# Vérifie que longueur(output) <= longueur(input)
|
||||
```
|
||||
S'assure que les super k-mers ne sont pas plus longs que l'entrée.
|
||||
|
||||
### Compteurs
|
||||
|
||||
- **ntest** : Nombre de tests exécutés
|
||||
- **success** : Nombre de tests réussis
|
||||
- **failed** : Nombre de tests échoués
|
||||
|
||||
### Sortie du script
|
||||
|
||||
#### En cas de succès
|
||||
```
|
||||
========================================
|
||||
## Results of the obisuperkmer tests:
|
||||
|
||||
- 12 tests run
|
||||
- 12 successfully completed
|
||||
- 0 failed tests
|
||||
|
||||
Cleaning up the temporary directory...
|
||||
|
||||
========================================
|
||||
```
|
||||
|
||||
Exit code : **0**
|
||||
|
||||
#### En cas d'échec
|
||||
```
|
||||
========================================
|
||||
## Results of the obisuperkmer tests:
|
||||
|
||||
- 12 tests run
|
||||
- 10 successfully completed
|
||||
- 2 failed tests
|
||||
|
||||
Cleaning up the temporary directory...
|
||||
|
||||
========================================
|
||||
```
|
||||
|
||||
Exit code : **1**
|
||||
|
||||
## Intégration CI/CD
|
||||
|
||||
### Exécution automatique
|
||||
|
||||
Le script est conçu pour être exécuté automatiquement dans un pipeline CI/CD :
|
||||
|
||||
1. Le build produit l'exécutable dans `build/obisuperkmer`
|
||||
2. Le script de test ajoute `build/` au PATH
|
||||
3. Les tests s'exécutent
|
||||
4. Le code de retour indique le succès (0) ou l'échec (1)
|
||||
|
||||
### Exemple de configuration CI/CD
|
||||
|
||||
```yaml
|
||||
# .github/workflows/test.yml ou équivalent
|
||||
test-obisuperkmer:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Build obitools
|
||||
run: make build
|
||||
- name: Test obisuperkmer
|
||||
run: ./obitests/obitools/obisuperkmer/test.sh
|
||||
```
|
||||
|
||||
### Avantages
|
||||
|
||||
✅ **Rapidité** : Données de test minimales (117 bytes)
|
||||
✅ **Fiabilité** : Tests reproductibles
|
||||
✅ **Isolation** : Utilisation d'un répertoire temporaire
|
||||
✅ **Nettoyage automatique** : Pas de fichiers résiduels
|
||||
✅ **Logging** : Messages horodatés et détaillés
|
||||
✅ **Compatibilité** : Pattern standard OBITools
|
||||
|
||||
## Exécution locale
|
||||
|
||||
### Prérequis
|
||||
|
||||
1. Compiler obisuperkmer :
|
||||
```bash
|
||||
cd /chemin/vers/obitools4
|
||||
go build -o build/obisuperkmer ./cmd/obitools/obisuperkmer/
|
||||
```
|
||||
|
||||
2. Se placer dans le répertoire de test :
|
||||
```bash
|
||||
cd obitests/obitools/obisuperkmer
|
||||
```
|
||||
|
||||
3. Exécuter le script :
|
||||
```bash
|
||||
./test.sh
|
||||
```
|
||||
|
||||
### Exemple de sortie
|
||||
|
||||
```
|
||||
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] Testing obisuperkmer...
|
||||
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] Test directory is /path/to/obitests/obitools/obisuperkmer
|
||||
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] obitools directory is /path/to/build
|
||||
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] Temporary directory is /tmp/tmp.abc123
|
||||
[obisuperkmer @ Fri Feb 7 13:00:00 CET 2026] files: README.md test.sh test_sequences.fasta
|
||||
[obisuperkmer @ Fri Feb 7 13:00:01 CET 2026] OBISuperkmer: printing help OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: basic extraction with default parameters OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: output file is not empty OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: extracted 8 super k-mers OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:02 CET 2026] OBISuperkmer: super k-mers contain required metadata OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: extraction with custom k=15, m=7 OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: custom parameters correctly set in metadata OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: FASTA output format OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: super k-mer IDs contain 'superkmer' OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:03 CET 2026] OBISuperkmer: parent sequence IDs preserved OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:04 CET 2026] OBISuperkmer: output to file with -o option OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:04 CET 2026] OBISuperkmer: output file created with -o option OK
|
||||
[obisuperkmer @ Fri Feb 7 13:00:04 CET 2026] OBISuperkmer: super k-mer total length <= input length OK
|
||||
========================================
|
||||
## Results of the obisuperkmer tests:
|
||||
|
||||
- 12 tests run
|
||||
- 12 successfully completed
|
||||
- 0 failed tests
|
||||
|
||||
Cleaning up the temporary directory...
|
||||
|
||||
========================================
|
||||
```
|
||||
|
||||
## Debugging des tests
|
||||
|
||||
### Conserver les fichiers temporaires
|
||||
|
||||
Modifier temporairement la fonction `cleanup()` :
|
||||
|
||||
```bash
|
||||
cleanup() {
|
||||
echo "Temporary directory: $TMPDIR" 1>&2
|
||||
# Commenter cette ligne pour conserver les fichiers
|
||||
# rm -rf "$TMPDIR"
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### Activer le mode verbose
|
||||
|
||||
Ajouter au début du script :
|
||||
|
||||
```bash
|
||||
set -x # Active l'affichage de toutes les commandes
|
||||
```
|
||||
|
||||
### Tester une seule commande
|
||||
|
||||
Extraire et exécuter manuellement :
|
||||
|
||||
```bash
|
||||
export TEST_DIR=/chemin/vers/obitests/obitools/obisuperkmer
|
||||
export TMPDIR=$(mktemp -d)
|
||||
obisuperkmer "${TEST_DIR}/test_sequences.fasta" > "${TMPDIR}/output.fasta"
|
||||
cat "${TMPDIR}/output.fasta"
|
||||
```
|
||||
|
||||
## Ajout de nouveaux tests
|
||||
|
||||
Pour ajouter un test supplémentaire :
|
||||
|
||||
1. Incrémenter le compteur `ntest`
|
||||
2. Écrire la condition de test
|
||||
3. Logger le succès ou l'échec
|
||||
4. Incrémenter le bon compteur
|
||||
|
||||
```bash
|
||||
((ntest++))
|
||||
if ma_nouvelle_commande_de_test
|
||||
then
|
||||
log "Description du test: OK"
|
||||
((success++))
|
||||
else
|
||||
log "Description du test: failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
## Comparaison avec d'autres tests
|
||||
|
||||
### Taille des données de test
|
||||
|
||||
| Commande | Taille des données | Nombre de fichiers |
|
||||
|----------|-------------------|-------------------|
|
||||
| obiconvert | 925 KB | 1 fichier |
|
||||
| obiuniq | ~600 bytes | 4 fichiers |
|
||||
| obimicrosat | 0 bytes | 0 fichiers (génère à la volée) |
|
||||
| **obisuperkmer** | **117 bytes** | **1 fichier** |
|
||||
|
||||
Notre test `obisuperkmer` est parmi les plus légers, ce qui est optimal pour CI/CD.
|
||||
|
||||
### Nombre de tests
|
||||
|
||||
| Commande | Nombre de tests |
|
||||
|----------|----------------|
|
||||
| obiconvert | 3 tests |
|
||||
| obiuniq | 7 tests |
|
||||
| obimicrosat | 1 test |
|
||||
| **obisuperkmer** | **12 tests** |
|
||||
|
||||
Notre test `obisuperkmer` offre une couverture complète avec 12 tests différents.
|
||||
|
||||
## Couverture de test
|
||||
|
||||
Les tests couvrent :
|
||||
|
||||
✅ Affichage de l'aide
|
||||
✅ Exécution basique
|
||||
✅ Paramètres par défaut (k=21, m=11)
|
||||
✅ Paramètres personnalisés (k=15, m=7)
|
||||
✅ Formats de sortie (FASTA)
|
||||
✅ Redirection vers fichier (`-o`)
|
||||
✅ Présence des métadonnées
|
||||
✅ Validation des IDs
|
||||
✅ Préservation des IDs parents
|
||||
✅ Cohérence des longueurs
|
||||
✅ Production de résultats non vides
|
||||
|
||||
## Maintenance
|
||||
|
||||
### Mise à jour des tests
|
||||
|
||||
Si l'implémentation de `obisuperkmer` change :
|
||||
|
||||
1. Vérifier que les tests existants passent toujours
|
||||
2. Ajouter de nouveaux tests pour les nouvelles fonctionnalités
|
||||
3. Mettre à jour `README.md` si nécessaire
|
||||
4. Documenter les changements
|
||||
|
||||
### Vérification régulière
|
||||
|
||||
Exécuter périodiquement :
|
||||
|
||||
```bash
|
||||
cd obitests/obitools/obisuperkmer
|
||||
./test.sh
|
||||
```
|
||||
|
||||
Ou via l'ensemble des tests :
|
||||
|
||||
```bash
|
||||
cd obitests
|
||||
for dir in obitools/*/; do
|
||||
if [ -f "$dir/test.sh" ]; then
|
||||
echo "Testing $(basename $dir)..."
|
||||
(cd "$dir" && ./test.sh) || echo "FAILED: $(basename $dir)"
|
||||
fi
|
||||
done
|
||||
```
|
||||
|
||||
## Conclusion
|
||||
|
||||
Les tests pour `obisuperkmer` sont :
|
||||
|
||||
- ✅ **Complets** : 12 tests couvrant toutes les fonctionnalités principales
|
||||
- ✅ **Légers** : 117 bytes de données de test
|
||||
- ✅ **Rapides** : Exécution en quelques secondes
|
||||
- ✅ **Fiables** : Pattern éprouvé utilisé par toutes les commandes OBITools
|
||||
- ✅ **Maintenables** : Structure claire et documentée
|
||||
- ✅ **CI/CD ready** : Code de retour approprié et nettoyage automatique
|
||||
|
||||
Ils garantissent que la commande fonctionne correctement à chaque commit et facilitent la détection précoce des régressions.
|
||||
34
cmd/obitools/obisuperkmer/main.go
Normal file
34
cmd/obitools/obisuperkmer/main.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisuperkmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Generate option parser
|
||||
optionParser := obioptions.GenerateOptionParser(
|
||||
"obisuperkmer",
|
||||
"extract super k-mers from sequence files",
|
||||
obisuperkmer.OptionSet)
|
||||
|
||||
// Parse command-line arguments
|
||||
_, args := optionParser(os.Args)
|
||||
|
||||
// Read input sequences
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||
|
||||
// Extract super k-mers
|
||||
superkmers := obisuperkmer.CLIExtractSuperKmers(sequences)
|
||||
|
||||
// Write output sequences
|
||||
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||
|
||||
// Wait for pipeline completion
|
||||
obiutils.WaitForLastPipe()
|
||||
}
|
||||
148
obitests/obitools/obisuperkmer/README.md
Normal file
148
obitests/obitools/obisuperkmer/README.md
Normal file
@@ -0,0 +1,148 @@
|
||||
# Tests pour obisuperkmer
|
||||
|
||||
## Description
|
||||
|
||||
Ce répertoire contient les tests automatisés pour la commande `obisuperkmer`.
|
||||
|
||||
## Fichiers
|
||||
|
||||
- `test.sh` : Script de test principal (exécutable)
|
||||
- `test_sequences.fasta` : Jeu de données de test minimal (3 séquences courtes)
|
||||
- `README.md` : Ce fichier
|
||||
|
||||
## Jeu de données de test
|
||||
|
||||
Le fichier `test_sequences.fasta` contient 3 séquences de 32 nucléotides chacune :
|
||||
|
||||
1. **seq1** : Répétition du motif ACGT (séquence régulière)
|
||||
2. **seq2** : Alternance de blocs homopolymères (AAAA, CCCC, GGGG, TTTT)
|
||||
3. **seq3** : Répétition du motif ATCG (différent de seq1)
|
||||
|
||||
Ces séquences sont volontairement courtes pour :
|
||||
- Minimiser la taille du dépôt Git
|
||||
- Accélérer l'exécution des tests en CI/CD
|
||||
- Tester différents cas d'extraction de super k-mers
|
||||
|
||||
## Tests effectués
|
||||
|
||||
Le script `test.sh` effectue 12 tests :
|
||||
|
||||
### Test 1 : Affichage de l'aide
|
||||
Vérifie que `obisuperkmer -h` s'exécute correctement.
|
||||
|
||||
### Test 2 : Extraction basique avec paramètres par défaut
|
||||
Exécute `obisuperkmer` avec k=21, m=11 (valeurs par défaut).
|
||||
|
||||
### Test 3 : Vérification du fichier de sortie non vide
|
||||
S'assure que la commande produit une sortie.
|
||||
|
||||
### Test 4 : Comptage des super k-mers extraits
|
||||
Vérifie qu'au moins un super k-mer a été extrait.
|
||||
|
||||
### Test 5 : Présence des métadonnées requises
|
||||
Vérifie que chaque super k-mer contient :
|
||||
- `minimizer_value`
|
||||
- `minimizer_seq`
|
||||
- `parent_id`
|
||||
|
||||
### Test 6 : Extraction avec paramètres personnalisés
|
||||
Teste avec k=15 et m=7.
|
||||
|
||||
### Test 7 : Vérification des paramètres dans les métadonnées
|
||||
S'assure que les valeurs k=15 et m=7 sont présentes dans la sortie.
|
||||
|
||||
### Test 8 : Format de sortie FASTA explicite
|
||||
Teste l'option `--fasta-output`.
|
||||
|
||||
### Test 9 : Vérification des IDs des super k-mers
|
||||
S'assure que tous les IDs contiennent "superkmer".
|
||||
|
||||
### Test 10 : Préservation des IDs parents
|
||||
Vérifie que seq1, seq2 et seq3 apparaissent dans la sortie.
|
||||
|
||||
### Test 11 : Option -o pour fichier de sortie
|
||||
Teste la redirection vers un fichier avec `-o`.
|
||||
|
||||
### Test 12 : Vérification de la création du fichier avec -o
|
||||
S'assure que le fichier de sortie a été créé.
|
||||
|
||||
### Test 13 : Cohérence des longueurs
|
||||
Vérifie que la somme des longueurs des super k-mers est inférieure ou égale à la longueur totale des séquences d'entrée.
|
||||
|
||||
## Exécution des tests
|
||||
|
||||
### Localement
|
||||
|
||||
```bash
|
||||
cd /chemin/vers/obitools4/obitests/obitools/obisuperkmer
|
||||
./test.sh
|
||||
```
|
||||
|
||||
### En CI/CD
|
||||
|
||||
Les tests sont automatiquement exécutés lors de chaque commit via le système CI/CD configuré pour le projet.
|
||||
|
||||
### Prérequis
|
||||
|
||||
- La commande `obisuperkmer` doit être compilée et disponible dans `../../build/`
|
||||
- Les dépendances système : bash, grep, etc.
|
||||
|
||||
## Structure du script de test
|
||||
|
||||
Le script suit le pattern standard utilisé par tous les tests OBITools :
|
||||
|
||||
1. **En-tête** : Définition du nom du test et de la commande
|
||||
2. **Variables** : Configuration des chemins et compteurs
|
||||
3. **Fonction cleanup()** : Affiche les résultats et nettoie le répertoire temporaire
|
||||
4. **Fonction log()** : Affiche les messages horodatés
|
||||
5. **Tests** : Série de tests avec incrémentation des compteurs
|
||||
6. **Appel cleanup()** : Nettoyage et sortie avec code de retour approprié
|
||||
|
||||
## Format de sortie
|
||||
|
||||
Chaque test affiche :
|
||||
```
|
||||
[obisuperkmer @ date] message
|
||||
```
|
||||
|
||||
En fin d'exécution :
|
||||
```
|
||||
========================================
|
||||
## Results of the obisuperkmer tests:
|
||||
|
||||
- 12 tests run
|
||||
- 12 successfully completed
|
||||
- 0 failed tests
|
||||
|
||||
Cleaning up the temporary directory...
|
||||
|
||||
========================================
|
||||
```
|
||||
|
||||
## Codes de retour
|
||||
|
||||
- **0** : Tous les tests ont réussi
|
||||
- **1** : Au moins un test a échoué
|
||||
|
||||
## Ajout de nouveaux tests
|
||||
|
||||
Pour ajouter un nouveau test, suivre le pattern :
|
||||
|
||||
```bash
|
||||
((ntest++))
|
||||
if commande_test arguments
|
||||
then
|
||||
log "Description: OK"
|
||||
((success++))
|
||||
else
|
||||
log "Description: failed"
|
||||
((failed++))
|
||||
fi
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Les fichiers temporaires sont créés dans `$TMPDIR` (créé par mktemp)
|
||||
- Les fichiers de données sont dans `$TEST_DIR`
|
||||
- La commande testée doit être dans `$OBITOOLS_DIR` (../../build/)
|
||||
- Le répertoire temporaire est automatiquement nettoyé à la fin
|
||||
253
obitests/obitools/obisuperkmer/test.sh
Executable file
253
obitests/obitools/obisuperkmer/test.sh
Executable file
@@ -0,0 +1,253 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Here give the name of the test serie
|
||||
#
|
||||
|
||||
TEST_NAME=obisuperkmer
|
||||
CMD=obisuperkmer
|
||||
|
||||
######
|
||||
#
|
||||
# Some variable and function definitions: please don't change them
|
||||
#
|
||||
######
|
||||
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() {
|
||||
echo "========================================" 1>&2
|
||||
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||
|
||||
echo 1>&2
|
||||
echo "- $ntest tests run" 1>&2
|
||||
echo "- $success successfully completed" 1>&2
|
||||
echo "- $failed failed tests" 1>&2
|
||||
echo 1>&2
|
||||
echo "Cleaning up the temporary directory..." 1>&2
|
||||
echo 1>&2
|
||||
echo "========================================" 1>&2
|
||||
|
||||
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
log "$TEST_NAME tests failed"
|
||||
log
|
||||
log
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log
|
||||
log
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
log() {
|
||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||
}
|
||||
|
||||
log "Testing $TEST_NAME..."
|
||||
log "Test directory is $TEST_DIR"
|
||||
log "obitools directory is $OBITOOLS_DIR"
|
||||
log "Temporary directory is $TMPDIR"
|
||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
|
||||
######################################################################
|
||||
####
|
||||
#### Below are the tests
|
||||
####
|
||||
#### Before each test :
|
||||
#### - increment the variable ntest
|
||||
####
|
||||
#### Run the command as the condition of an if / then /else
|
||||
#### - The command must return 0 on success
|
||||
#### - The command must return an exit code different from 0 on failure
|
||||
#### - The datafiles are stored in the same directory than the test script
|
||||
#### - The test script directory is stored in the TEST_DIR variable
|
||||
#### - If result files have to be produced they must be stored
|
||||
#### in the temporary directory (TMPDIR variable)
|
||||
####
|
||||
#### then clause is executed on success of the command
|
||||
#### - Write a success message using the log function
|
||||
#### - increment the variable success
|
||||
####
|
||||
#### else clause is executed on failure of the command
|
||||
#### - Write a failure message using the log function
|
||||
#### - increment the variable failed
|
||||
####
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
((ntest++))
|
||||
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
||||
then
|
||||
log "$MCMD: printing help OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: printing help failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 1: Basic super k-mer extraction with default parameters
|
||||
((ntest++))
|
||||
if obisuperkmer "${TEST_DIR}/test_sequences.fasta" \
|
||||
> "${TMPDIR}/output_default.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: basic extraction with default parameters OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: basic extraction with default parameters failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 2: Verify output is not empty
|
||||
((ntest++))
|
||||
if [ -s "${TMPDIR}/output_default.fasta" ]
|
||||
then
|
||||
log "$MCMD: output file is not empty OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: output file is empty - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 3: Count number of super k-mers extracted (should be > 0)
|
||||
((ntest++))
|
||||
num_sequences=$(grep -c "^>" "${TMPDIR}/output_default.fasta")
|
||||
if [ "$num_sequences" -gt 0 ]
|
||||
then
|
||||
log "$MCMD: extracted $num_sequences super k-mers OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: no super k-mers extracted - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 4: Verify super k-mers have required metadata attributes
|
||||
((ntest++))
|
||||
if grep -q "minimizer_value" "${TMPDIR}/output_default.fasta" && \
|
||||
grep -q "minimizer_seq" "${TMPDIR}/output_default.fasta" && \
|
||||
grep -q "parent_id" "${TMPDIR}/output_default.fasta"
|
||||
then
|
||||
log "$MCMD: super k-mers contain required metadata OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: super k-mers missing metadata - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 5: Extract super k-mers with custom k and m parameters
|
||||
((ntest++))
|
||||
if obisuperkmer -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
|
||||
> "${TMPDIR}/output_k15_m7.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: extraction with custom k=15, m=7 OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: extraction with custom k=15, m=7 failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 6: Verify custom parameters in output metadata
|
||||
((ntest++))
|
||||
if grep -q '"k":15' "${TMPDIR}/output_k15_m7.fasta" && \
|
||||
grep -q '"m":7' "${TMPDIR}/output_k15_m7.fasta"
|
||||
then
|
||||
log "$MCMD: custom parameters correctly set in metadata OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: custom parameters not in metadata - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 7: Test with different output format (FASTA output explicitly)
|
||||
((ntest++))
|
||||
if obisuperkmer --fasta-output -k 21 -m 11 \
|
||||
"${TEST_DIR}/test_sequences.fasta" \
|
||||
> "${TMPDIR}/output_fasta.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: FASTA output format OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: FASTA output format failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 8: Verify all super k-mers have superkmer in their ID
|
||||
((ntest++))
|
||||
if grep "^>" "${TMPDIR}/output_default.fasta" | grep -q "superkmer"
|
||||
then
|
||||
log "$MCMD: super k-mer IDs contain 'superkmer' OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: super k-mer IDs missing 'superkmer' - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 9: Verify parent sequence IDs are preserved
|
||||
((ntest++))
|
||||
if grep -q "seq1" "${TMPDIR}/output_default.fasta" && \
|
||||
grep -q "seq2" "${TMPDIR}/output_default.fasta" && \
|
||||
grep -q "seq3" "${TMPDIR}/output_default.fasta"
|
||||
then
|
||||
log "$MCMD: parent sequence IDs preserved OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: parent sequence IDs not preserved - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 10: Test with output file option
|
||||
((ntest++))
|
||||
if obisuperkmer -o "${TMPDIR}/output_file.fasta" \
|
||||
"${TEST_DIR}/test_sequences.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: output to file with -o option OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: output to file with -o option failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 11: Verify output file was created with -o option
|
||||
((ntest++))
|
||||
if [ -s "${TMPDIR}/output_file.fasta" ]
|
||||
then
|
||||
log "$MCMD: output file created with -o option OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: output file not created with -o option - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Test 12: Verify each super k-mer length is >= k (default k=31)
|
||||
((ntest++))
|
||||
min_len=$(grep -v "^>" "${TMPDIR}/output_default.fasta" | awk '{print length}' | sort -n | head -1)
|
||||
|
||||
if [ "$min_len" -ge 31 ]
|
||||
then
|
||||
log "$MCMD: all super k-mers have length >= k OK"
|
||||
((success++))
|
||||
else
|
||||
log "$MCMD: some super k-mers shorter than k ($min_len < 31) - failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
# At the end of the tests
|
||||
# the cleanup function is called
|
||||
#
|
||||
#########################################
|
||||
|
||||
cleanup
|
||||
6
obitests/obitools/obisuperkmer/test_sequences.fasta
Normal file
6
obitests/obitools/obisuperkmer/test_sequences.fasta
Normal file
@@ -0,0 +1,6 @@
|
||||
>seq1
|
||||
ACGTACGTACGTACGTACGTACGTACGTACGT
|
||||
>seq2
|
||||
AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
|
||||
>seq3
|
||||
ATCGATCGATCGATCGATCGATCGATCGATCG
|
||||
@@ -195,6 +195,59 @@ else
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
##
|
||||
## Test merge attributes consistency between in-memory and on-disk paths
|
||||
## This test catches the bug where the shared classifier in the on-disk
|
||||
## dereplication path caused incorrect merged attributes.
|
||||
##
|
||||
|
||||
((ntest++))
|
||||
if obiuniq -m a -m b --in-memory \
|
||||
"${TEST_DIR}/touniq.fasta" \
|
||||
> "${TMPDIR}/touniq_u_merge_mem.fasta" 2>/dev/null
|
||||
then
|
||||
log "OBIUniq merge in-memory: running OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq merge in-memory: running failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obiuniq -m a -m b --chunk-count 4 \
|
||||
"${TEST_DIR}/touniq.fasta" \
|
||||
> "${TMPDIR}/touniq_u_merge_disk.fasta" 2>/dev/null
|
||||
then
|
||||
log "OBIUniq merge on-disk: running OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq merge on-disk: running failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
# Extract sorted annotations (JSON attributes) from both outputs
|
||||
# to compare merge results independently of sequence ordering
|
||||
grep '^>' "${TMPDIR}/touniq_u_merge_mem.fasta" \
|
||||
| sed 's/^>seq[0-9]* //' \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_merge_mem.json"
|
||||
|
||||
grep '^>' "${TMPDIR}/touniq_u_merge_disk.fasta" \
|
||||
| sed 's/^>seq[0-9]* //' \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_merge_disk.json"
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/touniq_u_merge_mem.json" \
|
||||
"${TMPDIR}/touniq_u_merge_disk.json" > /dev/null
|
||||
then
|
||||
log "OBIUniq merge on-disk vs in-memory: result OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq merge on-disk vs in-memory: result failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
# At the end of the tests
|
||||
|
||||
@@ -110,6 +110,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
||||
log.Infof("Data splitted over %d batches", nbatch)
|
||||
|
||||
go func() {
|
||||
localClassifier := uniqueClassifier.Clone()
|
||||
|
||||
for order, file := range fileNames {
|
||||
iseq, err := obiformats.ReadSequencesFromFile(file)
|
||||
@@ -121,7 +122,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
||||
if dereplicate {
|
||||
u := make(map[string]*obiseq.BioSequence)
|
||||
var source string
|
||||
uniqueClassifier.Reset()
|
||||
localClassifier.Reset()
|
||||
|
||||
for iseq.Next() {
|
||||
batch := iseq.Get()
|
||||
@@ -129,8 +130,8 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
||||
|
||||
for _, seq := range batch.Slice() {
|
||||
// Use composite key: sequence + categories
|
||||
code := uniqueClassifier.Code(seq)
|
||||
key := uniqueClassifier.Value(code)
|
||||
code := localClassifier.Code(seq)
|
||||
key := localClassifier.Value(code)
|
||||
prev, ok := u[key]
|
||||
if ok {
|
||||
prev.Merge(seq, na, true, statsOn)
|
||||
|
||||
@@ -27,22 +27,26 @@ func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
|
||||
npar := min(obidefault.ParallelWorkers(), nmatcher)
|
||||
mutex.Add(npar)
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Building AhoCorasick matcher..."),
|
||||
)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Building AhoCorasick matcher..."),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(nmatcher, pbopt...)
|
||||
bar.Add(0)
|
||||
bar = progressbar.NewOptions(nmatcher, pbopt...)
|
||||
}
|
||||
|
||||
builder := func() {
|
||||
for i := range ieme {
|
||||
matchers[i] = ahocorasick.CompileStrings(patterns[i*sizebatch:min((i+1)*sizebatch,len(patterns))])
|
||||
bar.Add(1)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
mutex.Done()
|
||||
}
|
||||
|
||||
19
pkg/obidefault/progressbar.go
Normal file
19
pkg/obidefault/progressbar.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package obidefault
|
||||
|
||||
var __no_progress_bar__ = false
|
||||
|
||||
func ProgressBar() bool {
|
||||
return !__no_progress_bar__
|
||||
}
|
||||
|
||||
func NoProgressBar() bool {
|
||||
return __no_progress_bar__
|
||||
}
|
||||
|
||||
func SetNoProgressBar(b bool) {
|
||||
__no_progress_bar__ = b
|
||||
}
|
||||
|
||||
func NoProgressBarPtr() *bool {
|
||||
return &__no_progress_bar__
|
||||
}
|
||||
@@ -5,18 +5,30 @@ import (
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
func (iterator IBioSequence) Speed(message string, size ...int) IBioSequence {
|
||||
|
||||
// If the STDERR is redicted and doesn't end up to a terminal
|
||||
// If the progress bar is disabled via --no-progressbar option
|
||||
if !obidefault.ProgressBar() {
|
||||
return iterator
|
||||
}
|
||||
|
||||
// If the STDERR is redirected and doesn't end up to a terminal
|
||||
// No progress bar is printed.
|
||||
o, _ := os.Stderr.Stat()
|
||||
if (o.Mode() & os.ModeCharDevice) != os.ModeCharDevice {
|
||||
return iterator
|
||||
}
|
||||
|
||||
// If stdout is piped, no progress bar is printed.
|
||||
oo, _ := os.Stdout.Stat()
|
||||
if (oo.Mode() & os.ModeNamedPipe) == os.ModeNamedPipe {
|
||||
return iterator
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
@@ -447,141 +447,6 @@ func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. A minimizer is the smallest canonical m-mer
|
||||
// among the (k-m+1) m-mers contained in a k-mer.
|
||||
type SuperKmer struct {
|
||||
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
||||
Start int // Starting position in the original sequence (0-indexed)
|
||||
End int // Ending position (exclusive, like Go slice notation)
|
||||
Sequence []byte // The actual DNA subsequence [Start:End]
|
||||
}
|
||||
|
||||
// dequeItem represents an element in the monotone deque used for
|
||||
// tracking minimizers in a sliding window.
|
||||
type dequeItem struct {
|
||||
position int // Position of the m-mer in the sequence
|
||||
canonical uint64 // Canonical (normalized) m-mer value
|
||||
}
|
||||
|
||||
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. The minimizer of a k-mer is the smallest
|
||||
// canonical m-mer among its (k-m+1) constituent m-mers.
|
||||
//
|
||||
// The algorithm uses:
|
||||
// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
|
||||
// - Monotone deque for O(1) amortized minimizer tracking per position
|
||||
//
|
||||
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
||||
// available for error markers if needed.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between m+1 and 31)
|
||||
// - m: minimizer size (must be between 1 and k-1)
|
||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||
//
|
||||
// Returns:
|
||||
// - slice of SuperKmer structs representing maximal subsequences
|
||||
// - nil if parameters are invalid or sequence is too short
|
||||
//
|
||||
// Time complexity: O(n) where n is the sequence length
|
||||
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
||||
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
||||
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []SuperKmer
|
||||
if buffer == nil {
|
||||
estimatedSize := len(seq) / k
|
||||
if estimatedSize < 1 {
|
||||
estimatedSize = 1
|
||||
}
|
||||
result = make([]SuperKmer, 0, estimatedSize)
|
||||
} else {
|
||||
result = (*buffer)[:0]
|
||||
}
|
||||
|
||||
deque := make([]dequeItem, 0, k-m+1)
|
||||
|
||||
mMask := uint64(1)<<(m*2) - 1
|
||||
rcShift := uint((m - 1) * 2)
|
||||
|
||||
var fwdMmer, rvcMmer uint64
|
||||
for i := 0; i < m-1 && i < len(seq); i++ {
|
||||
code := uint64(__single_base_code__[seq[i]&31])
|
||||
fwdMmer = (fwdMmer << 2) | code
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
}
|
||||
|
||||
superKmerStart := 0
|
||||
var currentMinimizer uint64
|
||||
firstKmer := true
|
||||
|
||||
for pos := m - 1; pos < len(seq); pos++ {
|
||||
code := uint64(__single_base_code__[seq[pos]&31])
|
||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
|
||||
canonical := fwdMmer
|
||||
if rvcMmer < fwdMmer {
|
||||
canonical = rvcMmer
|
||||
}
|
||||
|
||||
mmerPos := pos - m + 1
|
||||
|
||||
if pos >= k-1 {
|
||||
windowStart := pos - k + 1
|
||||
for len(deque) > 0 && deque[0].position < windowStart {
|
||||
deque = deque[1:]
|
||||
}
|
||||
}
|
||||
|
||||
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
||||
deque = deque[:len(deque)-1]
|
||||
}
|
||||
|
||||
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
||||
|
||||
if pos >= k-1 {
|
||||
newMinimizer := deque[0].canonical
|
||||
kmerStart := pos - k + 1
|
||||
|
||||
if firstKmer {
|
||||
currentMinimizer = newMinimizer
|
||||
firstKmer = false
|
||||
} else if newMinimizer != currentMinimizer {
|
||||
endPos := kmerStart + k - 1
|
||||
superKmer := SuperKmer{
|
||||
Minimizer: currentMinimizer,
|
||||
Start: superKmerStart,
|
||||
End: endPos,
|
||||
Sequence: seq[superKmerStart:endPos],
|
||||
}
|
||||
result = append(result, superKmer)
|
||||
|
||||
superKmerStart = kmerStart
|
||||
currentMinimizer = newMinimizer
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !firstKmer {
|
||||
superKmer := SuperKmer{
|
||||
Minimizer: currentMinimizer,
|
||||
Start: superKmerStart,
|
||||
End: len(seq),
|
||||
Sequence: seq[superKmerStart:],
|
||||
}
|
||||
result = append(result, superKmer)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
||||
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
||||
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"sort"
|
||||
"unsafe"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
@@ -267,20 +268,23 @@ func NewKmerMap[T obifp.FPUint[T]](
|
||||
}
|
||||
|
||||
n := len(sequences)
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Indexing kmers"),
|
||||
)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Indexing kmers"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(n, pbopt...)
|
||||
bar = progressbar.NewOptions(n, pbopt...)
|
||||
}
|
||||
|
||||
for i, sequence := range sequences {
|
||||
kmap.Push(sequence, maxoccurs)
|
||||
if i%100 == 0 {
|
||||
if bar != nil && i%100 == 0 {
|
||||
bar.Add(100)
|
||||
}
|
||||
}
|
||||
|
||||
59
pkg/obikmer/superkmer.go
Normal file
59
pkg/obikmer/superkmer.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package obikmer
|
||||
|
||||
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer.
|
||||
type SuperKmer struct {
|
||||
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
||||
Start int // Starting position in the original sequence (0-indexed)
|
||||
End int // Ending position (exclusive, like Go slice notation)
|
||||
Sequence []byte // The actual DNA subsequence [Start:End]
|
||||
}
|
||||
|
||||
// dequeItem represents an element in the monotone deque used for
|
||||
// tracking minimizers in a sliding window.
|
||||
type dequeItem struct {
|
||||
position int // Position of the m-mer in the sequence
|
||||
canonical uint64 // Canonical (normalized) m-mer value
|
||||
}
|
||||
|
||||
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. The minimizer of a k-mer is the smallest
|
||||
// canonical m-mer among its (k-m+1) constituent m-mers.
|
||||
//
|
||||
// This function uses IterSuperKmers internally and collects results into a slice.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between m+1 and 31)
|
||||
// - m: minimizer size (must be between 1 and k-1)
|
||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||
//
|
||||
// Returns:
|
||||
// - slice of SuperKmer structs representing maximal subsequences
|
||||
// - nil if parameters are invalid or sequence is too short
|
||||
//
|
||||
// Time complexity: O(n) where n is the sequence length
|
||||
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
||||
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
||||
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []SuperKmer
|
||||
if buffer == nil {
|
||||
estimatedSize := len(seq) / k
|
||||
if estimatedSize < 1 {
|
||||
estimatedSize = 1
|
||||
}
|
||||
result = make([]SuperKmer, 0, estimatedSize)
|
||||
} else {
|
||||
result = (*buffer)[:0]
|
||||
}
|
||||
|
||||
for sk := range IterSuperKmers(seq, k, m) {
|
||||
result = append(result, sk)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
215
pkg/obikmer/superkmer_iter.go
Normal file
215
pkg/obikmer/superkmer_iter.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"iter"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// IterSuperKmers returns an iterator over super k-mers extracted from a DNA sequence.
|
||||
// It uses the same algorithm as ExtractSuperKmers but yields super k-mers one at a time.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between m+1 and 31)
|
||||
// - m: minimizer size (must be between 1 and k-1)
|
||||
//
|
||||
// Returns:
|
||||
// - An iterator that yields SuperKmer structs
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// for sk := range IterSuperKmers(sequence, 21, 11) {
|
||||
// fmt.Printf("SuperKmer at %d-%d with minimizer %d\n", sk.Start, sk.End, sk.Minimizer)
|
||||
// }
|
||||
func IterSuperKmers(seq []byte, k int, m int) iter.Seq[SuperKmer] {
|
||||
return func(yield func(SuperKmer) bool) {
|
||||
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||
return
|
||||
}
|
||||
|
||||
deque := make([]dequeItem, 0, k-m+1)
|
||||
|
||||
mMask := uint64(1)<<(m*2) - 1
|
||||
rcShift := uint((m - 1) * 2)
|
||||
|
||||
var fwdMmer, rvcMmer uint64
|
||||
for i := 0; i < m-1 && i < len(seq); i++ {
|
||||
code := uint64(__single_base_code__[seq[i]&31])
|
||||
fwdMmer = (fwdMmer << 2) | code
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
}
|
||||
|
||||
superKmerStart := 0
|
||||
var currentMinimizer uint64
|
||||
firstKmer := true
|
||||
|
||||
for pos := m - 1; pos < len(seq); pos++ {
|
||||
code := uint64(__single_base_code__[seq[pos]&31])
|
||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
|
||||
canonical := fwdMmer
|
||||
if rvcMmer < fwdMmer {
|
||||
canonical = rvcMmer
|
||||
}
|
||||
|
||||
mmerPos := pos - m + 1
|
||||
|
||||
if pos >= k-1 {
|
||||
windowStart := pos - k + 1
|
||||
for len(deque) > 0 && deque[0].position < windowStart {
|
||||
deque = deque[1:]
|
||||
}
|
||||
}
|
||||
|
||||
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
||||
deque = deque[:len(deque)-1]
|
||||
}
|
||||
|
||||
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
||||
|
||||
if pos >= k-1 {
|
||||
newMinimizer := deque[0].canonical
|
||||
kmerStart := pos - k + 1
|
||||
|
||||
if firstKmer {
|
||||
currentMinimizer = newMinimizer
|
||||
firstKmer = false
|
||||
} else if newMinimizer != currentMinimizer {
|
||||
endPos := kmerStart + k - 1
|
||||
superKmer := SuperKmer{
|
||||
Minimizer: currentMinimizer,
|
||||
Start: superKmerStart,
|
||||
End: endPos,
|
||||
Sequence: seq[superKmerStart:endPos],
|
||||
}
|
||||
if !yield(superKmer) {
|
||||
return
|
||||
}
|
||||
|
||||
superKmerStart = kmerStart
|
||||
currentMinimizer = newMinimizer
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !firstKmer && len(seq[superKmerStart:]) >= k {
|
||||
superKmer := SuperKmer{
|
||||
Minimizer: currentMinimizer,
|
||||
Start: superKmerStart,
|
||||
End: len(seq),
|
||||
Sequence: seq[superKmerStart:],
|
||||
}
|
||||
yield(superKmer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ToBioSequence converts a SuperKmer to a BioSequence with metadata.
|
||||
//
|
||||
// The resulting BioSequence contains:
|
||||
// - ID: "{parentID}_superkmer_{start}_{end}"
|
||||
// - Sequence: the actual DNA subsequence
|
||||
// - Attributes:
|
||||
// - "minimizer_value" (uint64): the canonical minimizer value
|
||||
// - "minimizer_seq" (string): the DNA sequence of the minimizer
|
||||
// - "k" (int): the k-mer size
|
||||
// - "m" (int): the minimizer size
|
||||
// - "start" (int): starting position in original sequence
|
||||
// - "end" (int): ending position in original sequence
|
||||
// - "parent_id" (string): ID of the parent sequence
|
||||
//
|
||||
// Parameters:
|
||||
// - k: k-mer size used for extraction
|
||||
// - m: minimizer size used for extraction
|
||||
// - parentID: ID of the parent sequence
|
||||
// - parentSource: source field from the parent sequence
|
||||
//
|
||||
// Returns:
|
||||
// - *obiseq.BioSequence: A new BioSequence representing this super k-mer
|
||||
func (sk *SuperKmer) ToBioSequence(k int, m int, parentID string, parentSource string) *obiseq.BioSequence {
|
||||
// Create ID for the super-kmer
|
||||
var id string
|
||||
if parentID != "" {
|
||||
id = fmt.Sprintf("%s_superkmer_%d_%d", parentID, sk.Start, sk.End)
|
||||
} else {
|
||||
id = fmt.Sprintf("superkmer_%d_%d", sk.Start, sk.End)
|
||||
}
|
||||
|
||||
// Create the BioSequence
|
||||
seq := obiseq.NewBioSequence(id, sk.Sequence, "")
|
||||
|
||||
// Copy source from parent
|
||||
if parentSource != "" {
|
||||
seq.SetSource(parentSource)
|
||||
}
|
||||
|
||||
// Set attributes
|
||||
seq.SetAttribute("minimizer_value", sk.Minimizer)
|
||||
|
||||
// Decode the minimizer to get its DNA sequence
|
||||
minimizerSeq := DecodeKmer(sk.Minimizer, m, nil)
|
||||
seq.SetAttribute("minimizer_seq", string(minimizerSeq))
|
||||
|
||||
seq.SetAttribute("k", k)
|
||||
seq.SetAttribute("m", m)
|
||||
seq.SetAttribute("start", sk.Start)
|
||||
seq.SetAttribute("end", sk.End)
|
||||
|
||||
if parentID != "" {
|
||||
seq.SetAttribute("parent_id", parentID)
|
||||
}
|
||||
|
||||
return seq
|
||||
}
|
||||
|
||||
// SuperKmerWorker creates a SeqWorker that extracts super k-mers from a BioSequence
|
||||
// and returns them as a slice of BioSequence objects.
|
||||
//
|
||||
// The worker copies the source field from the parent sequence to all extracted super k-mers.
|
||||
//
|
||||
// Parameters:
|
||||
// - k: k-mer size (must be between m+1 and 31)
|
||||
// - m: minimizer size (must be between 1 and k-1)
|
||||
//
|
||||
// Returns:
|
||||
// - SeqWorker: A worker function that can be used in obiiter pipelines
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// worker := SuperKmerWorker(21, 11)
|
||||
// iterator := iterator.MakeIWorker(worker, false)
|
||||
func SuperKmerWorker(k int, m int) obiseq.SeqWorker {
|
||||
return func(seq *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
if seq == nil {
|
||||
return obiseq.BioSequenceSlice{}, nil
|
||||
}
|
||||
|
||||
// Validate parameters
|
||||
if m < 1 || m >= k || k < 2 || k > 31 {
|
||||
return obiseq.BioSequenceSlice{}, fmt.Errorf(
|
||||
"invalid parameters: k=%d, m=%d (need 1 <= m < k <= 31)",
|
||||
k, m)
|
||||
}
|
||||
|
||||
sequence := seq.Sequence()
|
||||
if len(sequence) < k {
|
||||
return obiseq.BioSequenceSlice{}, nil
|
||||
}
|
||||
|
||||
parentID := seq.Id()
|
||||
parentSource := seq.Source()
|
||||
|
||||
// Extract super k-mers and convert to BioSequences
|
||||
result := make(obiseq.BioSequenceSlice, 0)
|
||||
|
||||
for sk := range IterSuperKmers(sequence, k, m) {
|
||||
bioSeq := sk.ToBioSequence(k, m, parentID, parentSource)
|
||||
result = append(result, bioSeq)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
}
|
||||
198
pkg/obikmer/superkmer_iter_test.go
Normal file
198
pkg/obikmer/superkmer_iter_test.go
Normal file
@@ -0,0 +1,198 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestIterSuperKmers(t *testing.T) {
|
||||
seq := []byte("ACGTACGTGGGGAAAA")
|
||||
k := 5
|
||||
m := 3
|
||||
|
||||
count := 0
|
||||
for sk := range IterSuperKmers(seq, k, m) {
|
||||
count++
|
||||
t.Logf("SuperKmer %d: Minimizer=%d, Start=%d, End=%d, Seq=%s",
|
||||
count, sk.Minimizer, sk.Start, sk.End, string(sk.Sequence))
|
||||
|
||||
// Verify sequence boundaries
|
||||
if sk.Start < 0 || sk.End > len(seq) {
|
||||
t.Errorf("Invalid boundaries: Start=%d, End=%d, seqLen=%d",
|
||||
sk.Start, sk.End, len(seq))
|
||||
}
|
||||
|
||||
// Verify sequence content
|
||||
if string(sk.Sequence) != string(seq[sk.Start:sk.End]) {
|
||||
t.Errorf("Sequence mismatch: expected %s, got %s",
|
||||
string(seq[sk.Start:sk.End]), string(sk.Sequence))
|
||||
}
|
||||
}
|
||||
|
||||
if count == 0 {
|
||||
t.Error("No super k-mers extracted")
|
||||
}
|
||||
|
||||
t.Logf("Total super k-mers extracted: %d", count)
|
||||
}
|
||||
|
||||
func TestIterSuperKmersVsSlice(t *testing.T) {
|
||||
seq := []byte("ACGTACGTGGGGAAAAACGTACGT")
|
||||
k := 7
|
||||
m := 4
|
||||
|
||||
// Extract using slice version
|
||||
sliceResult := ExtractSuperKmers(seq, k, m, nil)
|
||||
|
||||
// Extract using iterator version
|
||||
var iterResult []SuperKmer
|
||||
for sk := range IterSuperKmers(seq, k, m) {
|
||||
iterResult = append(iterResult, sk)
|
||||
}
|
||||
|
||||
// Compare counts
|
||||
if len(sliceResult) != len(iterResult) {
|
||||
t.Errorf("Different number of super k-mers: slice=%d, iter=%d",
|
||||
len(sliceResult), len(iterResult))
|
||||
}
|
||||
|
||||
// Compare each super k-mer
|
||||
for i := 0; i < len(sliceResult) && i < len(iterResult); i++ {
|
||||
slice := sliceResult[i]
|
||||
iter := iterResult[i]
|
||||
|
||||
if slice.Minimizer != iter.Minimizer {
|
||||
t.Errorf("SuperKmer %d: different minimizers: slice=%d, iter=%d",
|
||||
i, slice.Minimizer, iter.Minimizer)
|
||||
}
|
||||
|
||||
if slice.Start != iter.Start || slice.End != iter.End {
|
||||
t.Errorf("SuperKmer %d: different boundaries: slice=[%d:%d], iter=[%d:%d]",
|
||||
i, slice.Start, slice.End, iter.Start, iter.End)
|
||||
}
|
||||
|
||||
if string(slice.Sequence) != string(iter.Sequence) {
|
||||
t.Errorf("SuperKmer %d: different sequences: slice=%s, iter=%s",
|
||||
i, string(slice.Sequence), string(iter.Sequence))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSuperKmerMinimizerBijection validates the intrinsic property that
|
||||
// a super k-mer sequence has one and only one minimizer (bijection property).
|
||||
// This test ensures that:
|
||||
// 1. All k-mers in a super k-mer share the same minimizer
|
||||
// 2. Two identical super k-mer sequences must have the same minimizer
|
||||
func TestSuperKmerMinimizerBijection(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
seq []byte
|
||||
k int
|
||||
m int
|
||||
}{
|
||||
{
|
||||
name: "simple sequence",
|
||||
seq: []byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
||||
k: 21,
|
||||
m: 11,
|
||||
},
|
||||
{
|
||||
name: "homopolymer blocks",
|
||||
seq: []byte("AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT"),
|
||||
k: 21,
|
||||
m: 11,
|
||||
},
|
||||
{
|
||||
name: "complex sequence",
|
||||
seq: []byte("ATCGATCGATCGATCGATCGATCGATCGATCG"),
|
||||
k: 15,
|
||||
m: 7,
|
||||
},
|
||||
{
|
||||
name: "longer sequence",
|
||||
seq: []byte("ACGTACGTGGGGAAAAACGTACGTTTTTCCCCACGTACGT"),
|
||||
k: 13,
|
||||
m: 7,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Map to track sequence -> minimizer
|
||||
seqToMinimizer := make(map[string]uint64)
|
||||
|
||||
for sk := range IterSuperKmers(tc.seq, tc.k, tc.m) {
|
||||
seqStr := string(sk.Sequence)
|
||||
|
||||
// Check if we've seen this sequence before
|
||||
if prevMinimizer, exists := seqToMinimizer[seqStr]; exists {
|
||||
if prevMinimizer != sk.Minimizer {
|
||||
t.Errorf("BIJECTION VIOLATION: sequence %s has two different minimizers:\n"+
|
||||
" First: %d\n"+
|
||||
" Second: %d\n"+
|
||||
" This violates the super k-mer definition!",
|
||||
seqStr, prevMinimizer, sk.Minimizer)
|
||||
}
|
||||
} else {
|
||||
seqToMinimizer[seqStr] = sk.Minimizer
|
||||
}
|
||||
|
||||
// Verify all k-mers in this super k-mer have the same minimizer
|
||||
if len(sk.Sequence) >= tc.k {
|
||||
for i := 0; i <= len(sk.Sequence)-tc.k; i++ {
|
||||
kmerSeq := sk.Sequence[i : i+tc.k]
|
||||
minimizer := findMinimizer(kmerSeq, tc.k, tc.m)
|
||||
if minimizer != sk.Minimizer {
|
||||
t.Errorf("K-mer at position %d in super k-mer has different minimizer:\n"+
|
||||
" K-mer: %s\n"+
|
||||
" Expected minimizer: %d\n"+
|
||||
" Actual minimizer: %d\n"+
|
||||
" Super k-mer: %s",
|
||||
i, string(kmerSeq), sk.Minimizer, minimizer, seqStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// findMinimizer computes the minimizer of a k-mer for testing purposes
|
||||
func findMinimizer(kmer []byte, k int, m int) uint64 {
|
||||
if len(kmer) != k {
|
||||
return 0
|
||||
}
|
||||
|
||||
mMask := uint64(1)<<(m*2) - 1
|
||||
rcShift := uint((m - 1) * 2)
|
||||
|
||||
minMinimizer := uint64(^uint64(0)) // max uint64
|
||||
|
||||
// Scan all m-mers in the k-mer
|
||||
var fwdMmer, rvcMmer uint64
|
||||
for i := 0; i < m-1 && i < len(kmer); i++ {
|
||||
code := uint64(__single_base_code__[kmer[i]&31])
|
||||
fwdMmer = (fwdMmer << 2) | code
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
}
|
||||
|
||||
for i := m - 1; i < len(kmer); i++ {
|
||||
code := uint64(__single_base_code__[kmer[i]&31])
|
||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
|
||||
canonical := fwdMmer
|
||||
if rvcMmer < fwdMmer {
|
||||
canonical = rvcMmer
|
||||
}
|
||||
|
||||
if canonical < minMinimizer {
|
||||
minMinimizer = canonical
|
||||
}
|
||||
}
|
||||
|
||||
return minMinimizer
|
||||
}
|
||||
|
||||
// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate
|
||||
// integration test package to avoid circular dependencies between
|
||||
// obikmer and obiseq packages.
|
||||
@@ -3,7 +3,7 @@ package obioptions
|
||||
// Version is automatically updated by the Makefile from version.txt
|
||||
// The patch number (third digit) is incremented on each push to the repository
|
||||
|
||||
var _Version = "Release 4.4.10"
|
||||
var _Version = "Release 4.4.12"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
//
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
@@ -69,16 +70,18 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
}
|
||||
defer destfile.Close()
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(len(data), pbopt...)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
|
||||
)
|
||||
bar = progressbar.NewOptions(len(data), pbopt...)
|
||||
}
|
||||
|
||||
fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
|
||||
for code, dist := range data {
|
||||
@@ -101,7 +104,9 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
ratio.T,
|
||||
)
|
||||
}
|
||||
bar.Add(1)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,16 +186,18 @@ func SaveGMLGraphs(dirname string,
|
||||
}
|
||||
}
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Save GML Graph files]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(len(samples), pbopt...)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Save GML Graph files]"),
|
||||
)
|
||||
bar = progressbar.NewOptions(len(samples), pbopt...)
|
||||
}
|
||||
|
||||
for name, seqs := range samples {
|
||||
|
||||
@@ -204,7 +211,9 @@ func SaveGMLGraphs(dirname string,
|
||||
file.WriteString(Gml(seqs, name, statThreshold))
|
||||
file.Close()
|
||||
|
||||
bar.Add(1)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -495,37 +504,44 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
||||
npairs += nseq * (nseq - 1) / 2
|
||||
}
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[One error graph]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(npairs, pbopt...)
|
||||
for _, seqs := range samples {
|
||||
np := buildSamplePairs(seqs, workers)
|
||||
|
||||
bar.Add(np)
|
||||
}
|
||||
|
||||
if maxError > 1 {
|
||||
pbopt = make([]progressbar.Option, 0, 5)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Adds multiple errors]"),
|
||||
progressbar.OptionSetDescription("[One error graph]"),
|
||||
)
|
||||
|
||||
bar = progressbar.NewOptions(npairs, pbopt...)
|
||||
}
|
||||
|
||||
for _, seqs := range samples {
|
||||
np := extendSimilarityGraph(seqs, maxError, workers)
|
||||
for _, seqs := range samples {
|
||||
np := buildSamplePairs(seqs, workers)
|
||||
if bar != nil {
|
||||
bar.Add(np)
|
||||
}
|
||||
}
|
||||
|
||||
if maxError > 1 {
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Adds multiple errors]"),
|
||||
)
|
||||
bar = progressbar.NewOptions(npairs, pbopt...)
|
||||
}
|
||||
|
||||
for _, seqs := range samples {
|
||||
np := extendSimilarityGraph(seqs, maxError, workers)
|
||||
if bar != nil {
|
||||
bar.Add(np)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@ var __output_in_json__ = false
|
||||
var __output_fastjson_format__ = false
|
||||
var __output_fastobi_format__ = false
|
||||
|
||||
var __no_progress_bar__ = false
|
||||
var __skip_empty__ = false
|
||||
var __skip_on_error__ = false
|
||||
|
||||
@@ -82,7 +81,7 @@ func InputOptionSet(options *getoptions.GetOpt) {
|
||||
}
|
||||
|
||||
func OutputModeOptionSet(options *getoptions.GetOpt, compressed bool) {
|
||||
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
|
||||
options.BoolVar(obidefault.NoProgressBarPtr(), "no-progressbar", obidefault.NoProgressBar(),
|
||||
options.Description("Disable the progress bar printing"))
|
||||
|
||||
if compressed {
|
||||
@@ -224,13 +223,16 @@ func CLIAnalyzeOnly() int {
|
||||
|
||||
func CLIProgressBar() bool {
|
||||
// If the output is not a terminal, then we do not display the progress bar
|
||||
o, _ := os.Stderr.Stat()
|
||||
onTerminal := (o.Mode() & os.ModeCharDevice) == os.ModeCharDevice
|
||||
oe, _ := os.Stderr.Stat()
|
||||
onTerminal := (oe.Mode() & os.ModeCharDevice) == os.ModeCharDevice
|
||||
if !onTerminal {
|
||||
log.Info("Stderr is redirected, progress bar disabled")
|
||||
}
|
||||
|
||||
return onTerminal && !__no_progress_bar__
|
||||
oo, _ := os.Stdout.Stat()
|
||||
toPipe := (oo.Mode() & os.ModeNamedPipe) == os.ModeNamedPipe
|
||||
|
||||
return onTerminal && !toPipe && obidefault.ProgressBar()
|
||||
}
|
||||
|
||||
func CLIOutPutFileName() string {
|
||||
|
||||
@@ -210,9 +210,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
|
||||
}
|
||||
|
||||
if CLIProgressBar() {
|
||||
iterator = iterator.Speed("Reading sequences")
|
||||
}
|
||||
iterator = iterator.Speed("Reading sequences")
|
||||
|
||||
return iterator, nil
|
||||
}
|
||||
|
||||
@@ -12,9 +12,7 @@ import (
|
||||
func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
|
||||
terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {
|
||||
|
||||
if obiconvert.CLIProgressBar() {
|
||||
iterator = iterator.Speed("Writing CSV")
|
||||
}
|
||||
iterator = iterator.Speed("Writing CSV")
|
||||
|
||||
opts := make([]WithOption, 0, 10)
|
||||
|
||||
|
||||
@@ -42,16 +42,19 @@ func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int,
|
||||
|
||||
seqworld := obiutils.Make2DArray[float64](library_size, n_landmark)
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence mapping]"),
|
||||
)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence mapping]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(library_size, pbopt...)
|
||||
bar = progressbar.NewOptions(library_size, pbopt...)
|
||||
}
|
||||
|
||||
waiting := sync.WaitGroup{}
|
||||
waiting.Add(nworkers)
|
||||
@@ -66,7 +69,9 @@ func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int,
|
||||
match, lalign := obialign.FastLCSScore(landmark, seq, -1, &buffer)
|
||||
coord[j] = float64(lalign - match)
|
||||
}
|
||||
bar.Add(1)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
waiting.Done()
|
||||
}
|
||||
@@ -170,23 +175,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
taxa.Set(i, taxon)
|
||||
}
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence Indexing]"),
|
||||
)
|
||||
var bar2 *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence Indexing]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(len(library), pbopt...)
|
||||
bar2 = progressbar.NewOptions(len(library), pbopt...)
|
||||
}
|
||||
|
||||
for i, seq := range library {
|
||||
idx := obirefidx.GeomIndexSesquence(i, library, taxa, taxo)
|
||||
seq.SetOBITagGeomRefIndex(idx)
|
||||
|
||||
if i%10 == 0 {
|
||||
bar.Add(10)
|
||||
if bar2 != nil && i%10 == 0 {
|
||||
bar2.Add(10)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,8 +8,6 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// MaskingMode defines how to handle low-complexity regions
|
||||
@@ -37,62 +35,81 @@ const (
|
||||
// Returns: a SeqWorker function that can be applied to each sequence
|
||||
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 1: emax - Calculate theoretical maximum entropy
|
||||
// ========================================================================
|
||||
// Computes the maximum entropy of a k-mer of length lseq containing words of size word_size.
|
||||
//
|
||||
// Maximum entropy depends on the theoretical optimal word distribution:
|
||||
// - If we have more positions (nw) than possible canonical words (na),
|
||||
// some words will appear multiple times
|
||||
// - We calculate the entropy of a distribution where all words appear
|
||||
// cov or cov+1 times (most uniform distribution possible)
|
||||
//
|
||||
// IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words
|
||||
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
|
||||
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
|
||||
emax := func(lseq, word_size int) float64 {
|
||||
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
|
||||
na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization
|
||||
|
||||
// Case 1: Fewer positions than possible words
|
||||
// Maximum entropy is simply log(nw) since we can have at most nw different words
|
||||
if nw < na {
|
||||
return math.Log(float64(nw))
|
||||
}
|
||||
|
||||
// Case 2: More positions than possible words
|
||||
// Some words must appear multiple times
|
||||
cov := nw / na // Average coverage (average number of occurrences per word)
|
||||
remains := nw - (na * cov) // Number of words that will have one additional occurrence
|
||||
|
||||
// Calculate frequencies in the optimal distribution:
|
||||
// - (na - remains) words appear cov times → frequency f1 = cov/nw
|
||||
// - remains words appear (cov+1) times → frequency f2 = (cov+1)/nw
|
||||
f1 := float64(cov) / float64(nw)
|
||||
f2 := float64(cov+1) / float64(nw)
|
||||
|
||||
// Shannon entropy: H = -Σ p(i) * log(p(i))
|
||||
// where p(i) is the probability of observing word i
|
||||
return -(float64(na-remains)*f1*math.Log(f1) +
|
||||
float64(remains)*f2*math.Log(f2))
|
||||
nLogN := make([]float64, kmer_size+1)
|
||||
for i := 1; i <= kmer_size; i++ {
|
||||
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||
}
|
||||
|
||||
normTables := make([][]int, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
size := 1 << (ws * 2)
|
||||
normTables[ws] = make([]int, size)
|
||||
for code := 0; code < size; code++ {
|
||||
normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
|
||||
}
|
||||
}
|
||||
|
||||
type pair struct {
|
||||
index int
|
||||
value float64
|
||||
}
|
||||
|
||||
slidingMin := func(data []float64, window int) {
|
||||
if len(data) == 0 || window <= 0 {
|
||||
return
|
||||
}
|
||||
if window >= len(data) {
|
||||
minVal := data[0]
|
||||
for i := 1; i < len(data); i++ {
|
||||
if data[i] < minVal {
|
||||
minVal = data[i]
|
||||
}
|
||||
}
|
||||
for i := range data {
|
||||
data[i] = minVal
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
deque := make([]pair, 0, window)
|
||||
|
||||
for i, v := range data {
|
||||
for len(deque) > 0 && deque[0].index <= i-window {
|
||||
deque = deque[1:]
|
||||
}
|
||||
|
||||
for len(deque) > 0 && deque[len(deque)-1].value >= v {
|
||||
deque = deque[:len(deque)-1]
|
||||
}
|
||||
|
||||
deque = append(deque, pair{index: i, value: v})
|
||||
|
||||
data[i] = deque[0].value
|
||||
}
|
||||
}
|
||||
emaxValues := make([]float64, level_max+1)
|
||||
logNwords := make([]float64, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
nw := kmer_size - ws + 1
|
||||
na := obikmer.CanonicalCircularKmerCount(ws)
|
||||
if nw < na {
|
||||
logNwords[ws] = math.Log(float64(nw))
|
||||
emaxValues[ws] = math.Log(float64(nw))
|
||||
} else {
|
||||
cov := nw / na
|
||||
remains := nw - (na * cov)
|
||||
f1 := float64(cov) / float64(nw)
|
||||
f2 := float64(cov+1) / float64(nw)
|
||||
logNwords[ws] = math.Log(float64(nw))
|
||||
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||
float64(remains)*f2*math.Log(f2))
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities
|
||||
// ========================================================================
|
||||
// Identifies positions with ambiguous nucleotides (N, Y, R, etc.) and marks
|
||||
// all k-mers that contain them.
|
||||
//
|
||||
// Returns: a slice where maskPositions[i] = -1 if position i is part of a
|
||||
// k-mer containing an ambiguity, 0 otherwise
|
||||
maskAmbiguities := func(sequence []byte) []int {
|
||||
maskPositions := make([]int, len(sequence))
|
||||
for i, nuc := range sequence {
|
||||
// If nucleotide is not a, c, g or t (lowercase), it's an ambiguity
|
||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
// Mark all positions of k-mers that contain this nucleotide
|
||||
// A k-mer starting at position (i - kmer_size + 1) will contain position i
|
||||
end := max(0, i-kmer_size+1)
|
||||
for j := i; j >= end; j-- {
|
||||
maskPositions[j] = -1
|
||||
@@ -102,182 +119,87 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
return maskPositions
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 3: cleanTable - Reset a frequency table to zero
|
||||
// ========================================================================
|
||||
cleanTable := func(table []int, over int) {
|
||||
for i := 0; i < over; i++ {
|
||||
table[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 4: slidingMin - Calculate sliding minimum over a window
|
||||
// ========================================================================
|
||||
// Applies a sliding window of size window over data and replaces each
|
||||
// value with the minimum in the window centered on that position.
|
||||
//
|
||||
// Uses a MinMultiset to efficiently maintain the minimum in the window.
|
||||
slidingMin := func(data []float64, window int) {
|
||||
minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })
|
||||
ldata := len(data)
|
||||
mem := make([]float64, window) // Circular buffer to store window values
|
||||
|
||||
// Initialize buffer with sentinel value
|
||||
for i := range mem {
|
||||
mem[i] = 10000
|
||||
}
|
||||
|
||||
for i, v := range data {
|
||||
// Get the old value leaving the window
|
||||
m := mem[i%window]
|
||||
mem[i%window] = v
|
||||
|
||||
// Remove old value from multiset if it was valid
|
||||
if m < 10000 {
|
||||
minimier.RemoveOne(m)
|
||||
}
|
||||
|
||||
// Add new value if full window is ahead of us
|
||||
if (ldata - i) >= window {
|
||||
minimier.Add(v)
|
||||
}
|
||||
|
||||
// log.Warnf("taille du minimier %d @ %d", minimier.Len(), i)
|
||||
|
||||
// Retrieve and store current minimum
|
||||
var ok bool
|
||||
if data[i], ok = minimier.Min(); !ok {
|
||||
log.Error("problem with minimum entropy")
|
||||
data[i] = 0.0
|
||||
}
|
||||
|
||||
//xx, _ := minimier.Min()
|
||||
//log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx)
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 5: computeEntropies - Calculate normalized entropy for each position
|
||||
// ========================================================================
|
||||
// This is the central function that calculates the entropy of each k-mer in the sequence
|
||||
// at a given scale (wordSize).
|
||||
//
|
||||
// Algorithm:
|
||||
// 1. Encode the sequence into words (subsequences of size wordSize)
|
||||
// 2. For each k-mer, count the frequencies of words it contains
|
||||
// 3. Calculate normalized entropy = observed_entropy / maximum_entropy
|
||||
// 4. Apply a sliding min filter to smooth results
|
||||
//
|
||||
// IMPORTANT: Line 147 uses NormalizeInt for circular normalization of words!
|
||||
// This means "atg", "tga", and "gat" are considered the same word.
|
||||
computeEntropies := func(sequence []byte,
|
||||
maskPositions []int, // Positions of ambiguities
|
||||
entropies []float64, // Output: normalized entropies for each position
|
||||
table []int, // Frequency table for words (reused between calls)
|
||||
words []int, // Buffer to store encoded words (reused)
|
||||
wordSize int) { // Word size (scale of analysis)
|
||||
maskPositions []int,
|
||||
entropies []float64,
|
||||
table []int,
|
||||
words []int,
|
||||
wordSize int,
|
||||
normTable []int) {
|
||||
|
||||
lseq := len(sequence) // Sequence length
|
||||
tableSize := 1 << (wordSize * 2) // Actual table size (must fit all codes 0 to 4^wordSize-1)
|
||||
nwords := kmer_size - wordSize + 1 // Number of words in a k-mer
|
||||
lseq := len(sequence)
|
||||
tableSize := 1 << (wordSize * 2)
|
||||
nwords := kmer_size - wordSize + 1
|
||||
float_nwords := float64(nwords)
|
||||
log_nwords := math.Log(float_nwords) // log(nwords) used in entropy calculation
|
||||
entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally)
|
||||
log_nwords := logNwords[wordSize]
|
||||
entropyMax := emaxValues[wordSize]
|
||||
|
||||
// Reset frequency table (must clear entire table, not just nalpha entries)
|
||||
cleanTable(table, tableSize)
|
||||
|
||||
for i := 1; i < lseq; i++ {
|
||||
entropies[i] = 6
|
||||
}
|
||||
end := lseq - wordSize + 1 // Last position where a word can start
|
||||
end := lseq - wordSize + 1
|
||||
|
||||
// ========================================================================
|
||||
// STEP 1: Encode all words in the sequence
|
||||
// ========================================================================
|
||||
// Uses left-shift encoding: each nucleotide is encoded on 2 bits
|
||||
// a=00, c=01, g=10, t=11
|
||||
mask := (1 << (wordSize * 2)) - 1
|
||||
|
||||
mask := (1 << (wordSize * 2)) - 1 // Mask to keep only last wordSize*2 bits
|
||||
|
||||
// Initialize first word (all nucleotides except the last one)
|
||||
word_index := 0
|
||||
for i := 0; i < wordSize-1; i++ {
|
||||
word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
|
||||
}
|
||||
|
||||
// Encode all words with sliding window
|
||||
for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
|
||||
// Shift left by 2 bits, mask, and add new nucleotide
|
||||
word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
|
||||
|
||||
// *** CIRCULAR NORMALIZATION ***
|
||||
// Convert word to its canonical form (smallest by circular rotation)
|
||||
// This is where "atg", "tga", "gat" all become "atg"
|
||||
// Now using uint64-based NormalizeCircular for better performance
|
||||
words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize))
|
||||
words[i] = normTable[word_index]
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// STEP 2: Calculate entropy for each k-mer with sliding window
|
||||
// ========================================================================
|
||||
s := 0 // Number of words processed in current k-mer
|
||||
sum_n_logn := 0.0 // Sum of n*log(n) for entropy calculation
|
||||
entropy := 1.0 // Current normalized entropy
|
||||
cleaned := true // Flag indicating if table has been cleaned
|
||||
s := 0
|
||||
sum_n_logn := 0.0
|
||||
entropy := 1.0
|
||||
cleaned := true
|
||||
|
||||
for i := range end {
|
||||
s++
|
||||
|
||||
switch {
|
||||
// CASE 1: Filling phase (fewer than nwords words collected)
|
||||
case s < nwords:
|
||||
cleaned = false
|
||||
table[words[i]]++ // Increment word frequency
|
||||
table[words[i]]++
|
||||
|
||||
// CASE 2: Position contains an ambiguity
|
||||
case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
|
||||
entropies[i-nwords+1] = 4.0 // Mark entropy as invalid
|
||||
entropies[i-nwords+1] = 4.0
|
||||
if !cleaned {
|
||||
cleanTable(table, tableSize) // Reset table
|
||||
cleanTable(table, tableSize)
|
||||
}
|
||||
cleaned = true
|
||||
s = 0
|
||||
sum_n_logn = 0.0
|
||||
|
||||
// CASE 3: First complete k-mer (s == nwords)
|
||||
case s == nwords:
|
||||
cleaned = false
|
||||
table[words[i]]++
|
||||
|
||||
// Calculate Shannon entropy: H = -Σ p(i)*log(p(i))
|
||||
// = log(N) - (1/N)*Σ n(i)*log(n(i))
|
||||
// where N = nwords, n(i) = frequency of word i
|
||||
//
|
||||
// NOTE: We iterate over entire table (tableSize = 4^wordSize) to count all frequencies.
|
||||
// Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15})
|
||||
// so we must scan the full table even though only ~10 entries will be non-zero
|
||||
sum_n_logn = 0
|
||||
for j := range tableSize {
|
||||
n := float64(table[j])
|
||||
if n > 0 {
|
||||
sum_n_logn += n * math.Log(n)
|
||||
sum_n_logn += nLogN[int(n)]
|
||||
}
|
||||
}
|
||||
// Normalized entropy = observed entropy / maximum entropy
|
||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||
|
||||
// CASE 4: Sliding window (s > nwords)
|
||||
// Incremental update of entropy by adding a new word
|
||||
// and removing the old one
|
||||
case s > nwords:
|
||||
cleaned = false
|
||||
|
||||
new_word := words[i]
|
||||
old_word := words[i-nwords]
|
||||
|
||||
// Optimization: only recalculate if word changes
|
||||
if old_word != new_word {
|
||||
table[new_word]++
|
||||
table[old_word]--
|
||||
@@ -285,59 +207,39 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
n_old := float64(table[old_word])
|
||||
n_new := float64(table[new_word])
|
||||
|
||||
// Incremental update of sum_n_logn
|
||||
// Remove contribution of old word (before decrement)
|
||||
sum_n_logn -= (n_old + 1) * math.Log(n_old+1)
|
||||
// Add contribution of old word (after decrement)
|
||||
sum_n_logn -= nLogN[int(n_old+1)]
|
||||
if n_old > 0 {
|
||||
sum_n_logn += n_old * math.Log(n_old)
|
||||
sum_n_logn += nLogN[int(n_old)]
|
||||
}
|
||||
// Add contribution of new word (after increment)
|
||||
if n_new > 0 {
|
||||
sum_n_logn += n_new * math.Log(n_new)
|
||||
sum_n_logn += nLogN[int(n_new)]
|
||||
}
|
||||
// Remove contribution of new word (before increment)
|
||||
if n_new > 1 {
|
||||
sum_n_logn -= (n_new - 1) * math.Log(n_new-1)
|
||||
sum_n_logn -= nLogN[int(n_new-1)]
|
||||
}
|
||||
}
|
||||
|
||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||
}
|
||||
|
||||
// Store entropy for position corresponding to start of k-mer
|
||||
if s >= nwords && maskPositions[i-nwords+1] >= 0 {
|
||||
if entropy < 0 {
|
||||
entropy = 0
|
||||
|
||||
}
|
||||
entropy = math.Round(entropy*10000) / 10000
|
||||
entropies[i-nwords+1] = entropy
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// STEP 3: Apply sliding min filter
|
||||
// ========================================================================
|
||||
// Replace each entropy with minimum in window of size kmer_size
|
||||
// This allows robust detection of low-complexity regions
|
||||
slidingMin(entropies, kmer_size)
|
||||
// log.Warnf("%v\n%v", e, entropies)
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 6: applyMaskMode - Apply masking to sequence
|
||||
// ========================================================================
|
||||
applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
|
||||
// Create copy to avoid modifying original
|
||||
seqCopy := sequence.Copy()
|
||||
sequenceBytes := seqCopy.Sequence()
|
||||
|
||||
// Mask identified positions
|
||||
for i := range sequenceBytes {
|
||||
if maskPositions[i] {
|
||||
// Operation &^ 32 converts to UPPERCASE (clears bit 5)
|
||||
// sequenceBytes[i] = sequenceBytes[i] &^ 32
|
||||
sequenceBytes[i] = mask
|
||||
}
|
||||
}
|
||||
@@ -368,7 +270,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the case where we end in a masked region
|
||||
if inlow && fromlow >= 0 {
|
||||
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
||||
if err != nil {
|
||||
@@ -403,7 +304,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the case where we end in an unmasked region
|
||||
if inhigh && fromhigh >= 0 {
|
||||
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
||||
if err != nil {
|
||||
@@ -415,11 +315,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
return *rep, nil
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 7: masking - Main masking function
|
||||
// ========================================================================
|
||||
// Calculates entropies at all scales and masks positions
|
||||
// whose minimum entropy is below the threshold.
|
||||
masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
if sequence.Len() < kmer_size {
|
||||
sequence.SetAttribute("obilowmask_error", "Sequence too short")
|
||||
@@ -432,45 +327,27 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
|
||||
bseq := sequence.Sequence()
|
||||
|
||||
// Identify ambiguities
|
||||
maskPositions := maskAmbiguities(bseq)
|
||||
|
||||
// Initialize data structures
|
||||
mask := make([]int, len(bseq)) // Stores scale detecting minimum entropy
|
||||
entropies := make([]float64, len(bseq)) // Minimum entropy at each position
|
||||
mask := make([]int, len(bseq))
|
||||
entropies := make([]float64, len(bseq))
|
||||
for i := range entropies {
|
||||
entropies[i] = 4.0 // Very high initial value
|
||||
entropies[i] = 4.0
|
||||
}
|
||||
|
||||
freqs := make([]int, 1<<(2*level_max)) // Frequency table (max size)
|
||||
words := make([]int, len(bseq)) // Buffer for encoded words
|
||||
freqs := make([]int, 1<<(2*level_max))
|
||||
words := make([]int, len(bseq))
|
||||
entropies2 := make([]float64, len(bseq))
|
||||
|
||||
// ========================================================================
|
||||
// Calculate entropy at maximum scale (level_max)
|
||||
// ========================================================================
|
||||
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max)
|
||||
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
|
||||
|
||||
// Initialize mask with level_max everywhere (except ambiguities)
|
||||
for i := range bseq {
|
||||
v := level_max
|
||||
// if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
// v = 0
|
||||
// }
|
||||
mask[i] = v
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Calculate entropy at lower scales
|
||||
// ========================================================================
|
||||
entropies2 := make([]float64, len(bseq))
|
||||
|
||||
for ws := level_max - 1; ws > 0; ws-- {
|
||||
// *** WARNING: POTENTIAL BUG ***
|
||||
// The parameter passed is level_max instead of ws!
|
||||
// This means we always recalculate with the same scale
|
||||
// Should be: computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
|
||||
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
|
||||
// Keep minimum entropy and corresponding scale
|
||||
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
|
||||
for i, e2 := range entropies2 {
|
||||
if e2 < entropies[i] {
|
||||
entropies[i] = e2
|
||||
@@ -479,22 +356,17 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
}
|
||||
}
|
||||
|
||||
// Force entropy to 0 for ambiguous positions
|
||||
for i, nuc := range bseq {
|
||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
entropies[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Identify positions to mask
|
||||
// ========================================================================
|
||||
remove := make([]bool, len(entropies))
|
||||
for i, e := range entropies {
|
||||
remove[i] = e <= threshold
|
||||
}
|
||||
|
||||
// Save metadata in sequence attributes
|
||||
sequence.SetAttribute("mask", mask)
|
||||
sequence.SetAttribute("Entropies", entropies)
|
||||
|
||||
@@ -527,9 +399,7 @@ func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequenc
|
||||
CLIMaskingChar(),
|
||||
)
|
||||
|
||||
// Apply worker in parallel
|
||||
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
|
||||
|
||||
// Filter resulting empty sequences
|
||||
return newIter.FilterEmpty()
|
||||
}
|
||||
|
||||
40
pkg/obitools/obilowmask/obilowmask_test.go
Normal file
40
pkg/obitools/obilowmask/obilowmask_test.go
Normal file
@@ -0,0 +1,40 @@
|
||||
package obilowmask
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
func TestLowMaskWorker(t *testing.T) {
|
||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||
result, err := worker(seq)
|
||||
if err != nil {
|
||||
t.Fatalf("Worker failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Len() != 1 {
|
||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||
}
|
||||
|
||||
resultSeq := result[0]
|
||||
if resultSeq.Len() != 32 {
|
||||
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
|
||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||
result, err := worker(seq)
|
||||
if err != nil {
|
||||
t.Fatalf("Worker failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Len() != 1 {
|
||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||
}
|
||||
}
|
||||
@@ -207,16 +207,19 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
log.Infof("Done. Found %d clusters", clusters.Len())
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Cluster indexing"),
|
||||
)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Cluster indexing"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(len(clusters), pbopt...)
|
||||
bar = progressbar.NewOptions(len(clusters), pbopt...)
|
||||
}
|
||||
|
||||
limits := make(chan [2]int)
|
||||
waiting := sync.WaitGroup{}
|
||||
@@ -233,7 +236,9 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
for i := l[0]; i < l[1]; i++ {
|
||||
idx := IndexSequence(i, clusters, &kcluster, taxa, taxonomy)
|
||||
clusters[i].SetOBITagRefIndex(idx)
|
||||
bar.Add(1)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -239,16 +239,19 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
log.Info("done")
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence Processing]"),
|
||||
)
|
||||
var bar *progressbar.ProgressBar
|
||||
if obidefault.ProgressBar() {
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence Processing]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(len(references), pbopt...)
|
||||
bar = progressbar.NewOptions(len(references), pbopt...)
|
||||
}
|
||||
|
||||
limits := make(chan [2]int)
|
||||
indexed := obiiter.MakeIBioSequence()
|
||||
@@ -267,7 +270,9 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
iref := references[i].Copy()
|
||||
iref.SetOBITagRefIndex(idx)
|
||||
sl = append(sl, iref)
|
||||
bar.Add(1)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
indexed.Push(obiiter.MakeBioSequenceBatch(source, l[0]/10, sl))
|
||||
}
|
||||
|
||||
10
pkg/obitools/obisuperkmer/obisuperkmer.go
Normal file
10
pkg/obitools/obisuperkmer/obisuperkmer.go
Normal file
@@ -0,0 +1,10 @@
|
||||
// obisuperkmer function utility package.
|
||||
//
|
||||
// The obitools/obisuperkmer package contains every
|
||||
// function specifically required by the obisuperkmer utility.
|
||||
//
|
||||
// The obisuperkmer command extracts super k-mers from DNA sequences.
|
||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. This decomposition is useful for efficient
|
||||
// k-mer indexing and analysis in bioinformatics applications.
|
||||
package obisuperkmer
|
||||
69
pkg/obitools/obisuperkmer/options.go
Normal file
69
pkg/obitools/obisuperkmer/options.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Private variables for storing option values
|
||||
var _KmerSize = 31
|
||||
var _MinimizerSize = 13
|
||||
|
||||
// SuperKmerOptionSet defines every option related to super k-mer extraction.
|
||||
//
|
||||
// The function adds to a CLI every option proposed to the user
|
||||
// to tune the parameters of the super k-mer extraction algorithm.
|
||||
//
|
||||
// Parameters:
|
||||
// - options: is a pointer to a getoptions.GetOpt instance normally
|
||||
// produced by the obioptions.GenerateOptionParser function.
|
||||
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
||||
|
||||
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
|
||||
// the obisuperkmer command.
|
||||
//
|
||||
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(false)(options)
|
||||
SuperKmerOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns an integer representing the k-mer size.
|
||||
func CLIKmerSize() int {
|
||||
return _KmerSize
|
||||
}
|
||||
|
||||
// SetKmerSize sets the k-mer size for super k-mer extraction.
|
||||
//
|
||||
// Parameters:
|
||||
// - k: the k-mer size (must be between m+1 and 31).
|
||||
func SetKmerSize(k int) {
|
||||
_KmerSize = k
|
||||
}
|
||||
|
||||
// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns an integer representing the minimizer size.
|
||||
func CLIMinimizerSize() int {
|
||||
return _MinimizerSize
|
||||
}
|
||||
|
||||
// SetMinimizerSize sets the minimizer size for super k-mer extraction.
|
||||
//
|
||||
// Parameters:
|
||||
// - m: the minimizer size (must be between 1 and k-1).
|
||||
func SetMinimizerSize(m int) {
|
||||
_MinimizerSize = m
|
||||
}
|
||||
59
pkg/obitools/obisuperkmer/superkmer.go
Normal file
59
pkg/obitools/obisuperkmer/superkmer.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
)
|
||||
|
||||
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
|
||||
//
|
||||
// This function takes an iterator of BioSequence objects, extracts super k-mers
|
||||
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
|
||||
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
|
||||
//
|
||||
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
|
||||
// the same minimizer. The resulting BioSequences contain metadata including:
|
||||
// - minimizer_value: the canonical minimizer value
|
||||
// - minimizer_seq: the DNA sequence of the minimizer
|
||||
// - k: the k-mer size used
|
||||
// - m: the minimizer size used
|
||||
// - start: starting position in the original sequence
|
||||
// - end: ending position in the original sequence
|
||||
// - parent_id: ID of the parent sequence
|
||||
//
|
||||
// Parameters:
|
||||
// - iterator: an iterator yielding BioSequence objects to process.
|
||||
//
|
||||
// Returns:
|
||||
// - An iterator yielding BioSequence objects representing super k-mers.
|
||||
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
// Get k-mer and minimizer sizes from CLI options
|
||||
k := CLIKmerSize()
|
||||
m := CLIMinimizerSize()
|
||||
|
||||
// Validate parameters
|
||||
if m < 1 || m >= k {
|
||||
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||
}
|
||||
|
||||
if k < 2 || k > 31 {
|
||||
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||
|
||||
// Create the worker for super k-mer extraction
|
||||
worker := obikmer.SuperKmerWorker(k, m)
|
||||
|
||||
// Apply the worker to the iterator with parallel processing
|
||||
newIter := iterator.MakeIWorker(
|
||||
worker,
|
||||
false, // don't merge results
|
||||
obidefault.ParallelWorkers(),
|
||||
)
|
||||
|
||||
return newIter
|
||||
}
|
||||
149
pkg/obitools/obisuperkmer/superkmer_test.go
Normal file
149
pkg/obitools/obisuperkmer/superkmer_test.go
Normal file
@@ -0,0 +1,149 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
func TestCLIExtractSuperKmers(t *testing.T) {
|
||||
// Create a test sequence
|
||||
testSeq := obiseq.NewBioSequence(
|
||||
"test_seq",
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
||||
"",
|
||||
)
|
||||
|
||||
// Create a batch with the test sequence
|
||||
batch := obiseq.NewBioSequenceBatch()
|
||||
batch.Add(testSeq)
|
||||
|
||||
// Create an iterator from the batch
|
||||
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
||||
go func() {
|
||||
iterator.Push(batch)
|
||||
iterator.Close()
|
||||
}()
|
||||
|
||||
// Set test parameters
|
||||
SetKmerSize(15)
|
||||
SetMinimizerSize(7)
|
||||
|
||||
// Extract super k-mers
|
||||
result := CLIExtractSuperKmers(iterator)
|
||||
|
||||
// Count the number of super k-mers
|
||||
count := 0
|
||||
for result.Next() {
|
||||
batch := result.Get()
|
||||
for _, sk := range batch.Slice() {
|
||||
count++
|
||||
|
||||
// Verify that the super k-mer has the expected attributes
|
||||
if !sk.HasAttribute("minimizer_value") {
|
||||
t.Error("Super k-mer missing 'minimizer_value' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("minimizer_seq") {
|
||||
t.Error("Super k-mer missing 'minimizer_seq' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("k") {
|
||||
t.Error("Super k-mer missing 'k' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("m") {
|
||||
t.Error("Super k-mer missing 'm' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("start") {
|
||||
t.Error("Super k-mer missing 'start' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("end") {
|
||||
t.Error("Super k-mer missing 'end' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("parent_id") {
|
||||
t.Error("Super k-mer missing 'parent_id' attribute")
|
||||
}
|
||||
|
||||
// Verify attribute values
|
||||
k, _ := sk.GetIntAttribute("k")
|
||||
m, _ := sk.GetIntAttribute("m")
|
||||
|
||||
if k != 15 {
|
||||
t.Errorf("Expected k=15, got k=%d", k)
|
||||
}
|
||||
if m != 7 {
|
||||
t.Errorf("Expected m=7, got m=%d", m)
|
||||
}
|
||||
|
||||
parentID, _ := sk.GetStringAttribute("parent_id")
|
||||
if parentID != "test_seq" {
|
||||
t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if count == 0 {
|
||||
t.Error("No super k-mers were extracted")
|
||||
}
|
||||
|
||||
t.Logf("Extracted %d super k-mers from test sequence", count)
|
||||
}
|
||||
|
||||
func TestOptionGettersAndSetters(t *testing.T) {
|
||||
// Test initial values
|
||||
if CLIKmerSize() != 21 {
|
||||
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
|
||||
}
|
||||
if CLIMinimizerSize() != 11 {
|
||||
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
|
||||
}
|
||||
|
||||
// Test setters
|
||||
SetKmerSize(25)
|
||||
SetMinimizerSize(13)
|
||||
|
||||
if CLIKmerSize() != 25 {
|
||||
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
|
||||
}
|
||||
if CLIMinimizerSize() != 13 {
|
||||
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
|
||||
}
|
||||
|
||||
// Reset to defaults
|
||||
SetKmerSize(21)
|
||||
SetMinimizerSize(11)
|
||||
}
|
||||
|
||||
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
|
||||
// Create a longer test sequence
|
||||
longSeq := make([]byte, 1000)
|
||||
bases := []byte{'A', 'C', 'G', 'T'}
|
||||
for i := range longSeq {
|
||||
longSeq[i] = bases[i%4]
|
||||
}
|
||||
|
||||
testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")
|
||||
|
||||
// Set parameters
|
||||
SetKmerSize(21)
|
||||
SetMinimizerSize(11)
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
batch := obiseq.NewBioSequenceBatch()
|
||||
batch.Add(testSeq)
|
||||
|
||||
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
||||
go func() {
|
||||
iterator.Push(batch)
|
||||
iterator.Close()
|
||||
}()
|
||||
|
||||
result := CLIExtractSuperKmers(iterator)
|
||||
|
||||
// Consume the iterator
|
||||
for result.Next() {
|
||||
result.Get()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +1 @@
|
||||
4.4.10
|
||||
4.4.12
|
||||
|
||||
Reference in New Issue
Block a user