Push rxrxptuxltlp #12
@@ -24,3 +24,37 @@ Sauf qu'avec un index approximatif, les résultats seront approximatifs.
|
|||||||
--detail et --mismatch à implementer
|
--detail et --mismatch à implementer
|
||||||
|
|
||||||
- status : affiche le statut de l'index
|
- status : affiche le statut de l'index
|
||||||
|
|
||||||
|
## Problème biologique sur l'identification des contaminants
|
||||||
|
|
||||||
|
Exemple de reads problématiques:
|
||||||
|
```
|
||||||
|
>LH00534:161:22WMGWLT4:4:1101:45301:1420 {"coverage":{"gbbct":[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]},"kmer_count":117,"kmer_strict_matches":{"gbbct":117}}
|
||||||
|
GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGATTTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACGCTTGCATCCTCTGTATTACCGCGGCTGCTGG
|
||||||
|
```
|
||||||
|
|
||||||
|
Par blast match une quantité invréssemblable de genomes chloroplastique avec un match de 100% (6554 hits pour Streptophyta)
|
||||||
|
|
||||||
|
mais aussi une quantité de sequences importantes à des OTU bactériennes (uncutured bacteria 115 hits) aussi avec 100% de similarité.
|
||||||
|
|
||||||
|
```
|
||||||
|
Uncultured bacterium clone Otu01032 16S ribosomal RNA gene, partial sequence
|
||||||
|
Sequence ID: KX996137.1Length: 440Number of Matches: 1
|
||||||
|
Range 1: 153 to 303GenBankGraphics
|
||||||
|
Next Match
|
||||||
|
Previous Match
|
||||||
|
Alignment statistics for match #1 Score Expect Identities Gaps Strand
|
||||||
|
273 bits(302) 2e-69 151/151(100%) 0/151(0%) Plus/Minus
|
||||||
|
|
||||||
|
Query 1 GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGAT 60
|
||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||
|
Sbjct 303 GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGAT 244
|
||||||
|
|
||||||
|
Query 61 TTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACG 120
|
||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||
|
Sbjct 243 TTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACG 184
|
||||||
|
|
||||||
|
Query 121 CTTGCATCCTCTGTATTACCGCGGCTGCTGG 151
|
||||||
|
|||||||||||||||||||||||||||||||
|
||||||
|
Sbjct 183 CTTGCATCCTCTGTATTACCGCGGCTGCTGG 153
|
||||||
|
```
|
||||||
|
|||||||
@@ -230,16 +230,11 @@ fn process_chunk(
|
|||||||
let batch = QueryBatch::from_records(records, k, 6, 0.7);
|
let batch = QueryBatch::from_records(records, k, 6, 0.7);
|
||||||
let n_seqs = batch.ids.len();
|
let n_seqs = batch.ids.len();
|
||||||
|
|
||||||
let mut accs: Vec<SeqAcc> =
|
// Per-sequence s-mer result vectors in global sequence position order.
|
||||||
(0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
|
// All partitions fill into this structure before Findere is applied.
|
||||||
|
let mut seq_results: Vec<Vec<Option<Box<[u32]>>>> = batch.n_kmers.iter()
|
||||||
let mut cov: Vec<Vec<Vec<u32>>> = if detail {
|
.map(|&n| vec![None; n as usize])
|
||||||
batch.n_kmers.iter()
|
.collect();
|
||||||
.map(|&n| vec![vec![0u32; n as usize]; n_genomes])
|
|
||||||
.collect()
|
|
||||||
} else {
|
|
||||||
Vec::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
let by_part = batch.split_by_partition(n_partitions);
|
let by_part = batch.split_by_partition(n_partitions);
|
||||||
|
|
||||||
@@ -256,38 +251,60 @@ fn process_chunk(
|
|||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
let presence = force_presence || !with_counts;
|
|
||||||
let threshold = presence_threshold;
|
|
||||||
|
|
||||||
for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) {
|
for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) {
|
||||||
let filtered = apply_findere(sk_kmer_results, effective_z, n_genomes);
|
|
||||||
let descs = batch.map.get(*rsk).expect("rsk must be in map");
|
let descs = batch.map.get(*rsk).expect("rsk must be in map");
|
||||||
|
|
||||||
for desc in descs {
|
for desc in descs {
|
||||||
let acc = &mut accs[desc.seq_idx as usize];
|
let offset = desc.kmer_offset as usize;
|
||||||
|
let dst = &mut seq_results[desc.seq_idx as usize];
|
||||||
|
for (j, hit) in sk_kmer_results.iter().enumerate() {
|
||||||
|
dst[offset + j] = hit.as_ref().map(|r| r.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (local_pos, hit) in filtered.iter().enumerate() {
|
// Apply Findere on each complete sequence vector, then accumulate.
|
||||||
match hit {
|
let n_kmers_out: Vec<usize> = batch.n_kmers.iter()
|
||||||
None => {
|
.map(|&n| { let n = n as usize; if n >= effective_z { n - effective_z + 1 } else { 0 } })
|
||||||
if sk_kmer_results[local_pos].is_none() {
|
.collect();
|
||||||
acc.kmer_missing += 1;
|
|
||||||
}
|
let mut accs: Vec<SeqAcc> =
|
||||||
}
|
(0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
|
||||||
Some(row) => {
|
|
||||||
acc.kmer_count += 1;
|
let mut cov: Vec<Vec<Vec<u32>>> = if detail {
|
||||||
for (g, &v) in row.iter().enumerate() {
|
n_kmers_out.iter()
|
||||||
if v == 0 { continue; }
|
.map(|&n| vec![vec![0u32; n]; n_genomes])
|
||||||
let contribution = if presence {
|
.collect()
|
||||||
u32::from(v >= threshold)
|
} else {
|
||||||
} else {
|
Vec::new()
|
||||||
v
|
};
|
||||||
};
|
|
||||||
acc.genome_totals[g] += contribution;
|
let presence = force_presence || !with_counts;
|
||||||
if detail {
|
let threshold = presence_threshold;
|
||||||
let abs_pos = desc.kmer_offset as usize + local_pos;
|
|
||||||
cov[desc.seq_idx as usize][g][abs_pos] += contribution;
|
for seq_idx in 0..n_seqs {
|
||||||
}
|
let filtered = apply_findere(&seq_results[seq_idx], effective_z, n_genomes);
|
||||||
}
|
let acc = &mut accs[seq_idx];
|
||||||
|
|
||||||
|
for (pos, hit) in filtered.iter().enumerate() {
|
||||||
|
match hit {
|
||||||
|
None => {
|
||||||
|
if seq_results[seq_idx][pos].is_none() {
|
||||||
|
acc.kmer_missing += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(row) => {
|
||||||
|
acc.kmer_count += 1;
|
||||||
|
for (g, &v) in row.iter().enumerate() {
|
||||||
|
if v == 0 { continue; }
|
||||||
|
let contribution = if presence {
|
||||||
|
u32::from(v >= threshold)
|
||||||
|
} else {
|
||||||
|
v
|
||||||
|
};
|
||||||
|
acc.genome_totals[g] += contribution;
|
||||||
|
if detail {
|
||||||
|
cov[seq_idx][g][pos] += contribution;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user