refactor: improve de Bruijn graph traversal and longtig generation
- Refactored Node representation using compact bitfields for neighbor counts and nucleotides; added count_neighbors helper to compute_degrees() - Introduced StartIter iterator for unitig/longtig generation with revised traversal semantics (e.g., interior node marking) - Added nucleotide() accessor to Kmer type for 2-bit extraction at position i - Renamed unitig.rs → longtigs, updated CLI command and output filenames to reflect "longtig" - Extended extract_kmers() in scripts/compare.py with duplication statistics
This commit is contained in:
@@ -51,16 +51,21 @@ def iter_sequences(path: str):
|
||||
yield header, "".join(parts)
|
||||
|
||||
|
||||
def extract_kmers(path: str, k: int) -> set[str]:
|
||||
"""Return the set of canonical k-mers from all sequences in *path*."""
|
||||
kmers: set[str] = set()
|
||||
def extract_kmers(path: str, k: int) -> tuple[set[str], int]:
|
||||
"""Return (set of canonical k-mers, count of duplicated k-mers) from *path*.
|
||||
|
||||
A k-mer is duplicated if its canonical form appears more than once across
|
||||
all sequences in the file.
|
||||
"""
|
||||
counts: dict[str, int] = {}
|
||||
for _, seq in iter_sequences(path):
|
||||
# skip any k-mer that contains a character other than A, C, G or T
|
||||
for i in range(len(seq) - k + 1):
|
||||
kmer = seq[i : i + k]
|
||||
if all(c in "ACGT" for c in kmer):
|
||||
kmers.add(canonical(kmer))
|
||||
return kmers
|
||||
ck = canonical(kmer)
|
||||
counts[ck] = counts.get(ck, 0) + 1
|
||||
duplicated = sum(1 for v in counts.values() if v > 1)
|
||||
return set(counts), duplicated
|
||||
|
||||
|
||||
def main():
|
||||
@@ -81,16 +86,18 @@ def main():
|
||||
print()
|
||||
|
||||
print("reading A …", file=sys.stderr)
|
||||
set_a = extract_kmers(args.file_a, k)
|
||||
set_a, dup_a = extract_kmers(args.file_a, k)
|
||||
print("reading B …", file=sys.stderr)
|
||||
set_b = extract_kmers(args.file_b, k)
|
||||
set_b, dup_b = extract_kmers(args.file_b, k)
|
||||
|
||||
only_a = set_a - set_b
|
||||
only_b = set_b - set_a
|
||||
common = set_a & set_b
|
||||
|
||||
print(f"{'kmers in A':<25} {len(set_a):>12,}")
|
||||
print(f"{'duplicated in A':<25} {dup_a:>12,}")
|
||||
print(f"{'kmers in B':<25} {len(set_b):>12,}")
|
||||
print(f"{'duplicated in B':<25} {dup_b:>12,}")
|
||||
print(f"{'common':<25} {len(common):>12,}")
|
||||
print(f"{'only in A (lost)':<25} {len(only_a):>12,}")
|
||||
print(f"{'only in B (gained)':<25} {len(only_b):>12,}")
|
||||
|
||||
Reference in New Issue
Block a user