Compare commits

..

6 Commits

Author SHA1 Message Date
72b3e5d872 switch to version 3.0.1b7 2021-04-07 10:31:54 +12:00
238e9f70f3 alignpairedend: fixed bug that would cut out sequence ends when it
should not have
2021-04-07 10:31:12 +12:00
e099a16624 small fixes 2021-04-07 10:28:02 +12:00
847c9c816d import: fixed count estimation for tabular files with header 2021-03-30 09:07:14 +13:00
6026129ca8 Fixes 101 2021-03-30 09:06:08 +13:00
169b6514b4 small doc fixes 2021-03-29 13:07:48 +13:00
19 changed files with 123 additions and 75 deletions

View File

@ -1,3 +1,9 @@
import codecs
def unescaped_str(arg_str):
return arg_str.encode('latin-1', 'backslashreplace').decode('unicode-escape')
def __addInputOption(optionManager):
optionManager.add_argument(
@ -133,7 +139,7 @@ def __addTabularOption(optionManager):
group.add_argument('--sep',
action="store", dest="obi:sep",
default="\t",
type=str,
type=unescaped_str,
help="Column separator")

View File

@ -26,7 +26,7 @@ import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Annotate sequences with their corresponding NCBI taxid found from the taxon scientific name."
__title__="Annotate sequences with their corresponding NCBI taxid found from the taxon scientific name"

View File

@ -19,7 +19,7 @@ import time
import sys
__title__="Aligns one sequence column with itself or two sequence columns"
__title__="Align one sequence column with itself or two sequence columns"
def addOptions(parser):

View File

@ -23,7 +23,7 @@ import os
from cpython.exc cimport PyErr_CheckSignals
__title__="Aligns paired-ended reads"
__title__="Align paired-ended reads"

View File

@ -16,7 +16,7 @@ import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Tag a set of sequences for PCR and sequencing errors identification"
__title__="Build a reference database for ecotag"
def addOptions(parser):

View File

@ -22,7 +22,7 @@ import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Concatenate views."
__title__="Concatenate views"
def addOptions(parser):

View File

@ -10,7 +10,7 @@ from obitools3.dms.capi.obiview cimport COUNT_COLUMN
from cpython.exc cimport PyErr_CheckSignals
__title__="Counts sequence records"
__title__="Count sequence records"
def addOptions(parser):

View File

@ -16,7 +16,7 @@ from io import BufferedWriter
from cpython.exc cimport PyErr_CheckSignals
__title__="Keep the N first lines of a view."
__title__="Keep the N first lines of a view"
def addOptions(parser):

View File

@ -52,7 +52,7 @@ from cpython.exc cimport PyErr_CheckSignals
from io import BufferedWriter
__title__="Imports sequences from different formats into a DMS"
__title__="Import sequences from different formats into a DMS"
default_config = { 'destview' : None,
@ -430,7 +430,7 @@ def run(config):
if pb is not None:
pb(i, force=True)
print("", file=sys.stderr)
logger("info", "Imported %d entries", i)
logger("info", "Imported %d entries", len(view))
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])

View File

@ -24,7 +24,7 @@ from cpython.exc cimport PyErr_CheckSignals
from io import BufferedWriter
__title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"
__title__="Assign sequence records to the corresponding experiment/sample based on DNA tags and primers"
def addOptions(parser):
@ -450,7 +450,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
sequences[1] = sequences[1][reversematch[1][2]:]
if not directmatch[0].forward:
sequences[1] = sequences[1].reverse_complement
sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq # used by alignpairedend tool
sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq # used by alignpairedend tool
sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality # used by alignpairedend tool
else:
sequences[0] = sequences[0][reversematch[1][2]:]

View File

@ -36,7 +36,7 @@ NULL_VALUE = {OBI_BOOL: OBIBool_NA,
OBI_STR: b""}
__title__="Sort view lines according to the value of a given attribute."
__title__="Sort view lines according to the value of a given attribute"
def addOptions(parser):

View File

@ -16,7 +16,7 @@ import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Compute basic statistics for attribute values."
__title__="Compute basic statistics for attribute values"
'''
`obi stats` computes basic statistics for attribute values of sequence records.

View File

@ -15,7 +15,7 @@ from cpython.exc cimport PyErr_CheckSignals
from io import BufferedWriter
__title__="Keep the N last lines of a view."
__title__="Keep the N last lines of a view"
def addOptions(parser):

View File

@ -39,7 +39,7 @@ COL_COMMENTS_MAX_LEN = 2048
MAX_INT = 2147483647 # used to generate random float values
__title__="Tests if the obitools are working properly"
__title__="Test if the obitools are working properly"
default_config = {

View File

@ -389,10 +389,7 @@ def open_uri(uri,
sep = tobytes(qualifiers[b"sep"][0][0])
else:
try:
sep = config["obi"]["sep"]
if sep == '\\t': # dirty workaround for flake8(?) issue that reads '\t' as '\'+'t' when parsing the option value
sep = '\t'
sep = tobytes(sep)
sep = tobytes(config["obi"]["sep"])
except KeyError:
sep=None
@ -568,6 +565,6 @@ def open_uri(uri,
entry_count = -1
if input:
entry_count = count_entries(file, format)
entry_count = count_entries(file, format, header)
return (file, iseq, objclass, urib, entry_count)

View File

@ -3,7 +3,7 @@
from obitools3.dms.capi.obitypes cimport obitype_t, index_t
cpdef bytes format_uniq_pattern(bytes format)
cpdef int count_entries(file, bytes format)
cpdef int count_entries(file, bytes format, bint header)
cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)

View File

@ -40,7 +40,7 @@ cpdef bytes format_uniq_pattern(bytes format):
return None
cpdef int count_entries(file, bytes format):
cpdef int count_entries(file, bytes format, bint header):
try:
sep = format_uniq_pattern(format)
@ -75,6 +75,8 @@ cpdef int count_entries(file, bytes format):
total_count += len(re.findall(sep, mmapped_file))
if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq":
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
if format == b"tabular" and header: # not counting header as an entry
total_count -= 1
except:
if len(files) > 1:

View File

@ -1,5 +1,5 @@
major = 3
minor = 0
serial= '1b6'
serial= '1b7'
version ="%d.%d.%s" % (major,minor,serial)

View File

@ -77,7 +77,6 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
int32_t* shift_count_array;
Obi_ali_p ali = NULL;
int i, j;
bool switched_seqs;
bool reversed;
int score = 0;
Obi_blob_p blob1 = NULL;
@ -124,6 +123,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
bool keep_seq2_start;
bool keep_seq1_end;
bool keep_seq2_end;
bool left_ali;
bool rev_quals = false;
// Check kmer size
if ((kmer_size < 1) || (kmer_size > 4))
@ -148,19 +149,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
}
// Choose the shortest sequence to save kmer positions in array
switched_seqs = false;
len1 = blob1->length_decoded_value;
len2 = blob2->length_decoded_value;
if (len2 < len1)
{
switched_seqs = true;
temp_len = len1;
len1 = len2;
len2 = temp_len;
temp_blob = blob1;
blob1 = blob2;
blob2 = temp_blob;
}
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
if (blob1->element_size == 4)
@ -196,7 +186,47 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
else
reversed = false;
if (reversed)
switched_seqs = !switched_seqs;
// unreverse to make cases simpler. Costly but rare (direct match is reverse primer match)
{
if (seq2 == NULL)
seq2 = obi_blob_to_seq(blob2);
seq2 = reverse_complement_sequence(seq2);
blob2 = obi_seq_to_blob(seq2);
if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1);
seq1 = reverse_complement_sequence(seq1);
blob1 = obi_seq_to_blob(seq1);
free_blob1 = true;
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
}
uint8_t* newqual1 = malloc(qual1_len*sizeof(uint8_t));
uint8_t* newqual2 = malloc(qual2_len*sizeof(uint8_t));
for (i=0;i<qual1_len;i++)
newqual1[i] = qual1[qual1_len-1-i];
for (i=0;i<qual2_len;i++)
newqual2[i] = qual2[qual2_len-1-i];
qual1 = newqual1;
qual2 = newqual2;
rev_quals = true;
}
// Save total length for the shift counts array
total_len = len1 + len2 + 1; // +1 for shift 0
@ -237,7 +267,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
return NULL;
}
}
else if (len1 >= shift_array_height)
else if (total_len >= shift_array_height)
{
shift_array_height = total_len;
*shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
@ -291,7 +321,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
*shift_array_height_p = shift_array_height;
*shift_count_array_length_p = shift_count_array_length;
// Fill array with positions of kmers in the shortest sequence
// Fill array with positions of kmers in the first sequence
encoding = blob1->element_size;
kmer_count = len1 - kmer_size + 1;
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
@ -310,7 +340,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
}
// Compare positions of kmers between both sequences and store shifts
// Compare positions of kmers between both sequences and store shifts (a shift corresponds to pos2 - pos1)
kmer_count = blob2->length_decoded_value - kmer_size + 1;
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
{
@ -374,35 +404,42 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
// The 873863 cases of hell
if (best_shift > 0)
{
left_ali = false;
overlap_len = len2 - best_shift;
if (len1 <= overlap_len)
{
overlap_len = len1;
if (! switched_seqs)
keep_seq2_end = true;
else
keep_seq2_start = true;
}
else if (switched_seqs)
{
keep_seq2_start = true;
keep_seq1_end = true;
keep_seq2_end = true;
}
}
else if (best_shift < 0)
{
left_ali = true;
overlap_len = len1 + best_shift;
if (!switched_seqs)
{
keep_seq1_start = true;
keep_seq2_end = true;
}
}
else
{
overlap_len = len1;
if ((!switched_seqs) && (len2 > len1))
if (len2 <= overlap_len)
{
overlap_len = len2;
keep_seq1_start = true;
}
else
{
keep_seq1_start = true;
keep_seq2_end = true;
}
}
else // if (best_shift == 0)
{
if (len2 >= len1)
{
overlap_len = len1;
keep_seq2_end = true;
left_ali = true;
}
else
{
overlap_len = len2;
left_ali = false; // discussable
}
}
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
@ -433,7 +470,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
ali->direction[0] = '\0';
else
{
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
if (left_ali)
strcpy(ali->direction, "left");
else
strcpy(ali->direction, "right");
@ -442,28 +479,28 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
// Build the consensus sequence if asked
if (build_consensus)
{
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
if (! rev_quals)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
}
}
// Decode the first sequence if not already done
if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1);
if (! switched_seqs)
consensus_len = len2 - best_shift;
else
consensus_len = len1 + best_shift;
consensus_len = len2 - best_shift;
// Allocate memory for consensus sequence
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
@ -557,6 +594,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
free(seq2);
free(blob2);
if (rev_quals)
{
free(qual1);
free(qual2);
}
return ali;
}