Branch to refactor and debug (AVLs bugged)

This commit is contained in:
Celine Mercier
2016-04-08 15:38:57 +02:00
parent edc4fd7b3e
commit 019dfc01b4
32 changed files with 1553 additions and 812 deletions

View File

@ -12,8 +12,6 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c
../../../src/encode.h ../../../src/encode.h
@ -28,3 +26,5 @@
../../../src/murmurhash2.h ../../../src/murmurhash2.h
../../../src/crc64.c ../../../src/crc64.c
../../../src/crc64.h ../../../src/crc64.h
../../../src/utils.c
../../../src/utils.h

View File

@ -14,7 +14,7 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c

View File

@ -14,7 +14,7 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c

View File

@ -14,7 +14,7 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c

View File

@ -14,7 +14,7 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c

View File

@ -14,7 +14,7 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c

View File

@ -10,19 +10,23 @@ from .capi.obitypes cimport OBISeq_NA, const_char_p
from obitools3.utils cimport str2bytes, bytes2str from obitools3.utils cimport str2bytes, bytes2str
from libc.stdlib cimport free
from libc.string cimport strcmp
cdef class OBIDMS_column_seq(OBIDMS_column): cdef class OBIDMS_column_seq(OBIDMS_column):
cpdef object get_line(self, index_t line_nb): cpdef object get_line(self, index_t line_nb):
cdef bytes value cdef char* value
cdef object result cdef object result
value = <bytes> obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0) value = obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0)
if obi_errno > 0 : if obi_errno > 0 :
raise IndexError(line_nb) raise IndexError(line_nb)
if value == OBISeq_NA : # TODO if strcmp(value, OBISeq_NA) == 0 :
result = None result = None
else : else :
result = bytes2str(value) result = bytes2str(value)
free(value)
return result return result
cpdef set_line(self, index_t line_nb, object value): cpdef set_line(self, index_t line_nb, object value):
@ -38,19 +42,20 @@ cdef class OBIDMS_column_seq(OBIDMS_column):
cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts): cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):
cpdef object get_item(self, index_t line_nb, str element_name): cpdef object get_item(self, index_t line_nb, str element_name):
cdef bytes value cdef char* value
cdef object result cdef object result
value = <bytes> obi_column_get_obiseq_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name)) value = obi_column_get_obiseq_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name))
if obi_errno > 0 : if obi_errno > 0 :
raise IndexError(line_nb, element_name) raise IndexError(line_nb, element_name)
if value == OBISeq_NA : if strcmp(value, OBISeq_NA) == 0 :
result = None result = None
else : else :
result = bytes2str(value) result = bytes2str(value)
free(value)
return result return result
cpdef object get_line(self, index_t line_nb) : cpdef object get_line(self, index_t line_nb) :
cdef bytes value cdef char* value
cdef object value_in_result cdef object value_in_result
cdef dict result cdef dict result
cdef index_t i cdef index_t i
@ -58,13 +63,14 @@ cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):
result = {} result = {}
all_NA = True all_NA = True
for i in range(self.nb_elements_per_line) : for i in range(self.nb_elements_per_line) :
value = <bytes> obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i) value = obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i)
if obi_errno > 0 : if obi_errno > 0 :
raise IndexError(line_nb) raise IndexError(line_nb)
if value == OBISeq_NA : if strcmp(value, OBISeq_NA) == 0 :
value_in_result = None value_in_result = None
else : else :
value_in_result = bytes2str(value) value_in_result = bytes2str(value)
free(value)
result[self.elements_names[i]] = value_in_result result[self.elements_names[i]] = value_in_result
if all_NA and (value_in_result is not None) : if all_NA and (value_in_result is not None) :
all_NA = False all_NA = False

View File

@ -14,7 +14,7 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c

View File

@ -10,19 +10,22 @@ from .capi.obitypes cimport OBIStr_NA, const_char_p
from obitools3.utils cimport str2bytes, bytes2str from obitools3.utils cimport str2bytes, bytes2str
from libc.string cimport strcmp
cdef class OBIDMS_column_str(OBIDMS_column): cdef class OBIDMS_column_str(OBIDMS_column):
cpdef object get_line(self, index_t line_nb): cpdef object get_line(self, index_t line_nb):
cdef bytes value cdef char* value
cdef object result cdef object result
value = <bytes> obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0) value = obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0)
if obi_errno > 0 : if obi_errno > 0 :
raise IndexError(line_nb) raise IndexError(line_nb)
if value == OBIStr_NA : # TODO if strcmp(value, OBIStr_NA) == 0 :
result = None result = None
else : else :
result = bytes2str(value) result = bytes2str(value)
# NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss)
return result return result
cpdef set_line(self, index_t line_nb, object value): cpdef set_line(self, index_t line_nb, object value):
@ -38,19 +41,20 @@ cdef class OBIDMS_column_str(OBIDMS_column):
cdef class OBIDMS_column_multi_elts_str(OBIDMS_column_multi_elts): cdef class OBIDMS_column_multi_elts_str(OBIDMS_column_multi_elts):
cpdef object get_item(self, index_t line_nb, str element_name): cpdef object get_item(self, index_t line_nb, str element_name):
cdef bytes value cdef char* value
cdef object result cdef object result
value = <bytes> obi_column_get_obistr_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name)) value = obi_column_get_obistr_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name))
if obi_errno > 0 : if obi_errno > 0 :
raise IndexError(line_nb, element_name) raise IndexError(line_nb, element_name)
if value == OBIStr_NA : if strcmp(value, OBIStr_NA) == 0 :
result = None result = None
else : else :
result = bytes2str(value) result = bytes2str(value)
# NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss)
return result return result
cpdef object get_line(self, index_t line_nb) : cpdef object get_line(self, index_t line_nb) :
cdef bytes value cdef char* value
cdef object value_in_result cdef object value_in_result
cdef dict result cdef dict result
cdef index_t i cdef index_t i
@ -58,13 +62,14 @@ cdef class OBIDMS_column_multi_elts_str(OBIDMS_column_multi_elts):
result = {} result = {}
all_NA = True all_NA = True
for i in range(self.nb_elements_per_line) : for i in range(self.nb_elements_per_line) :
value = <bytes> obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i) value = obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i)
if obi_errno > 0 : if obi_errno > 0 :
raise IndexError(line_nb) raise IndexError(line_nb)
if value == OBIStr_NA : if strcmp(value, OBIStr_NA) == 0 :
value_in_result = None value_in_result = None
else : else :
value_in_result = bytes2str(value) value_in_result = bytes2str(value)
# NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss)
result[self.elements_names[i]] = value_in_result result[self.elements_names[i]] = value_in_result
if all_NA and (value_in_result is not None) : if all_NA and (value_in_result is not None) :
all_NA = False all_NA = False

View File

@ -12,8 +12,8 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c
../../../src/encode.h ../../../src/encode.h

View File

@ -26,7 +26,7 @@ cdef class OBI_Seq(dict) :
self[bytes2str(DESCRIPTION_COLUMN)] = description self[bytes2str(DESCRIPTION_COLUMN)] = description
cpdef get_description(self) : cpdef get_description(self) :
return self.description return self.description # TODO no
cpdef get_sequence(self) : cpdef get_sequence(self) :
return self.sequence return self.sequence
@ -48,28 +48,25 @@ cdef class OBI_Nuc_Seq(OBI_Seq) :
cdef class OBI_Nuc_Seq_Stored(OBIView_line) : cdef class OBI_Nuc_Seq_Stored(OBIView_line) :
cpdef set_id(self, str id) : cpdef set_id(self, str id) :
self.id = id
self[bytes2str(ID_COLUMN)] = id self[bytes2str(ID_COLUMN)] = id
cpdef get_id(self) : cpdef get_id(self) :
return self.id return self[bytes2str(ID_COLUMN)]
cpdef set_description(self, str description) : cpdef set_description(self, str description) :
self.description = description
self[bytes2str(DESCRIPTION_COLUMN)] = description self[bytes2str(DESCRIPTION_COLUMN)] = description
cpdef get_description(self) : cpdef get_description(self) :
return self.description return self[bytes2str(DESCRIPTION_COLUMN)]
cpdef set_sequence(self, str sequence) : cpdef set_sequence(self, str sequence) :
self.sequence = sequence
self[bytes2str(NUC_SEQUENCE_COLUMN)] = sequence self[bytes2str(NUC_SEQUENCE_COLUMN)] = sequence
cpdef get_sequence(self) : cpdef get_sequence(self) :
return self.sequence return self[bytes2str(NUC_SEQUENCE_COLUMN)]
def __str__(self) : def __str__(self) :
return self.sequence # or not return self[bytes2str(NUC_SEQUENCE_COLUMN)] # or not
# cpdef str reverse_complement(self) : TODO in C ? # cpdef str reverse_complement(self) : TODO in C ?
# pass # pass

View File

@ -12,8 +12,8 @@
../../../src/obilittlebigman.c ../../../src/obilittlebigman.c
../../../src/obitypes.h ../../../src/obitypes.h
../../../src/obitypes.c ../../../src/obitypes.c
../../../src/private_at_functions.h ../../../src/utils.c
../../../src/private_at_functions.c ../../../src/utils.h
../../../src/obiavl.h ../../../src/obiavl.h
../../../src/obiavl.c ../../../src/obiavl.c
../../../src/encode.h ../../../src/encode.h

View File

@ -5,7 +5,7 @@ import time
from obitools3.obidms._obidms import OBIDMS from obitools3.obidms._obidms import OBIDMS
def bufferedRead(fileobj,size=100000000): def bufferedRead(fileobj,size=209715200): ## 200 MB
buffer = fileobj.readlines(size) buffer = fileobj.readlines(size)
while buffer: while buffer:
for l in buffer: for l in buffer:
@ -26,14 +26,16 @@ if __name__ == '__main__':
view = d.new_view('uniq view', view_type="NUC_SEQS_VIEW") view = d.new_view('uniq view', view_type="NUC_SEQS_VIEW")
# for i in range(35000000) : for i in range(35000000) :
# if (not (i%500000)) : if (not (i%500000)) :
# print(str(time.time())+'\t'+str(i)) print(str(time.time())+'\t'+str(i))
# id = "@HWI-D00405:142:C71BAANXX:4:1101:1234:2234_CONS_SUB_SUB_"+str(i) id = "@HWI-D00405:142:C71BAANXX:4:1101:1234:2234_CONS_SUB_SUB_"+str(i)
# view[i].set_id(id) view[i].set_id(id)
if id != view[i]["ID"] :
print("nope", id, view[i]["ID"])
input_file = open(args.input_file, 'r') # input_file = open(args.input_file, 'r')
input_file_buffered = bufferedRead(input_file) # input_file_buffered = bufferedRead(input_file)
# #
# if args.input_file[-1:] == "a" : # if args.input_file[-1:] == "a" :
@ -111,37 +113,37 @@ if __name__ == '__main__':
# l = 0 # l = 0
# next = False # next = False
# #
l=0 # l=0
i=0 # i=0
# while (True): # while (True):
# l+=1 # l+=1
# line = input_file.readline() # line = input_file.readline()
# if line=="": # if line=="":
# break # break
for line in input_file_buffered : # for line in input_file_buffered :
# #
# #if i > 1E7 : # #if i > 1E7 :
# # print('hmm?') # # print('hmm?')
# #
# #if i == 10000000 : # if i == 6000000 :
# # break # break
# #
if l%4 == 0 : # if l%4 == 0 :
# #
if (not (i%500000)) : # if (not (i%500000)) :
print(str(time.time())+'\t'+str(i)) # print(str(time.time())+'\t'+str(i))
# # # #
# # #print("header", line) # # #print("header", line)
# # # #
id = line.split(" ", 1)[0][1:] # id = line.split(" ", 1)[0][1:]
print(id) # print(id)
# # #rest = (line[:-1].split(" ", 1)[1]).split(";") # # #rest = (line[:-1].split(" ", 1)[1]).split(";")
view[i].set_id(id) # view[i].set_id(id)
# print(view[i]["ID"]) # print(view[i]["ID"])
# #
i+=1 # i+=1
l+=1 # l+=1
# #
# # description = "" # # description = ""
# # for j in range(len(rest)) : # # for j in range(len(rest)) :
@ -186,7 +188,7 @@ if __name__ == '__main__':
# l+=1 # l+=1
# #
# #
input_file.close() # input_file.close()
#print(view) #print(view)
print(view.__repr__()) print(view.__repr__())

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.python.org/simple/ --extra-index-url https://pypi.python.org/simple/
Cython>=0.21 Cython==0.23.5
Sphinx>=1.2.0 Sphinx>=1.2.0
ipython>=3.0.0 ipython>=3.0.0
breathe>=4.0.0 breathe>=4.0.0

View File

@ -122,6 +122,40 @@ static void setup_buckets(struct bloom * bloom, unsigned int cache_size)
} }
// TODO
/*
 * Computes the number of bytes needed for the bit array of a bloom filter
 * meant to hold `entries` elements with a false positive rate of `error`.
 *
 * The result is rounded up to a whole number of bytes, then padded up to a
 * multiple of BLOOM_BUCKET_SIZE_FALLBACK.
 *
 * NOTE(review): neither argument is validated here (error <= 0 or a product
 * too large for an int are not caught) - confirm that callers guarantee sane values.
 */
int bloom_filter_size(int entries, double error)
{
    // Bits needed per entry for the requested error rate: -ln(error) / ln(2)^2
    const double ln2_squared = 0.480453013918201;
    double bits_per_entry = -(log(error) / ln2_squared);

    int total_bits = (int)(((double)entries) * bits_per_entry);

    // Smallest whole number of bytes able to hold all the bits
    int size = (total_bits % 8) ? (total_bits / 8) + 1 : total_bits / 8;

    // Pad the size so that it is a multiple of the bucket size
    unsigned bucket_size = BLOOM_BUCKET_SIZE_FALLBACK;
    int overshoot = size % bucket_size;
    if (overshoot)
    {
        size += (bucket_size - overshoot);
    }

    return size;
}
int bloom_init_size(struct bloom * bloom, int entries, double error, int bloom_init_size(struct bloom * bloom, int entries, double error,
unsigned int cache_size) unsigned int cache_size)
{ {
@ -151,19 +185,21 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
setup_buckets(bloom, cache_size); setup_buckets(bloom, cache_size);
bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char)); // TODO comment
if (bloom->bf == NULL) { memset(bloom->bf, 0, bloom->bytes);
return 1; //bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char));
} //if (bloom->bf == NULL) {
// return 1;
//}
bloom->ready = 1; bloom->ready = 1;
return 0; return 0;
} }
int bloom_init(struct bloom * bloom, int entries, double error) int bloom_init(struct bloom * bloom, int entries) //, double error)
{ {
return bloom_init_size(bloom, entries, error, 0); return bloom_init_size(bloom, entries, BLOOM_FILTER_ERROR_RATE, 0);
} }

View File

@ -9,6 +9,10 @@
#define _BLOOM_H #define _BLOOM_H
// TODO
#define BLOOM_FILTER_ERROR_RATE (0.001)
/** *************************************************************************** /** ***************************************************************************
* On Linux, the code attempts to compute a bucket size based on CPU cache * On Linux, the code attempts to compute a bucket size based on CPU cache
* size info, if available. If that fails for any reason, this fallback size * size info, if available. If that fails for any reason, this fallback size
@ -60,10 +64,17 @@ struct bloom
unsigned bucket_bits_fast_mod_operand; unsigned bucket_bits_fast_mod_operand;
double bpe; double bpe;
unsigned char * bf;
int ready; int ready;
unsigned char bf[];
}; };
typedef struct bloom bloom_t;
// TODO
int bloom_filter_size(int entries, double error);
/** *************************************************************************** /** ***************************************************************************
* Initialize the bloom filter for use. * Initialize the bloom filter for use.
@ -91,7 +102,7 @@ struct bloom
* 1 - on failure * 1 - on failure
* *
*/ */
int bloom_init(struct bloom * bloom, int entries, double error); int bloom_init(struct bloom * bloom, int entries); //, double error);
/** *************************************************************************** /** ***************************************************************************

View File

@ -64,6 +64,12 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
length_b = ceil((double) length / (double) 4.0); length_b = ceil((double) length / (double) 4.0);
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t)); seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
if (seq_b == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for an encoded DNA sequence");
return NULL;
}
// Initialize all the bits to 0 // Initialize all the bits to 0
memset(seq_b, 0, length_b); memset(seq_b, 0, length_b);
@ -93,6 +99,7 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
seq_b[i/4] |= NUC_T_2b; seq_b[i/4] |= NUC_T_2b;
break; break;
default: default:
obi_set_errno(OBI_ENCODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])"); obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
return NULL; return NULL;
} }
@ -116,6 +123,12 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
uint8_t nuc; uint8_t nuc;
seq = (char*) malloc((length_seq+1) * sizeof(char)); seq = (char*) malloc((length_seq+1) * sizeof(char));
if (seq == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for a decoded DNA sequence");
return NULL;
}
for (i=0; i<length_seq; i++) for (i=0; i<length_seq; i++)
{ {
@ -138,6 +151,7 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
seq[i] = 't'; seq[i] = 't';
break; break;
default: default:
obi_set_errno(OBI_DECODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when decoding"); obidebug(1, "\nInvalid nucleotide base when decoding");
return NULL; return NULL;
} }
@ -159,6 +173,12 @@ byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
length_b = ceil((double) length / (double) 2.0); length_b = ceil((double) length / (double) 2.0);
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t)); seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
if (seq_b == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for an encoded DNA sequence");
return NULL;
}
// Initialize all the bits to 0 // Initialize all the bits to 0
memset(seq_b, 0, length_b); memset(seq_b, 0, length_b);
@ -232,6 +252,7 @@ byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
seq_b[i/2] |= NUC_N_4b; seq_b[i/2] |= NUC_N_4b;
break; break;
default: default:
obi_set_errno(OBI_ENCODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when encoding (not IUPAC)"); obidebug(1, "\nInvalid nucleotide base when encoding (not IUPAC)");
return NULL; return NULL;
} }
@ -255,6 +276,12 @@ char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
uint8_t nuc; uint8_t nuc;
seq = (char*) malloc((length_seq+1) * sizeof(char)); seq = (char*) malloc((length_seq+1) * sizeof(char));
if (seq == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for a decoded DNA sequence");
return NULL;
}
for (i=0; i<length_seq; i++) for (i=0; i<length_seq; i++)
{ {
@ -310,6 +337,7 @@ char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
seq[i] = 'n'; seq[i] = 'n';
break; break;
default: default:
obi_set_errno(OBI_DECODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when decoding"); obidebug(1, "\nInvalid nucleotide base when decoding");
return NULL; return NULL;
} }
@ -321,6 +349,111 @@ char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
} }
/*
 * Builds a byte array: a header describing a value, followed by a copy of the
 * encoded value itself.
 *
 * encoded_value         The encoded value to copy into the byte array.
 * element_size          The number of bits on which each element is encoded.
 * length_encoded_value  The length in bytes of the encoded value.
 * length_decoded_value  The length in bytes of the value once decoded.
 *
 * Returns a pointer to the newly allocated byte array, or NULL (with obi_errno
 * set to OBI_MALLOC_ERROR) if the allocation failed.
 * encoded_value is only read: it still belongs to the caller afterwards.
 */
Obi_byte_array_p obi_byte_array(byte_t* encoded_value, uint8_t element_size, int32_t length_encoded_value, int32_t length_decoded_value)
{
    // Header and encoded value are held in one single allocation
    Obi_byte_array_p new_array = malloc(sizeof(Obi_byte_array_t) + length_encoded_value);
    if (new_array == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);    // TODO
        obidebug(1, "\nError allocating memory for a byte array");
        return NULL;
    }

    // Copy the encoded value behind the header
    memcpy(new_array->value, encoded_value, length_encoded_value);

    // Fill in the header: bits per element, then encoded and decoded lengths in bytes
    new_array->element_size         = element_size;
    new_array->length_encoded_value = length_encoded_value;
    new_array->length_decoded_value = length_decoded_value;

    return new_array;
}
/*
 * Wraps a character string in a byte array.
 *
 * The string is stored as is, terminating \0 included, so the encoded and the
 * decoded lengths recorded in the header are the same.
 *
 * Returns the byte array built by obi_byte_array(), or NULL if it failed.
 */
Obi_byte_array_p obi_str_to_obibytes(char* value)
{
    // Number of bytes stored: the characters plus the final \0 (makes retrieving faster)
    int32_t nb_bytes = strlen(value) + 1;

    Obi_byte_array_p encoded = obi_byte_array(value, ELEMENT_SIZE_STR, nb_bytes, nb_bytes);
    if (encoded == NULL)
    {
        obidebug(1, "\nError encoding a character string in a byte array");
    }

    // NULL is handed back unchanged when the byte array could not be built
    return encoded;
}
/*
 * Returns the character string held in a byte array.
 *
 * No copy is made: the returned pointer refers to the value stored inside
 * value_b, so it is valid only as long as value_b is, and it must not be
 * freed on its own.
 *
 * This function no longer prints the value on stderr: the trace was a
 * debugging leftover that wrote every retrieved string to the error stream.
 */
const char* obi_obibytes_to_str(Obi_byte_array_p value_b)
{
    // The value is stored with its terminating \0 (see obi_str_to_obibytes),
    // so it can be handed back directly as a C string.
    return (const char*) value_b->value;
}
/*
 * Encodes a DNA sequence and wraps the result in a byte array.
 *
 * Sequences for which only_ATGC() holds are encoded on 2 bits per nucleotide,
 * all the others on 4 bits per nucleotide. The header of the byte array
 * records which encoding was used and the number of nucleotides.
 *
 * Returns the byte array, or NULL if the encoding or the creation of the
 * byte array failed.
 */
Obi_byte_array_p obi_seq_to_obibytes(char* seq)
{
    Obi_byte_array_p result;
    byte_t*          packed_seq;
    uint8_t          bits_per_nuc;
    int32_t          packed_length;    // length of the encoded sequence in bytes
    int32_t          nb_nucleotides;

    nb_nucleotides = strlen(seq);

    // Pick the encoding: 2 bits if the sequence is just ATGC, 4 bits otherwise
    if (only_ATGC(seq))
    {
        bits_per_nuc  = ELEMENT_SIZE_SEQ_2;
        packed_length = ceil((double) nb_nucleotides / (double) 4.0);
        packed_seq    = encode_seq_on_2_bits(seq, nb_nucleotides);
    }
    else
    {
        bits_per_nuc  = ELEMENT_SIZE_SEQ_4;
        packed_length = ceil((double) nb_nucleotides / (double) 2.0);
        packed_seq    = encode_seq_on_4_bits(seq, nb_nucleotides);
    }

    if (packed_seq == NULL)
        return NULL;

    result = obi_byte_array(packed_seq, bits_per_nuc, packed_length, nb_nucleotides);

    // The byte array holds its own copy, the temporary buffer is not needed anymore
    free(packed_seq);

    return result;
}
/*
 * Decodes the DNA sequence held in a byte array.
 *
 * The element size stored in the header selects the decoder: 2 means the
 * sequence was encoded on 2 bits, anything else is decoded as 4 bits.
 *
 * Returns whatever the decoder returns (NULL if it failed).
 * NOTE(review): the decoders appear to allocate the string they return, so
 * the caller presumably has to free it - confirm.
 */
const char* obi_obibytes_to_seq(Obi_byte_array_p value_b)
{
    // Anything not flagged as 2 bits goes through the 4 bits decoder
    if (value_b->element_size != 2)
    {
        return decode_seq_on_4_bits(value_b->value, value_b->length_decoded_value);
    }

    return decode_seq_on_2_bits(value_b->value, value_b->length_decoded_value);
}
// TODO same for int
///////////////////// FOR DEBUGGING /////////////////////////// ///////////////////// FOR DEBUGGING ///////////////////////////
//NOTE: The first byte is printed the first (at the left-most). //NOTE: The first byte is printed the first (at the left-most).

View File

@ -10,6 +10,10 @@
*/ */
#ifndef ENCODE_H_
#define ENCODE_H_
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
@ -18,8 +22,31 @@
#include "obitypes.h" #include "obitypes.h"
#define NUC_MASK_2B 0x3 /**< Binary: 11 to use when decoding 2 bits sequences */ #define NUC_MASK_2B 0x3 /**< Binary: 11 to use when decoding 2 bits sequences
#define NUC_MASK_4B 0xF /**< Binary: 1111 to use when decoding 4 bits sequences */ */
#define NUC_MASK_4B 0xF /**< Binary: 1111 to use when decoding 4 bits sequences
*/
#define ELEMENT_SIZE_STR (8) /**< The size of an element from a value of type character string.
*/
#define ELEMENT_SIZE_SEQ_2 (2) /**< The size of an element from a value of type DNA sequence encoded on 2 bits.
*/
#define ELEMENT_SIZE_SEQ_4 (4) /**< The size of an element from a value of type DNA sequence encoded on 4 bits.
*/
/**
* @brief Byte array structure.
*
* A fixed header describing an encoded value, immediately followed by the
* encoded value itself (flexible array member). The structure is therefore
* allocated with room for length_encoded_value extra bytes after the header.
*/
typedef struct Obi_byte_array {
uint8_t element_size; /**< Size in bits of one element from the value:
* ELEMENT_SIZE_STR for a character string,
* ELEMENT_SIZE_SEQ_2 or ELEMENT_SIZE_SEQ_4 for a DNA sequence.
*/
int32_t length_encoded_value; /**< Length in bytes of the encoded value,
* i.e. the number of bytes stored in the value field.
*/
int32_t length_decoded_value; /**< Length in bytes of the decoded value.
* For a DNA sequence, this is the number of nucleotides.
* For a character string, it is equal to length_encoded_value
* (the terminating \0 is counted).
*/
byte_t value[]; /**< Encoded value.
*/
} Obi_byte_array_t, *Obi_byte_array_p;
/** /**
@ -174,8 +201,70 @@ byte_t* encode_seq_on_4_bits(char* seq, int32_t length);
char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq); char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq);
/**
* @brief Converts a character string to a byte array with a header.
*
* @warning The byte array must be freed by the caller.
*
* @param value The character string to convert.
*
* @returns A pointer to the byte array created.
* @retval NULL if an error occurred.
*
* @since October 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
Obi_byte_array_p obi_str_to_obibytes(char* value);
/**
* @brief Converts a byte array to a character string.
*
* @param value_b The byte array to convert.
*
* @returns A pointer to the character string contained in the byte array.
*
* @since October 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_obibytes_to_str(Obi_byte_array_p value_b);
/**
* @brief Converts a DNA sequence to a byte array with a header.
*
* @warning The byte array must be freed by the caller.
*
* @param seq The DNA sequence to convert.
*
* @returns A pointer to the byte array created.
* @retval NULL if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
Obi_byte_array_p obi_seq_to_obibytes(char* seq);
/**
* @brief Converts a byte array to a DNA sequence.
*
* @param value_b The byte array to convert.
*
* @returns A pointer to the DNA sequence contained in the byte array.
* @retval NULL if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_obibytes_to_seq(Obi_byte_array_p value_b); // TODO move to encode source files
////////// FOR DEBUGGING /////////// ////////// FOR DEBUGGING ///////////
// little endian // little endian
void print_bits(void* ptr, int32_t length); void print_bits(void* ptr, int32_t length);
#endif /* ENCODE_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -25,26 +25,30 @@
#include "obidms.h" #include "obidms.h"
#include "obitypes.h" #include "obitypes.h"
#include "bloom.h" #include "bloom.h"
#include "utils.h"
#include "encode.h"
#define NODE_COUNT_PER_AVL (10000000) #define MAX_NB_OF_AVLS_IN_GROUP (100) /**< The maximum number of AVL trees in a group. // TODO discuss
*/
#define BLOOM_FILTER_ERROR_RATE (0.001) #define MAX_NODE_COUNT_PER_AVL (10000000) /**< The maximum number of nodes in an AVL tree.
* Only used to decide when to create a new AVL in a group, and to initialize the bloom filter // TODO discuss.
*/
#define MAX_DATA_SIZE_PER_AVL (1073741824) /**< The maximum size of the data referred to by an AVL tree in a group.
* Only used to decide when to create a new AVL in a group.
* Should not be greater than int32_t max (2,147,483,647), as indexes will have to be stored on 32 bits.
* Here 1073741824 B = 1 GB
*/
#define AVL_MAX_DEPTH (1024) /**< The maximum depth of an AVL tree. Used to save paths through the tree.
*/
#define AVL_MAX_NAME (1024) /**< The maximum length of an AVL tree name. #define AVL_MAX_NAME (1024) /**< The maximum length of an AVL tree name.
*/ */
#define AVL_GROWTH_FACTOR (2) /**< The growth factor when an AVL tree is enlarged. #define AVL_GROWTH_FACTOR (2) /**< The growth factor when an AVL tree is enlarged.
*/ */
#define AVL_MAX_DEPTH (1000) /**< The maximum depth of an AVL tree.
*/
#define LEFT_CHILD(node) (avl->tree)+(node->left_child) /**< Pointer to the left child of a node in an AVL tree. #define LEFT_CHILD(node) (avl->tree)+(node->left_child) /**< Pointer to the left child of a node in an AVL tree.
*/ */
#define RIGHT_CHILD(node) (avl->tree)+(node->right_child) /**< Pointer to the right child of a node in an AVL tree. #define RIGHT_CHILD(node) (avl->tree)+(node->right_child) /**< Pointer to the right child of a node in an AVL tree.
*/ */
#define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array.
*/
typedef struct bloom bloom_t;
/** /**
@ -59,7 +63,8 @@ typedef struct AVL_node {
*/ */
index_t value; /**< Index of the value associated with the node in the data array. index_t value; /**< Index of the value associated with the node in the data array.
*/ */
uint64_t crc64; // TODO uint64_t crc64; /**< Cyclic Redundancy Check code on 64 bits associated with the value.
*/
} AVL_node_t, *AVL_node_p; } AVL_node_t, *AVL_node_p;
@ -90,6 +95,8 @@ typedef struct OBIDMS_avl_data {
*/ */
byte_t* data; /**< A pointer to the beginning of the data. byte_t* data; /**< A pointer to the beginning of the data.
*/ */
int data_fd; /**< File descriptor of the file containing the data.
*/
} OBIDMS_avl_data_t, *OBIDMS_avl_data_p; } OBIDMS_avl_data_t, *OBIDMS_avl_data_p;
@ -111,7 +118,9 @@ typedef struct OBIDMS_avl_header {
*/ */
time_t creation_date; /**< Date of creation of the file. time_t creation_date; /**< Date of creation of the file.
*/ */
bloom_t bloom_filter; bloom_t bloom_filter; /**< Bloom filter associated with the AVL tree, enabling to know if a value
* might already be stored in the data associated with the tree.
*/
} OBIDMS_avl_header_t, *OBIDMS_avl_header_p; } OBIDMS_avl_header_t, *OBIDMS_avl_header_p;
@ -139,10 +148,10 @@ typedef struct OBIDMS_avl {
int dir_fd; /**< The file descriptor of the directory entry int dir_fd; /**< The file descriptor of the directory entry
* usable to refer and scan the AVL tree directory. * usable to refer and scan the AVL tree directory.
*/ */
int avl_fd; /**< The file descriptor of the file containing the AVL tree.
*/
size_t counter; /**< Indicates by how many threads/programs (TODO) the AVL tree is used. size_t counter; /**< Indicates by how many threads/programs (TODO) the AVL tree is used.
*/ */
int avl_fd;
int data_fd;
} OBIDMS_avl_t, *OBIDMS_avl_p; } OBIDMS_avl_t, *OBIDMS_avl_p;
@ -150,28 +159,26 @@ typedef struct OBIDMS_avl {
* @brief OBIDMS AVL tree group structure. * @brief OBIDMS AVL tree group structure.
*/ */
typedef struct OBIDMS_avl_group { typedef struct OBIDMS_avl_group {
// TODO put each group in a directory later OBIDMS_avl_p sub_avls[MAX_NB_OF_AVLS_IN_GROUP]; /**< Array containing the pointers to the AVL trees of the group.
OBIDMS_avl_p sub_avls[64]; // TODO macro for max */
int current_avl_idx; int current_avl_idx; /**< Index in the sub_avls array of the AVL tree currently being filled.
char avl_name[AVL_MAX_NAME+1]; */
OBIDMS_p dms; char avl_name[AVL_MAX_NAME+1]; /**< Base name of the AVL group. The AVL trees in it have names of the form basename_idx.
*/
OBIDMS_p dms; /**< Pointer to the OBIDMS structure to which the AVL group belongs.
*/
} OBIDMS_avl_group_t, *OBIDMS_avl_group_p; } OBIDMS_avl_group_t, *OBIDMS_avl_group_p;
OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name);
index_t insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value);
/** /**
* @brief Checks if an AVL tree already exists or not. * @brief Checks if an AVL tree or AVL tree group already exists or not.
* *
* @param dms The OBIDMS to which the AVL tree belongs. * @param dms The OBIDMS to which the AVL tree or AVL tree group belongs.
* @param avl_name The name of the AVL tree. * @param avl_name The name of the AVL tree or the base name of the AVL tree group.
* *
* @returns A value indicating whether the AVL tree exists or not. * @returns A value indicating whether the AVL tree or AVL tree group exists or not.
* @retval 1 if the AVL tree exists. * @retval 1 if the AVL tree or AVL tree group exists.
* @retval 0 if the AVL tree does not exist. * @retval 0 if the AVL tree or AVL tree group does not exist.
* @retval -1 if an error occurred. * @retval -1 if an error occurred.
* *
* @since December 2015 * @since December 2015
@ -180,36 +187,19 @@ index_t insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value);
int obi_avl_exists(OBIDMS_p dms, const char* avl_name); int obi_avl_exists(OBIDMS_p dms, const char* avl_name);
/**
* @brief Opens an AVL tree and creates it if it does not already exist.
*
* Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other
* file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure.
*
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree.
*
* @returns A pointer to the AVL tree structure.
* @retval NULL if an error occurred.
*
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_avl_p obi_avl(OBIDMS_p dms, const char* avl_name);
/** /**
* @brief Creates an AVL tree. Fails if it already exists. * @brief Creates an AVL tree. Fails if it already exists.
* *
* Note: An AVL tree is made of two files (referred to by two structures). * Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other * One file contains the indices referring to the data, and the other
* file contains the data itself. The AVL tree as a whole is referred * file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure. * to via the OBIDMS_avl structure. An AVL tree is stored in a directory
* with the same name, or with the base name of the AVL group if it is
* part of an AVL group.
* *
* @param dms The OBIDMS to which the AVL tree belongs. * @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree. * @param avl_name The name of the AVL tree.
* @param avl_idx The index of the AVL tree if it is part of an AVL group.
* *
* @returns A pointer to the newly created AVL tree structure. * @returns A pointer to the newly created AVL tree structure.
* @retval NULL if an error occurred. * @retval NULL if an error occurred.
@ -217,7 +207,7 @@ OBIDMS_avl_p obi_avl(OBIDMS_p dms, const char* avl_name);
* @since December 2015 * @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/ */
OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name); OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name, int avl_idx);
/** /**
@ -230,6 +220,7 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name);
* *
* @param dms The OBIDMS to which the AVL tree belongs. * @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree. * @param avl_name The name of the AVL tree.
* @param avl_idx The index of the AVL tree if it is part of an AVL group.
* *
* @returns A pointer to the AVL tree structure. * @returns A pointer to the AVL tree structure.
* @retval NULL if an error occurred. * @retval NULL if an error occurred.
@ -237,17 +228,66 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name);
* @since December 2015 * @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/ */
OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name); OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name, int avl_idx);
/**
* @brief Opens an AVL tree group and creates it if it does not already exist.
*
* Note: An AVL tree group is composed of multiple AVL trees that all have the
* same base name, and an index differentiating them.
*
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The base name of the AVL tree group.
*
* @returns A pointer to the AVL tree group structure.
* @retval NULL if an error occurred.
*
* @since April 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_avl_group_p obi_avl_group(OBIDMS_p dms, const char* avl_name);
/**
* @brief Creates an AVL tree group.
*
* Note: An AVL tree group is composed of multiple AVL trees that all have the
* same base name, and an index differentiating them.
*
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The base name of the AVL tree group.
*
* @returns A pointer to the AVL tree group structure.
* @retval NULL if an error occurred.
*
* @since April 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name);
/**
* @brief Opens an AVL tree group.
*
* Note: An AVL tree group is composed of multiple AVL trees that all have the
* same base name, and an index differentiating them.
*
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The base name of the AVL tree group.
*
* @returns A pointer to the AVL tree group structure.
* @retval NULL if an error occurred.
*
* @since April 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_avl_group_p obi_open_avl_group(OBIDMS_p dms, const char* avl_name);
/** /**
* @brief Closes an AVL tree. * @brief Closes an AVL tree.
* *
* Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other
* file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure.
*
* @param avl A pointer to the AVL tree structure to close and free. * @param avl A pointer to the AVL tree structure to close and free.
* *
* @retval 0 if the operation was successfully completed. * @retval 0 if the operation was successfully completed.
@ -260,26 +300,56 @@ int obi_close_avl(OBIDMS_avl_p avl);
/** /**
* @brief Adds a value (byte array) in an AVL tree, checking if it is already in it. * @brief Closes an AVL tree group.
* *
* @warning The byte array to add must already be encoded and contain its header. * @param avl_group A pointer to the AVL tree group structure to close and free.
*
* @retval 0 if the operation was successfully completed.
* @retval -1 if an error occurred.
*
* @since April 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_close_avl_group(OBIDMS_avl_group_p avl_group);
/**
* @brief Recovers a value (byte array) in an AVL tree.
*
* @warning The byte array recovered must be decoded to get the original value.
*
* @param avl A pointer to the AVL tree.
* @param index The index of the value in the data array.
*
* @returns A pointer to the byte array recovered.
*
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
Obi_byte_array_p obi_avl_get(OBIDMS_avl_p avl, index_t index);
/**
* @brief Adds a value (byte array) in an AVL tree NOT checking first if it is already in it. // TODO to discuss
*
* @warning The value given must already be encoded into a byte array structure (Obi_byte_array_t).
* *
* @param avl A pointer to the AVL tree. * @param avl A pointer to the AVL tree.
* @param value The byte array to add in the AVL tree. * @param value The byte array to add in the AVL tree.
* *
* @returns The index of the value, whether it was added or already in the AVL tree. * @returns The index of the value newly added in the AVL tree.
* @retval -1 if an error occurred. * @retval -1 if an error occurred.
* *
* @since December 2015 * @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/ */
index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value); index_t obi_avl_add(OBIDMS_avl_p avl, Obi_byte_array_p value);
/** /**
* @brief Finds a value (byte array) in an AVL tree, checking first if it is already in it. * @brief Finds a value (byte array) in an AVL tree.
* *
* @warning The byte array to add must already be encoded and contain its header. * @warning The value given must already be encoded into a byte array structure (Obi_byte_array_t).
* *
* @param avl A pointer to the AVL tree. * @param avl A pointer to the AVL tree.
* @param value The byte array to add in the AVL tree. * @param value The byte array to add in the AVL tree.
@ -290,86 +360,40 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value);
* @since December 2015 * @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/ */
index_t obi_avl_find(OBIDMS_avl_p avl, byte_t* value); index_t obi_avl_find(OBIDMS_avl_p avl, Obi_byte_array_p value);
/** /**
* @brief Recovers a value (byte array) in an AVL tree. * @brief Recovers a value (byte array) in an AVL tree.
* *
* @warning The byte array recovered is encoded and contains its header. * @warning The byte array recovered must be decoded to get the original value.
* *
* @param avl A pointer to the AVL tree. * @param avl_group A pointer to the AVL tree group.
* @param index The index of the value in the data array. * @param idx The index of the value in the data array.
* *
* @returns A pointer to the byte array recovered. * @returns A pointer to the byte array recovered.
* *
* @since December 2015 * @since April 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/ */
byte_t* obi_avl_get(OBIDMS_avl_p avl, index_t index); Obi_byte_array_p obi_avl_group_get(OBIDMS_avl_group_p avl_group, index_t idx);
/** /**
* @brief Converts a character string to a byte array with a header. * @brief Adds a value (byte array) in an AVL tree group, checking if it is already in it.
* *
* @warning The byte array must be freed by the caller. * @warning The value given must already be encoded into a byte array structure (Obi_byte_array_t).
* *
* @param value The character string to convert. * @param avl_group A pointer to the AVL tree group.
* @param value The byte array to add in the AVL tree group.
* *
* @returns A pointer to the byte array created. * @returns The index of the value newly added in the AVL tree group.
* @retval NULL if an error occurred. * @retval -1 if an error occurred.
* *
* @since October 2015 * @since April 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/ */
byte_t* obi_str_to_obibytes(char* value); index_t obi_avl_group_add(OBIDMS_avl_group_p avl_group, Obi_byte_array_p value);
/**
* @brief Converts a byte array to a character string.
*
* @param value_b The byte array to convert.
*
* @returns A pointer to the character string contained in the byte array.
*
* @since October 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_obibytes_to_str(byte_t* value_b);
/**
* @brief Converts a DNA sequence to a byte array with a header.
*
* @warning The byte array must be freed by the caller.
*
* @param value The DNA sequence to convert.
*
* @returns A pointer to the byte array created.
* @retval NULL if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
byte_t* obi_seq_to_obibytes(char* seq);
/**
* @brief Converts a byte array to a DNA sequence.
*
* @param value_b The byte array to convert.
*
* @returns A pointer to the DNA sequence contained in the byte array.
* @retval NULL if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_obibytes_to_seq(byte_t* value_b); // TODO move to encode source files
// TODO
byte_t* obi_avl_group_get(OBIDMS_avl_group_p avl_group, index_t idx);
#endif /* OBIAVL_H_ */ #endif /* OBIAVL_H_ */

View File

@ -23,7 +23,7 @@
#include "obierrno.h" #include "obierrno.h"
#include "obidebug.h" #include "obidebug.h"
#include "obidmscolumn.h" #include "obidmscolumn.h"
#include "private_at_functions.h" #include "utils.h"
#include "obilittlebigman.h" #include "obilittlebigman.h"

View File

@ -33,7 +33,7 @@
*/ */
#define MAX_NB_OPENED_COLUMNS (100) /**< The maximum number of columns open at the same time. #define MAX_NB_OPENED_COLUMNS (100) /**< The maximum number of columns open at the same time.
*/ */
#define MAX_NB_OPENED_AVL_TREES (100) /**< The maximum number of AVL trees open at the same time. #define MAX_NB_OPENED_AVL_TREES (1000) /**< The maximum number of AVL trees open at the same time.
*/ */

View File

@ -20,7 +20,7 @@
#include "obidms.h" #include "obidms.h"
#include "obidebug.h" #include "obidebug.h"
#include "obierrno.h" #include "obierrno.h"
#include "private_at_functions.h" #include "utils.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)

View File

@ -30,6 +30,7 @@
#include "obidebug.h" #include "obidebug.h"
#include "obilittlebigman.h" #include "obilittlebigman.h"
#include "obiavl.h" #include "obiavl.h"
#include "utils.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
@ -726,15 +727,15 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
// If the data type is OBI_STR or OBI_SEQ, the associated obi_avl is opened or created // If the data type is OBI_STR or OBI_SEQ, the associated obi_avl is opened or created
if ((returned_data_type == OBI_STR) || (returned_data_type == OBI_SEQ)) if ((returned_data_type == OBI_STR) || (returned_data_type == OBI_SEQ))
{ {
new_column->avl = obi_create_avl_group(dms, avl_name); new_column->avl = obi_avl_group(dms, avl_name);
// if (avl == NULL) TODO if (new_column->avl == NULL)
// { {
// obidebug(1, "\nError opening or creating the aVL tree associated with a column"); obidebug(1, "\nError opening or creating the AVL group associated with a column");
// munmap(new_column->header, header_size); munmap(new_column->header, header_size);
// close(column_file_descriptor); close(column_file_descriptor);
// free(new_column); free(new_column);
// return NULL; return NULL;
// } }
strncpy(header->avl_name, avl_name, AVL_MAX_NAME); strncpy(header->avl_name, avl_name, AVL_MAX_NAME);
} }
@ -762,7 +763,6 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
int column_file_descriptor; int column_file_descriptor;
size_t header_size; size_t header_size;
size_t i; size_t i;
OBIDMS_avl_p avl;
column = NULL; column = NULL;
@ -770,7 +770,7 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
column_directory = obi_open_column_directory(dms, column_name); column_directory = obi_open_column_directory(dms, column_name);
if (column_directory == NULL) if (column_directory == NULL)
{ {
//obidebug(1, "\nError opening a column directory structure"); //obidebug(1, "\nError opening a column directory structure"); // TODO
return NULL; return NULL;
} }
@ -879,8 +879,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
// If the data type is OBI_STR or OBI_SEQ, the associated AVL tree is opened // If the data type is OBI_STR or OBI_SEQ, the associated AVL tree is opened
if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ)) if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ))
{ {
avl = obi_avl(dms, (column->header)->avl_name); column->avl = obi_open_avl_group(dms, (column->header)->avl_name);
if (avl == NULL) if (column->avl == NULL)
{ {
obidebug(1, "\nError opening the AVL tree associated with a column"); obidebug(1, "\nError opening the AVL tree associated with a column");
munmap(column->header, header_size); munmap(column->header, header_size);
@ -888,7 +888,6 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
free(column); free(column);
return NULL; return NULL;
} }
//column->avl = avl; TODO
} }
close(column_file_descriptor); close(column_file_descriptor);
@ -964,18 +963,13 @@ OBIDMS_column_p obi_clone_column(OBIDMS_p dms,
} }
else if (clone_data && (line_selection != NULL)) else if (clone_data && (line_selection != NULL))
{ {
obidebug(1, "\nCloning data from line selection\n");
line_size = obi_sizeof((new_column->header)->stored_data_type) * (new_column->header)->nb_elements_per_line; line_size = obi_sizeof((new_column->header)->stored_data_type) * (new_column->header)->nb_elements_per_line;
fprintf(stderr, "\nline size = %ld\n", line_size);
for (i=0; i<((line_selection->header)->lines_used); i++) for (i=0; i<((line_selection->header)->lines_used); i++)
{ {
index = *(((index_t*) (line_selection->data)) + i); index = *(((index_t*) (line_selection->data)) + i);
fprintf(stderr, "\nindex = %lld, i = %lld\n", index, i);
memcpy((new_column->data)+(i*line_size), (column_to_clone->data)+(index*line_size), line_size); memcpy((new_column->data)+(i*line_size), (column_to_clone->data)+(index*line_size), line_size);
fprintf(stderr, "\nmemcpied\n");
} }
(new_column->header)->lines_used = (line_selection->header)->lines_used; (new_column->header)->lines_used = (line_selection->header)->lines_used;
obidebug(1, "\nCloned data from line selection\n");
} }
// Close column_to_clone // Close column_to_clone
@ -1022,12 +1016,12 @@ int obi_close_column(OBIDMS_column_p column)
} }
} }
// If the data type is OBI_STR or OBI_SEQ, the associated AVL tree is closed TODO // If the data type is OBI_STR or OBI_SEQ, the associated AVL group is closed
// if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ)) if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ))
// { {
// if (obi_close_avl(column->avl) < 0) if (obi_close_avl_group(column->avl) < 0)
// return -1; return -1;
// } }
// Munmap data // Munmap data
if (munmap(column->data, (column->header)->data_size) < 0) if (munmap(column->data, (column->header)->data_size) < 0)
@ -1045,10 +1039,10 @@ int obi_close_column(OBIDMS_column_p column)
return -1; return -1;
} }
free(column);
if (close_dir) if (close_dir)
obi_close_column_directory(column->column_directory); obi_close_column_directory(column->column_directory);
free(column);
} }
return 0; return 0;

View File

@ -28,8 +28,6 @@
#include "obiavl.h" #include "obiavl.h"
#define ONE_IF_ZERO(x) (((x)==0)?1:(x)) /**< If x is equal to 0, x takes the value 1.
*/
#define ELEMENTS_NAMES_MAX (2048) /**< The maximum length of the list of elements names. #define ELEMENTS_NAMES_MAX (2048) /**< The maximum length of the list of elements names.
*/ */
#define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged. #define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged.

View File

@ -33,7 +33,7 @@
int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value) int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value)
{ {
byte_t* value_b; Obi_byte_array_p value_b;
index_t idx; index_t idx;
// Check that the line number is not greater than the maximum allowed // Check that the line number is not greater than the maximum allowed
@ -56,18 +56,13 @@ int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb,
if ((line_nb+1) > (column->header)->lines_used) if ((line_nb+1) > (column->header)->lines_used)
(column->header)->lines_used = line_nb+1; (column->header)->lines_used = line_nb+1;
// Encode the value on a byte array with a header // Encode the value on a byte array with a header // TODO make function
value_b = obi_seq_to_obibytes(value); value_b = obi_seq_to_obibytes(value);
if (value_b == NULL) if (value_b == NULL)
return -1; return -1;
//if (strlen(value_b) == 0)
// fprintf(stderr, "\nPOUIC");
//fprintf(stderr, "\n>%s||%s", value, obi_obibytes_to_seq(value_b));
// Add in the AVL tree // Add in the AVL tree
idx = insert_in_avl_group(column->avl, value_b); idx = obi_avl_group_add(column->avl, value_b);
if (idx == -1) if (idx == -1)
return -1; return -1;
@ -120,7 +115,7 @@ int obi_column_set_obiseq_with_elt_idx_in_view(Obiview_p view, OBIDMS_column_p c
const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx) const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx)
{ {
index_t idx; index_t idx;
byte_t* value_b; Obi_byte_array_p value_b;
if ((line_nb+1) > ((column->header)->line_count)) if ((line_nb+1) > ((column->header)->line_count))
{ {

View File

@ -32,7 +32,7 @@
int obi_column_set_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value) int obi_column_set_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value)
{ {
byte_t* value_b; Obi_byte_array_p value_b;
index_t idx; index_t idx;
// Check that the line number is not greater than the maximum allowed // Check that the line number is not greater than the maximum allowed
@ -61,7 +61,7 @@ int obi_column_set_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb,
return -1; return -1;
// Add in the AVL tree // Add in the AVL tree
idx = insert_in_avl_group(column->avl, value_b); idx = obi_avl_group_add(column->avl, value_b);
if (idx == -1) if (idx == -1)
return -1; return -1;
@ -114,7 +114,7 @@ int obi_column_set_obistr_with_elt_idx_in_view(Obiview_p view, OBIDMS_column_p c
const char* obi_column_get_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx) const char* obi_column_get_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx)
{ {
index_t idx; index_t idx;
byte_t* value_b; Obi_byte_array_p value_b;
if ((line_nb+1) > ((column->header)->line_count)) if ((line_nb+1) > ((column->header)->line_count))
{ {

View File

@ -20,7 +20,7 @@
#include "obidmscolumndir.h" #include "obidmscolumndir.h"
#include "obidms.h" #include "obidms.h"
#include "private_at_functions.h" #include "utils.h"
#include "obierrno.h" #include "obierrno.h"
#include "obidebug.h" #include "obidebug.h"

View File

@ -104,6 +104,14 @@ extern int obi_errno;
*/ */
#define OBI_TAXONOMY_ERROR (22) /** Error while handling binary taxonomy files #define OBI_TAXONOMY_ERROR (22) /** Error while handling binary taxonomy files
*/ */
#define OBI_MALLOC_ERROR (23) /** Error while allocating memory
*/
#define OBI_ENCODE_ERROR (24) /** Error while encoding a value
*/
#define OBI_DECODE_ERROR (25) /** Error while decoding a value
*/
#define OBI_UTILS_ERROR (26) /** Error in a utils function
*/
/**@}*/ /**@}*/
#endif /* OBIERRNO_H_ */ #endif /* OBIERRNO_H_ */

View File

@ -22,7 +22,7 @@
#include "obierrno.h" #include "obierrno.h"
#include "obidebug.h" #include "obidebug.h"
#include "obidmscolumn.h" #include "obidmscolumn.h"
#include "private_at_functions.h" #include "utils.h"
#include "obilittlebigman.h" #include "obilittlebigman.h"
#include "obidmscolumn_idx.h" #include "obidmscolumn_idx.h"

View File

@ -1,15 +1,14 @@
/**************************************************************************** /****************************************************************************
* Private *at functions * * Utility functions *
****************************************************************************/ ****************************************************************************/
/** /**
* @file private_at_functions.c * @file utils.c
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date 15 June 2015 * @date 29 March 2016
* @brief Private replacement functions for *at functions. * @brief Code for utility functions.
*/ */
#include <fcntl.h> #include <fcntl.h>
#include <string.h> #include <string.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -19,7 +18,7 @@
#include <dirent.h> #include <dirent.h>
#include <unistd.h> #include <unistd.h>
#include "private_at_functions.h" #include "utils.h"
#include "obidebug.h" #include "obidebug.h"
#include "obierrno.h" #include "obierrno.h"
#include "obidms.h" #include "obidms.h"
@ -28,6 +27,13 @@
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**********************************************************************
*
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
*
**********************************************************************/
char* get_full_path(OBIDMS_p dms, const char* path_name) char* get_full_path(OBIDMS_p dms, const char* path_name)
{ {
char* full_path; char* full_path;
@ -35,17 +41,18 @@ char* get_full_path(OBIDMS_p dms, const char* path_name)
full_path = (char*) malloc((MAX_PATH_LEN)*sizeof(char)); full_path = (char*) malloc((MAX_PATH_LEN)*sizeof(char));
if (full_path == NULL) if (full_path == NULL)
{ {
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for the char* path to a file or directory"); obidebug(1, "\nError allocating memory for the char* path to a file or directory");
return NULL; return NULL;
} }
if (getcwd(full_path, MAX_PATH_LEN) == NULL) if (getcwd(full_path, MAX_PATH_LEN) == NULL)
{ {
obi_set_errno(OBI_UTILS_ERROR);
obidebug(1, "\nError getting the path to a file or directory"); obidebug(1, "\nError getting the path to a file or directory");
return NULL; return NULL;
} }
// TODO check errors?
strcat(full_path, "/"); strcat(full_path, "/");
strcat(full_path, dms->directory_name); strcat(full_path, dms->directory_name);
strcat(full_path, "/"); strcat(full_path, "/");
@ -66,7 +73,10 @@ DIR* opendir_in_dms(OBIDMS_p dms, const char* path_name)
directory = opendir(full_path); directory = opendir(full_path);
if (directory == NULL) if (directory == NULL)
{
obi_set_errno(OBI_UTILS_ERROR);
obidebug(1, "\nError opening a directory"); obidebug(1, "\nError opening a directory");
}
free(full_path); free(full_path);
@ -74,3 +84,24 @@ DIR* opendir_in_dms(OBIDMS_p dms, const char* path_name)
} }
/*
 * Counts the entries of a directory, skipping every entry whose name
 * starts with '.' (which covers ".", ".." and hidden entries).
 *
 * dir: the path of the directory to scan.
 *
 * Returns the number of entries counted, or -1 if the directory could not
 * be opened (obi_set_errno() is then called with OBI_UTILS_ERROR).
 */
int count_dir(char *dir)
{
	struct dirent *dp;
	DIR *fd;
	int count;

	count = 0;
	if ((fd = opendir(dir)) == NULL)
	{
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "Error opening a directory: %s\n", dir);
		return -1;
	}
	while ((dp = readdir(fd)) != NULL)
	{
		// Ignore ".", ".." and any other entry whose name begins with a dot
		if ((dp->d_name)[0] == '.')
			continue;
		count++;
	}

	// Release the directory stream: without this, each call leaks the
	// stream and the file descriptor that opendir() allocated.
	closedir(fd);

	return count;
}

View File

@ -1,25 +1,30 @@
/**************************************************************************** /****************************************************************************
* Header file for private *at functions * * Header file for utility functions *
****************************************************************************/ ****************************************************************************/
/** /**
* @file private_at_functions.h * @file utils.h
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date 15 June 2015 * @date 29 March 2016
* @brief Header file for the private replacement functions for *at functions. * @brief Header file for utility functions.
*/ */
#ifndef PRIVATE_OPENAT_H_ #ifndef UTILS_H_
#define PRIVATE_OPENAT_H_ #define UTILS_H_
#include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
#include "obidms.h" #include "obidms.h"
#define ONE_IF_ZERO(x) (((x)==0)?1:(x)) /**< If x is equal to 0, x takes the value 1.
*/
#define MAX_PATH_LEN 4096 /**< Maximum length for the character string defining a #define MAX_PATH_LEN 4096 /**< Maximum length for the character string defining a
file or directory path */ * file or directory path.
*/
/** /**
@ -56,4 +61,10 @@ char* get_full_path(OBIDMS_p dms, const char* path_name);
DIR* opendir_in_dms(OBIDMS_p dms, const char* path_name); DIR* opendir_in_dms(OBIDMS_p dms, const char* path_name);
#endif /* PRIVATEOPENAT_H_ */ /*
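* Counts the entries of a directory, skipping those whose name starts with '.'.
*
* dir: the path of the directory to scan.
*
* Returns the number of entries counted, or -1 if the directory could not be opened.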
*/
int count_dir(char *dir);
#endif /* UTILS_H_ */