Testing RAM instead of mmap for blob alignment

obi import: fixed bug caused by new behaviour of StopIteration
exceptions in Python>=3.7
2023-11-29 16:24:31 +13:00 · 2023-09-21 17:47:40 +12:00 · 2023-05-15 11:35:42 +12:00 · 2023-05-12 17:04:21 +12:00 · 2023-05-08 17:48:50 +12:00
7 changed files with 148 additions and 26 deletions
--- a/python/obitools3/commands/ngsfilter.pyx
+++ b/python/obitools3/commands/ngsfilter.pyx
@ -24,6 +24,9 @@ from cpython.exc cimport PyErr_CheckSignals
 from io import BufferedWriter


+MAX_PAT_LEN = 31
+
+
 __title__="Assign sequence records to the corresponding experiment/sample based on DNA tags and primers"


@ -84,6 +87,8 @@ class Primer:
        @type direct:
        '''
        
+        assert len(sequence) <= MAX_PAT_LEN, "Primer %s is too long, 31 bp max" % sequence
+        
        assert sequence not in Primer.collection        \
            or Primer.collection[sequence]==taglength,  \
            "Primer %s must always be used with tags of the same length" % sequence
@ -271,7 +276,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):

    if not_aligned:
        sequences[1] = sequences[1].clone()
-        sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq             # used by alignpairedend tool
+        sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq        # used by alignpairedend tool
        sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality     # used by alignpairedend tool

    for seq in sequences:
@ -299,7 +304,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
            directmatch.append((p, p(seq, same_sequence=not new_seq, pattern=pattern), seq, p))
            new_seq = False
            pattern+=1
-    
+        
    # Choose match closer to the start of (one of the) sequence(s)
    directmatch = sorted(directmatch, key=sortMatch)
    all_direct_matches = directmatch
--- a/python/obitools3/commands/rm.pyx
+++ b/python/obitools3/commands/rm.pyx
@ -5,7 +5,9 @@ from obitools3.apps.config import logger
 from obitools3.dms import DMS
 from obitools3.apps.optiongroups import addMinimalInputOption
 from obitools3.dms.view.view cimport View
+from obitools3.utils cimport tostr
 import os
+import shutil
 
 __title__="Delete a view"

@ -30,15 +32,56 @@ def run(config):
    else: 
        raise NotImplementedError()
    
+    dms = input[0]
+    
    # Get the path to the view file to remove
-    path = input[0].full_path  # dms path
-    path+=b"/VIEWS/"
-    path+=view.name
-    path+=b".obiview"
+    path = dms.full_path  # dms path
+    view_path=path+b"/VIEWS/"
+    view_path+=view.name
+    view_path+=b".obiview"
+    
+    to_remove = {}
+    # For each column:
+    for col_alias in view.keys():
+        col = view[col_alias]
+        col_name = col.original_name
+        col_version = col.version
+        col_type = col.data_type
+        col_ref = (col_name, col_version)
+        # build file name and AVL file names
+        col_file_name = f"{tostr(path)}/{tostr(col.original_name)}.obicol/{tostr(col.original_name)}@{col.version}.odc"
+        if col_type in [b'OBI_CHAR', b'OBI_QUAL', b'OBI_STR', b'OBI_SEQ']:
+            avl_file_name = f"{tostr(path)}/OBIBLOB_INDEXERS/{tostr(col.original_name)}_{col.version}_indexer"
+        else:
+            avl_file_name = None
+        to_remove[col_ref] = [col_file_name, avl_file_name]
+    
+    # For each view:
+    do_not_remove = []
+    for vn in dms:
+        v = dms[vn]
+        # ignore the one being deleted
+        if v.name != view.name:
+            # check that none of the column is referenced, if referenced, remove from list to remove
+            cols = [(v[c].original_name, v[c].version) for c in v.keys()]
+            for col_ref in to_remove:
+                if col_ref in cols:
+                    do_not_remove.append(col_ref)
+    
+    for nr in do_not_remove:
+        to_remove.pop(nr)
    
    # Close the view and the DMS
    view.close()
    input[0].close(force=True)
    
-    # Rm
-    os.remove(path)
+    #print(to_remove)
+    
+    # rm AFTER view and DMS close
+    os.remove(view_path)
+    for col in to_remove:
+        os.remove(to_remove[col][0])
+        if to_remove[col][1] is not None:
+            shutil.rmtree(to_remove[col][1])
+        
+        
--- a/python/obitools3/parsers/fastq.pyx
+++ b/python/obitools3/parsers/fastq.pyx
@ -103,7 +103,11 @@ def fastqWithQualityIterator(lineiterator,
        yield seq

        read+=1
-        hline = next(i)
+        try:
+            hline = next(i)
+        except StopIteration:
+            return
+
    

 def fastqWithoutQualityIterator(lineiterator, 
@ -174,5 +178,7 @@ def fastqWithoutQualityIterator(lineiterator,
        yield seq
    
        read+=1
-        hline = next(i)
-        
+        try:
+            hline = next(i)
+        except StopIteration:
+            return        
--- a/python/obitools3/parsers/tab.pyx
+++ b/python/obitools3/parsers/tab.pyx
@ -99,7 +99,10 @@ def tabIterator(lineiterator,
        
        read+=1
        
-        line = next(iterator)
+        try:
+            line = next(iterator)
+        except StopIteration:
+            return 
    
        
        
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '1b21'
+serial= '1b25'

 version ="%d.%d.%s" % (major,minor,serial)
--- a/src/obi_ecopcr.c
+++ b/src/obi_ecopcr.c
@ -365,8 +365,6 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,

 	int32_t i;

-	// TODO add check for primer longer than MAX_PAT_LEN (32)
-
 	// Get sequence id
 	seq_id = obi_get_str_with_elt_idx_and_col_p_in_view(i_view, i_id_column, i_idx, 0);

@ -751,6 +749,20 @@ int obi_ecopcr(const char* i_dms_name,
 	o1c = complementPattern(o1);
 	o2c = complementPattern(o2);

+	// check for primers equal or longer than MAX_PAT_LEN (32)
+	if (strlen(primer1) >= MAX_PAT_LEN)
+	{
+		obi_set_errno(OBI_ECOPCR_ERROR);
+		obidebug(1, "\nError: first primer is too long, needs to be < 32bp (%s)", primer1);
+		return -1;
+	}
+	if (strlen(primer2) >= MAX_PAT_LEN)
+	{
+		obi_set_errno(OBI_ECOPCR_ERROR);
+		obidebug(1, "\nError: second primer is too long, needs to be < 32bp (%s)", primer2);
+		return -1;
+	}
+
 	// Open input DMS
 	i_dms = obi_open_dms(i_dms_name, false);
 	if (i_dms == NULL)
--- a/src/obi_lcs.c
+++ b/src/obi_lcs.c
@ -10,8 +10,9 @@
 */

 //#define OMP_SUPPORT // TODO
-#ifdef OMP_SUPPORT
+#ifdef _OPENMP
 #include <omp.h>
+#include <pthread.h>
 #endif

 #include <stdlib.h>
@ -407,10 +408,15 @@ int obi_lcs_align_one_column(const char* dms_name,
 	int             lcs_length;
 	int             ali_length;
 	Kmer_table_p    ktable;
+	Obi_blob_p      blob1_mapped;
+	Obi_blob_p   	blob2_mapped;
 	Obi_blob_p      blob1;
-	Obi_blob_p   	blob2;
+	Obi_blob_p      blob2;
+	int				blob1_size;
+	int             blob2_size;
 	int				lcs_min;
 	index_t         seq_elt_idx;
+	bool			stop = false;

 	OBIDMS_p        dms = NULL;
 	Obiview_p       seq_view = NULL;
@ -568,10 +574,16 @@ int obi_lcs_align_one_column(const char* dms_name,

 	seq_count = (seq_view->infos)->line_count;

-	#ifdef OMP_SUPPORT
-	omp_set_num_threads(thread_count);
-	#pragma omp parallel for
-	#endif
+//	#ifdef _OPENMP
+//	int max_threads = omp_get_max_threads();
+//	if ((thread_count == -1) || (thread_count > max_threads))
+//		thread_count = max_threads;
+//	omp_set_num_threads(thread_count);
+//	fprintf(stderr, "Running on %d thread(s)\n", thread_count);
+//	pthread_mutex_t mutex;
+//	if (pthread_mutex_init(&mutex, NULL) < 0)
+//		return -1;
+//	#endif

 	for (i=0; i < (seq_count - 1); i++)
 	{
@ -585,23 +597,45 @@ int obi_lcs_align_one_column(const char* dms_name,
 		id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);	// TODO Could there be multiple IDs per line?
 		// Get first sequence and its index
 		seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
-		blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
+		blob1_mapped = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
+		if (blob1_mapped == NULL)
+		{
+			obidebug(1, "\nError retrieving sequences to align");
+			return -1;
+		}
+		blob1_size = obi_blob_sizeof(blob1_mapped);
+		blob1 = (Obi_blob_p) malloc(blob1_size);
 		if (blob1 == NULL)
 		{
 			obidebug(1, "\nError retrieving sequences to align");
 			return -1;
 		}
+		blob1 = memcpy(blob1, blob1_mapped, blob1_size);

+//#pragma omp parallel shared(blob1_mapped, blob1, blob1_size, id1_idx, seq1_idx, stop, seq_count, output_view, k, ktable, \
+//		   idx1_column, idx2_column, i, id1_column, id2_column, print_seq, seq1_column, seq2_column, \
+//		   print_count, count1_column, count2_column, ali_length_column, lcs_length_column, score_column, reference, normalize, similarity_mode) \
+//		private(blob2_mapped, blob2, blob2_size, j, id2_idx, seq2_idx, count2, count1, score, ali_length, lcs_length)
+//{
+//		#pragma omp for schedule(dynamic, seq_count/thread_count + (seq_count % thread_count != 0))  // Avoid 0 which blocks the program
 		for (j=i+1; j < seq_count; j++)
 		{
 			// Get second sequence and its index
 			seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
-			blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
+			blob2_mapped = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
+			if (blob2_mapped == NULL)
+			{
+				obidebug(1, "\nError retrieving sequences to align");
+				stop = true;
+			}
+			blob2_size = obi_blob_sizeof(blob2_mapped);
+			blob2 = (Obi_blob_p) malloc(blob2_size);
 			if (blob2 == NULL)
 			{
 				obidebug(1, "\nError retrieving sequences to align");
-				return -1;
+				stop = true;
 			}
+			blob2 = memcpy(blob2, blob2_mapped, blob2_size);

 			// Check if the sequences are identical in a quick way (same index in the same indexer)
 			if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx))
@ -624,6 +658,8 @@ int obi_lcs_align_one_column(const char* dms_name,
 					score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length);
 			}

+			free(blob2);
+
 			if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
 			{	// Print result

@ -637,6 +673,10 @@ int obi_lcs_align_one_column(const char* dms_name,
 					count2 = obi_get_int_with_elt_idx_and_col_p_in_view(seq_view, i_count_column, j, 0);
 				}

+//				#ifdef _OPENMP
+//				if (pthread_mutex_lock(&mutex) < 0)
+//					stop = true;
+//				#endif
 				if (print_alignment_result(output_view, k,
 										   idx1_column, idx2_column, i, j,
 										   id1_column, id2_column, id1_idx, id2_idx,
@ -646,15 +686,24 @@ int obi_lcs_align_one_column(const char* dms_name,
 										   lcs_length_column, lcs_length,
 										   score_column, score,
 										   reference, normalize, similarity_mode) < 0)
-					return -1;
+					stop = true;
+//				#ifdef _OPENMP
+//				if (pthread_mutex_unlock(&mutex) < 0)
+//					stop = true;
+//				#endif

 				k++;
 			}
 		}
+//}
+		free(blob1);
 	}

 	fprintf(stderr,"\rDone : 100 %%         \n");

+	//fprintf(stderr,"\nstop=%d\n", stop);
+
+
 	// Close views
 	if (obi_save_and_close_view(seq_view) < 0)
 	{
@ -675,6 +724,11 @@ int obi_lcs_align_one_column(const char* dms_name,

 	free_kmer_tables(ktable, seq_count);

+//	#ifdef _OPENMP
+//	if (pthread_mutex_destroy(&mutex) < 0)
+//		return -1;
+//	#endif
+
 	return 0;
 }

@ -1087,4 +1141,3 @@ int obi_lcs_align_two_columns(const char* dms_name,

 	return 0;
 }
-
Author	SHA1	Message	Date
Celine Mercier	6819d04615	Testing RAM instead of mmap for blob alignment	2023-11-29 16:24:31 +13:00
Celine Mercier	8a1f844645	obi import: fixed bug caused by new behaviour of StopIteration exceptions in Python>=3.7	2023-09-21 17:47:40 +12:00
Celine Mercier	791ccfb92e	Fixed include bug in previous version and switch to version 3.0.1b24	2023-05-15 11:35:42 +12:00
Celine Mercier	1c9a906f5b	ngsfilter and ecopcr: now check for primers too long for apat library to handle (31bp max) and switch to version 3.0.1b23	2023-05-12 17:04:21 +12:00
Celine Mercier	55b2679b23	New command obi rm and switch to version 3.0.1b22	2023-05-08 17:48:50 +12:00