Added a C function for LCS alignment of two columns; optimized the function

that aligns one column and fixed its line count bug
2016-12-16 19:39:02 +01:00
parent 303bd6f445
commit d99447c12b
2 changed files with 430 additions and 97 deletions
--- a/src/obi_align.h
+++ b/src/obi_align.h
@ -1,12 +1,12 @@
 /****************************************************************************
- * Sequence alignment functions header file	                                *
+ * LCS sequence alignment functions header file	                            *
 ****************************************************************************/

 /**
 * @file obi_align.h
 * @author Celine Mercier
 * @date May 11th 2016
- * @brief Header file for the functions handling the alignment of DNA sequences.
+ * @brief Header file for the functions handling the LCS alignment of DNA sequences.
 */


@ -55,7 +55,7 @@


 /**
- * @brief Aligns a NUC_SEQ column with itself.
+ * @brief Aligns an OBI_SEQ column with itself.
 *
 * Note: The columns where the results are written are automatically named and created.
 *
@ -96,14 +96,59 @@ int obi_lcs_align_one_column(OBIDMS_p dms,


 /**
- * @brief
+ * @brief Aligns two OBI_SEQ columns.
 *
- * TODO
+ * The columns must belong to the same OBIDMS, but can belong to different views.
 *
+ * Note: The columns where the results are written are automatically named and created.
+ *
+ * @param dms A pointer to an OBIDMS.
+ * @param seq1_view_name The name of the view where the first column to align is.
+ * @param seq2_view_name The name of the view where the second column to align is ("" if it is the same view as the first one).
+ * @param seq1_column_name The name of the first OBI_SEQ column to align, located in the first input view.
+ *                         If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned.
+ * @param seq2_column_name The name of the second OBI_SEQ column to align, located in the second input view.
+ *                         If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned.
+ * @param seq1_elt_name The name of the element in the first column corresponding to the sequence to align, if the column has multiple
+ *                      elements per line.
+ * @param seq2_elt_name The name of the element in the second column corresponding to the sequence to align, if the column has multiple
+ *                      elements per line.
+ * @param id1_column_name The name of the column in the first input view containing the identifiers of the sequences of the first column to align.
+ *                        If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is used.
+ * @param id2_column_name The name of the column in the second input view containing the identifiers of the sequences of the second column to align.
+ *                        If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is used.
+ * @param output_view_name The name of the output view where the results should be written (should not already exist).
+ * @param output_view_comments The comments that should be associated with the output view.
+ * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view.
+ * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view.
+ * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
+ * 					for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
+ * 					e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
+ * 					the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
+ *                  it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
+ * @param normalize Whether the score should be normalized with the reference sequence length.
+ * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length.
+ * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
+ *
+ * @returns A value indicating the success of the operation.
+ * @retval 0 if the operation was successfully completed.
+ * @retval -1 if an error occurred.
+ *
+ * @since December 2016
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
-//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2,
-//						  Obiview_p score_view, OBIDMS_column_p score_column,
-//						  double threshold, bool normalize, int reference, bool similarity_mode);
+int obi_lcs_align_two_columns(OBIDMS_p dms,
+							  const char* seq1_view_name,
+							  const char* seq2_view_name,
+							  const char* seq1_column_name,
+							  const char* seq2_column_name,
+							  const char* seq1_elt_name,
+							  const char* seq2_elt_name,
+							  const char* id1_column_name,
+							  const char* id2_column_name,
+					          const char* output_view_name, const char* output_view_comments,
+							  bool print_seq, bool print_count,
+						      double threshold, bool normalize, int reference, bool similarity_mode);


 #endif /* OBI_ALIGN_H_ */