Alignment: API rework. 'obi align' is now 'obi lcs', and the results are now
written to columns automatically created in the output view, all optimally
handled at the C level.
This commit is contained in:
Celine Mercier
2016-12-12 11:58:59 +01:00
parent fa4e4ffaff
commit 8afb1644e9
11 changed files with 579 additions and 272 deletions

View File

@ -1,120 +0,0 @@
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work
import time
# One-line description of this command.
__title__ = "Aligns one sequence column with itself or two sequence columns"

# Default configuration entries declared by this command.
default_config = {'inputview': None}
def addOptions(parser):
    """Register the command-line options of the alignment command.

    Two argument groups are added to ``parser``: one for the DMS and view
    names and one for the alignment settings. Each option is stored under a
    ``"<section>:<key>"`` destination (``obi:...`` or ``align:...``).

    :param parser: parser object providing ``add_argument_group``.
    """
    # TODO put this common group somewhere else but I don't know where
    dms_group = parser.add_argument_group('DMS and view options')

    dms_group.add_argument(
        '--default-dms', '-d',
        dest="obi:defaultdms",
        action="store",
        type=str,
        default=None,
        metavar='<DMS NAME>',
        help="Name of the default DMS for reading and writing data.",
    )
    dms_group.add_argument(
        '--input-view', '-i',
        dest="obi:inputview",
        action="store",
        type=str,
        default=None,
        metavar='<INPUT VIEW NAME>',
        help="Name of the input view.",
    )
    # TODO eventually 2nd view, or 2nd column?
    dms_group.add_argument(
        '--output-view', '-o',
        dest="obi:outputview",
        action="store",
        type=str,
        default=None,
        metavar='<OUTPUT VIEW NAME>',
        help="Name of the output view.",
    )

    align_group = parser.add_argument_group('obi align specific options')

    align_group.add_argument(
        '--lcs', '-C',
        dest="align:alitype",
        action="store",
        type=str,
        default='lcs',
        metavar='<ALIGNMENT TYPE>',
        help="Compute alignment using the LCS method (default).",
    )

    threshold_help = (
        "Score threshold. If the score is normalized and expressed in similarity (default),"
        " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
        " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
        " If the score is not normalized and expressed in similarity, it is the length of the"
        " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
        " it is (reference length - LCS length)."
        " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
        " (no threshold)."
    )
    align_group.add_argument(
        '--threshold', '-t',
        dest="align:threshold",
        action="store",
        type=float,
        default=0.0,
        metavar='<THRESHOLD>',
        help=threshold_help,
    )

    # Both reference-length flags write to the same destination; only the
    # stored constant differs (1: longest sequence, 2: shortest sequence).
    reflength_default = " Default: the reference length is the length of the alignment."
    align_group.add_argument(
        '--longest_length', '-L',
        dest="align:reflength",
        action="store_const",
        const=1,
        default=0,
        help="The reference length is the length of the longest sequence." + reflength_default,
    )
    align_group.add_argument(
        '--shortest_length', '-l',
        dest="align:reflength",
        action="store_const",
        const=2,
        default=0,
        help="The reference length is the length of the shortest sequence." + reflength_default,
    )

    # These two flags switch a default-True setting off.
    align_group.add_argument(
        '--raw', '-r',
        dest="align:normalize",
        action="store_false",
        default=True,
        help="Raw score, not normalized. Default: score is normalized with the reference sequence length.",
    )
    align_group.add_argument(
        '--distance', '-D',
        dest="align:similarity",
        action="store_false",
        default=True,
        help="Score is expressed in distance. Default: score is expressed in similarity.",
    )
def run(config):
    """Run the alignment command described by ``config``.

    Opens the DMS named by ``config['obi']['defaultdms']``, opens the input
    view, creates the output view, calls the input view's ``align`` method
    with the ``config['align']`` settings, prints the ``repr`` of the output
    view, and closes everything.

    The views and the DMS are closed in a ``finally`` clause, so they are
    released even when opening a view or the alignment itself raises.

    :param config: nested mapping with an ``'obi'`` section (``defaultdms``,
        ``inputview``, ``outputview``) and an ``'align'`` section
        (``threshold``, ``normalize``, ``reflength``, ``similarity``).
    """
    # Open DMS
    d = OBIDMS(config['obi']['defaultdms'])

    iview = None
    oview = None
    try:
        # Open input view 1
        iview = d.open_view(config['obi']['inputview'])

        # TODO Open input view 2 if there is one

        # Create output view
        oview = d.new_view(config['obi']['outputview'])

        # TODO Take other alignment types into account when they'll be implemented

        # Call cython alignment function
        iview.align(oview,
                    threshold=config['align']['threshold'],
                    normalize=config['align']['normalize'],
                    reference=config['align']['reflength'],
                    similarity_mode=config['align']['similarity'])

        print(repr(oview))
    finally:
        # Same closing order as before (input view, output view, DMS), but
        # now also executed on failure.
        # NOTE(review): on failure the output view is closed with whatever
        # was written so far - confirm that this is the desired behaviour.
        if iview is not None:
            iview.close()
        if oview is not None:
            oview.close()
        d.close()

    print("Done.")

View File

@ -0,0 +1,209 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.obidms._obidms cimport OBIDMS # TODO cimport doesn't work
from obitools3.utils cimport str2bytes
from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column
import time
# One-line description of this command.
__title__ = "Aligns one sequence column with itself or two sequence columns"

# Default configuration entries declared by this command.
default_config = {'inputview': None}
def addOptions(parser):
    """Register the command-line options of the LCS alignment command.

    Two argument groups are added to ``parser``: one naming the DMS, views,
    columns and elements to work on, and one for the alignment settings.
    Each option is stored under a ``"<section>:<key>"`` destination
    (``obi:...`` or ``align:...``).

    :param parser: parser object providing ``add_argument_group``.
    """
    # TODO put this common group somewhere else but I don't know where.
    # Also some options should probably be in another group

    # All options of the first group are plain string options; they are
    # described as (flags, dest, metavar, default, help) and registered in
    # this order.
    name_options = (
        (('--default-dms', '-d'), "obi:defaultdms", '<DMS NAME>', None,
         "Name of the default DMS for reading and writing data."),
        (('--input-view-1', '-i'), "obi:inputview1", '<INPUT VIEW NAME>', None,
         "Name of the (first) input view."),
        (('--input-view-2', '-I'), "obi:inputview2", '<INPUT VIEW NAME>', "",
         "Eventually, the name of the second input view."),
        (('--input-column-1', '-c'), "obi:inputcolumn1", '<INPUT COLUMN NAME>', "",
         "Name of the (first) input column. "
         " Default: the default nucleotide sequence column of the view if there is one."),
        (('--input-column-2', '-C'), "obi:inputcolumn2", '<INPUT COLUMN NAME>', "",
         "Eventually, the name of the second input column."),
        (('--input-elt-1', '-e'), "obi:inputelement1", '<INPUT ELEMENT NAME>', "",
         "If the first input column has multiple elements per line, name of the element referring to the sequence to align. "
         " Default: the first element of the line."),
        (('--input-elt-2', '-E'), "obi:inputelement2", '<INPUT ELEMENT NAME>', "",
         "If the second input column has multiple elements per line, name of the element referring to the sequence to align. "
         " Default: the first element of the line."),
        (('--id-column-1', '-f'), "obi:idcolumn1", '<ID COLUMN NAME>', "",
         "Name of the (first) column containing the identifiers of the sequences to align. "
         " Default: the default ID column of the view if there is one."),
        (('--id-column-2', '-F'), "obi:idcolumn2", '<ID COLUMN NAME>', "",
         "Eventually, the name of the second ID column."),
        (('--output-view', '-o'), "obi:outputview", '<OUTPUT VIEW NAME>', None,
         "Name of the output view."),
    )

    dms_group = parser.add_argument_group('DMS and view options')
    for flags, dest, metavar, default, help_text in name_options:
        dms_group.add_argument(*flags,
                               action="store",
                               dest=dest,
                               metavar=metavar,
                               default=default,
                               type=str,
                               help=help_text)

    lcs_group = parser.add_argument_group('obi lcs specific options')

    threshold_help = (
        "Score threshold. If the score is normalized and expressed in similarity (default),"
        " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
        " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
        " If the score is not normalized and expressed in similarity, it is the length of the"
        " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
        " it is (reference length - LCS length)."
        " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
        " (no threshold)."
    )
    lcs_group.add_argument('--threshold', '-t',
                           dest="align:threshold",
                           action="store",
                           type=float,
                           default=0.0,
                           metavar='<THRESHOLD>',
                           help=threshold_help)

    # Both reference-length flags write to the same destination; only the
    # stored constant differs (1: longest sequence, 2: shortest sequence).
    reflength_default = " Default: the reference length is the length of the alignment."
    lcs_group.add_argument('--longest-length', '-L',
                           dest="align:reflength",
                           action="store_const",
                           const=1,
                           default=0,
                           help="The reference length is the length of the longest sequence."
                                + reflength_default)
    lcs_group.add_argument('--shortest-length', '-l',
                           dest="align:reflength",
                           action="store_const",
                           const=2,
                           default=0,
                           help="The reference length is the length of the shortest sequence."
                                + reflength_default)

    # Flags that switch a default-True setting off.
    lcs_group.add_argument('--raw', '-r',
                           dest="align:normalize",
                           action="store_false",
                           default=True,
                           help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
    lcs_group.add_argument('--distance', '-D',
                           dest="align:similarity",
                           action="store_false",
                           default=True,
                           help="Score is expressed in distance. Default: score is expressed in similarity.")

    # Flags that switch a default-False setting on.
    lcs_group.add_argument('--print-seq', '-s',
                           dest="align:printseq",
                           action="store_true",
                           default=False,
                           help="The nucleotide sequences are written in the output view. Default: they are not written.")
    lcs_group.add_argument('--print-count', '-n',
                           dest="align:printcount",
                           action="store_true",
                           default=False,
                           help="Sequence counts are written in the output view. Default: they are not written.")
cpdef align(str dms_n,
            str input_view_1_n, str output_view_n,
            str input_view_2_n="",
            str input_column_1_n="", str input_column_2_n="",
            str input_elt_1_n="", str input_elt_2_n="",
            str id_column_1_n="", str id_column_2_n="",
            double threshold=0.0, bint normalize=True,
            int reference=0, bint similarity_mode=True,
            bint print_seq=False, bint print_count=False,
            comments=""):
    """
    Align one sequence column of a view with itself by calling the C function
    ``obi_lcs_align_one_column``.

    The DMS named ``dms_n`` is opened for the duration of the call and is
    always closed afterwards, including when the alignment fails.

    :param dms_n: name of the DMS to open.
    :param input_view_1_n: name of the input view.
    :param output_view_n: name of the output view.
    :param input_column_1_n: name of the input column.
    :param input_elt_1_n: name of the element to use in the input column.
    :param id_column_1_n: name of the column holding the sequence identifiers.
    :param input_view_2_n: accepted but currently unused (aligning two
        columns is not implemented yet).
    :param input_column_2_n: accepted but currently unused.
    :param input_elt_2_n: accepted but currently unused.
    :param id_column_2_n: accepted but currently unused.
    :param threshold: score threshold forwarded to the C function.
    :param normalize: forwarded to the C function; the command line sets it
        to False for raw (not normalized) scores.
    :param reference: reference length selector forwarded to the C function;
        the command line uses 0 (alignment length, default), 1 (longest
        sequence) or 2 (shortest sequence).
    :param similarity_mode: forwarded to the C function; the command line
        sets it to False to express scores as distances.
    :param print_seq: forwarded to the C function; command-line flag asking
        for the sequences to be written in the output view.
    :param print_count: forwarded to the C function; command-line flag asking
        for the sequence counts to be written in the output view.
    :param comments: comments string forwarded to the C function.
    :raises Exception: if the C function returns a negative value.
    """
    cdef OBIDMS d
    d = OBIDMS(dms_n)

    try:
        # Align 1 column (2 columns not implemented yet)
        if obi_lcs_align_one_column(d._pointer,
                                    str2bytes(input_view_1_n),
                                    str2bytes(input_column_1_n),
                                    str2bytes(input_elt_1_n),
                                    str2bytes(id_column_1_n),
                                    str2bytes(output_view_n),
                                    str2bytes(comments),
                                    print_seq,
                                    print_count,
                                    threshold, normalize, reference, similarity_mode) < 0:
            raise Exception("Error aligning sequences")
    finally:
        # Previously skipped when the alignment failed, leaving the DMS open.
        d.close()
def run(config):
    """Run the LCS alignment command described by ``config``.

    Reads the DMS, view, column, element and identifier names from
    ``config['obi']`` and the alignment settings from ``config['align']``,
    forwards them to ``align`` and prints ``Done.`` afterwards.

    :param config: nested mapping with an ``'obi'`` and an ``'align'`` section.
    """
    obi_options = config['obi']
    lcs_options = config['align']

    # TODO: Build formatted comments with all parameters etc
    # NOTE(review): this text still says "Obi align" although the option group
    # of this command is labelled 'obi lcs' - confirm whether it should change.
    comments = "Obi align"

    # Call cython alignment function
    align(obi_options['defaultdms'],
          obi_options['inputview1'],
          obi_options['outputview'],
          input_view_2_n=obi_options['inputview2'],
          input_column_1_n=obi_options['inputcolumn1'],
          input_column_2_n=obi_options['inputcolumn2'],
          input_elt_1_n=obi_options['inputelement1'],
          input_elt_2_n=obi_options['inputelement2'],
          id_column_1_n=obi_options['idcolumn1'],
          id_column_2_n=obi_options['idcolumn2'],
          threshold=lcs_options['threshold'],
          normalize=lcs_options['normalize'],
          reference=lcs_options['reflength'],
          similarity_mode=lcs_options['similarity'],
          print_seq=lcs_options['printseq'],
          print_count=lcs_options['printcount'],
          comments=comments)

    print("Done.")