Patch decoding of URL

This commit is contained in:
2017-07-28 12:41:28 +02:00
parent 84bb93096f
commit b9c65a871f
9 changed files with 276 additions and 260 deletions

View File

@ -2,6 +2,8 @@
# TODO: cimport generates errors with argument numbers, but without them some variables can't be declared
import sys
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
@ -20,6 +22,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
__title__="Imports sequences from different formats into a DMS"
@ -30,83 +34,14 @@ default_config = { 'destview' : None,
'skiperror' : False,
'seqinformat' : None,
'moltype' : 'nuc',
'filename' : None
'source' : None
}
def addOptions(parser):
    """Register the command-line options of the ``obi import`` command.

    The function adds one optional positional argument (the input file
    name), delegates to ``addSequenceInputOption`` and
    ``addMinimalOutputOption``, and then declares the options grouped under
    'obi import specific options'.

    Each ``dest`` is written as ``'<section>:<key>'`` (for example
    ``'import:skip'``).

    :param parser: argument parser exposing ``add_argument`` and
                   ``add_argument_group`` (argparse-style); it is modified
                   in place.
    :return: None
    """
    parser.add_argument(dest='import:filename',
                        metavar='<FILENAME>',
                        nargs='?',
                        default=None,
                        help='Name of the sequence file to import')

    # NOTE(review): this listing comes from a flattened diff view; the shared
    # helpers below may register some of the same options that are declared
    # explicitly further down -- confirm against the real file.
    addSequenceInputOption(parser)
    addMinimalOutputOption(parser)

    group = parser.add_argument_group('obi import specific options')

    group.add_argument('--default-dms', '-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data")

    # The help text used to be a copy of the --default-dms one.
    group.add_argument('--destination-view', '-v',
                       action="store", dest="import:destview",
                       metavar='<VIEW NAME>',
                       default=None,
                       type=str,
                       required=True,
                       help="Name of the view created in the DMS to receive "
                            "the imported sequences")

    group.add_argument('--skip',
                       action="store", dest="import:skip",
                       metavar='<N>',
                       default=0,
                       type=int,
                       help="Skip the N first sequences")

    group.add_argument('--only',
                       action="store", dest="import:only",
                       metavar='<N>',
                       default=None,
                       type=int,
                       help="Treat only N sequences")

    # default=None (rather than False) is kept as in the original.
    group.add_argument('--skip-on-error',
                       action="store_true", dest="import:skiperror",
                       default=None,
                       help="Skip sequence entries with parse error")

    # --fasta and --fastq share one destination: the last flag given wins.
    group.add_argument('--fasta',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fasta',
                       help="Input file is in fasta nucleic format (including obitools fasta extensions)")

    group.add_argument('--fastq',
                       action="store_const", dest="import:seqinformat",
                       default=None,
                       const='fastq',
                       help="Input file is in sanger fastq nucleic format (standard fastq)")

    # --nuc and --prot share one destination; --prot stores the value 'pep'.
    group.add_argument('--nuc',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='nuc',
                       help="Input file contains nucleic sequences")

    group.add_argument('--prot',
                       action="store_const", dest="import:moltype",
                       default=None,
                       const='pep',
                       help="Input file contains protein sequences")

    group.add_argument('--NA',
                       action="store", dest="import:NA",
                       metavar='<NA_value>',
                       default='NA',
                       type=str,
                       help="Character string for Not Available values in the input file "
                            "(default: 'NA')")
def run(config):
@ -142,147 +77,159 @@ def run(config):
cdef ProgressBar pb
global obi_errno
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
inputs = uopen(config['import']['filename'])
# Create or open DMS
d = DMS.open_or_new(config['obi']['defaultdms'])
get_quality = False
NUC_SEQS_view = False
if config['import']['seqinformat']=='fasta':
get_quality = False
NUC_SEQS_view = True
iseq = fastaIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
elif config['import']['seqinformat']=='fastq':
get_quality = True
NUC_SEQS_view = True
iseq = fastqIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
else:
raise RuntimeError('File format not handled')
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view["ID"]
def_col = view["DEFINITION"]
seq_col = view["NUC_SEQ"]
if get_quality :
qual_col = view["QUALITY"]
logger=config['obi']['logger']
dcols = {}
i = 0
for seq in iseq :
if i == config['import']['only'] :
break
else :
pb(i)
if NUC_SEQS_view :
id_col[i] = seq['id']
def_col[i] = seq['definition']
seq_col[i] = seq['sequence']
if get_quality :
qual_col[i] = seq['quality']
for tag in seq['tags'] :
value = seq['tags'][tag]
# Check NA value
if value == config['import']['NA'] :
value = None
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Fill value
dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
logger.info("obi import : imports file into an DMS")
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names),
value_obitype)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
i+=1
print("\n")
print(view.__repr__())
d.close()
inputs = open_uri(config['obi']['inputURI'])
print(inputs)
sys.exit()
# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
#
# inputs = uopen(config['import']['filename'])
#
# # Create or open DMS
# d = DMS.open_or_new(config['obi']['defaultdms'])
#
# get_quality = False
# NUC_SEQS_view = False
# if config['import']['seqinformat']=='fasta':
# get_quality = False
# NUC_SEQS_view = True
# iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# elif config['import']['seqinformat']=='fastq':
# get_quality = True
# NUC_SEQS_view = True
# iseq = fastqIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# else:
# raise RuntimeError('File format not handled')
#
# # Save basic columns in variables for optimization
# if NUC_SEQS_view :
# id_col = view["ID"]
# def_col = view["DEFINITION"]
# seq_col = view["NUC_SEQ"]
# if get_quality :
# qual_col = view["QUALITY"]
#
# dcols = {}
#
# i = 0
# for seq in iseq :
# if i == config['import']['only'] :
# break
# else :
# pb(i)
# if NUC_SEQS_view :
# id_col[i] = seq['id']
# def_col[i] = seq['definition']
# seq_col[i] = seq['sequence']
# if get_quality :
# qual_col[i] = seq['quality']
#
# for tag in seq['tags'] :
#
# value = seq['tags'][tag]
#
# # Check NA value
# if value == config['import']['NA'] :
# value = None
#
# if tag not in dcols :
#
# value_type = type(value)
# nb_elts = 1
# value_obitype = OBI_VOID
#
# if value_type == dict or value_type == list :
# nb_elts = len(value)
# elt_names = list(value)
# else :
# nb_elts = 1
# elt_names = None
#
# value_obitype = get_obitype(value)
#
# if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
#
# # Fill value
# dcols[tag][0][i] = value
#
# # TODO else log error?
#
# else :
#
# rewrite = False
#
# # Check type adequation
# old_type = dcols[tag][1]
# new_type = OBI_VOID
# new_type = update_obitype(old_type, value)
# if old_type != new_type :
# rewrite = True
#
# try:
# # Fill value
# dcols[tag][0][i] = value
#
# except IndexError :
#
# value_type = type(value)
# old_column = dcols[tag][0]
# old_nb_elements_per_line = old_column.nb_elements_per_line
# new_nb_elements_per_line = 0
# old_elements_names = old_column.elements_names
# new_elements_names = None
#
# #####################################################################
#
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys
# for k in value :
# if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# rewrite = True
# break
#
# elif value_type == list or value_type == tuple : # Check vector length
# if old_nb_elements_per_line < len(value) :
# new_nb_elements_per_line = len(value)
# rewrite = True
#
# #####################################################################
#
# if rewrite :
# if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(new_elements_names)
#
# # Reset obierrno
# obi_errno = 0
#
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# new_data_type=new_type,
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# value_obitype)
#
# # Update the dictionary:
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1])
#
# # Fill value
# dcols[tag][0][i] = value
#
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()