def parseFasta(seq):
    """Split one FASTA record into identifier, sequence, definition and tags.

    The header line is expected to look like::

        >identifier key1=value1; key2=value2; free text definition

    Everything after the identifier is split on ``'; '``. Fields that
    contain ``=`` are stored as key/value annotations; the remaining
    fields are joined with single spaces to form the definition.

    Parameters:
        seq: sequence of text lines of a single record; ``seq[0]`` is the
            header line (starting with ``>``) and the following items are
            the sequence lines.

    Returns:
        A tuple ``(identifier, sequence, definition, info)`` where
        ``sequence`` is the upper-cased concatenation of the stripped
        sequence lines and ``info`` maps annotation keys to their string
        values.

    Raises:
        IndexError: if ``seq`` is empty or the header holds no identifier.
    """
    # Drop the leading '>' and separate the identifier from the remainder.
    header = seq[0].strip()[1:].split(None, 1)
    identifier = header[0]
    fields = header[1].split('; ') if len(header) == 2 else []

    info = {}
    words = []
    for field in fields:
        if '=' in field:
            # Split on the first '=' only, so values may contain '='.
            key, _, value = field.partition('=')
            # The last key/value pair may still carry its closing ';'.
            info[key.strip()] = value.strip().rstrip(';').rstrip()
        else:
            words.append(field)

    definition = ' '.join(words)
    sequence = ''.join(line.strip() for line in seq[1:]).upper()
    return identifier, sequence, definition, info


def fastaEntryParser(entry):
    """Parse one FASTA record into the dictionary built by this module.

    Parameters:
        entry: sequence of text lines of a single record, as accepted by
            ``parseFasta``.

    Returns:
        A dict with the keys ``'id'``, ``'taxid'``, ``'definition'`` and
        ``'sequence'``. ``'taxid'`` is the integer value of the ``taxid``
        header annotation, or ``None`` when that annotation is absent.

    Raises:
        IndexError: if ``entry`` is empty or its header has no identifier.
        ValueError: if the ``taxid`` annotation is not an integer.
    """
    identifier, sequence, definition, info = parseFasta(entry)
    taxid = info.get('taxid')
    if taxid is not None:
        taxid = int(taxid)
    return {
        'id': identifier,
        'taxid': taxid,
        'definition': definition,
        'sequence': sequence,
    }