"""
Module for loading FASTA sequences whose names may contain any type of
characters and are terminated by \\n or \\r. The Loader class allows a quick
loading of all sequences into memory. If that fails, the file can instead be
read one sequence at a time with the Iterator class.
Author: Kenneth Wayne Berendzen -> kwberendzen@alumni.utexas.net
License: GPLv2.0
Copyright: Kenneth Wayne Berendzen and University of Tuebingen 2008

Classes:
FastaRecord	Holds one FASTA sequence:
 short name (up to a defined character, default space),
 long name,
 sequence.

Iterator	Iterates through the FASTA file.
Loader		Loads the entire FASTA file.
  An option includes an automatic alphabet check for
  cleaning of undesired characters, default AmbiguousDNA.
  Is not used if FASTA files are read directly by the Iterator class.

Functions in Module (do not require a class instance):
FASTA_names		Returns a list of all names in the FASTA file.
to_SeqRecord		Returns a SeqRecord object built from a FastaRecord.


"""
######IMPORTS#######

from Bio import Seq
from Bio import SeqRecord
from Bio import Alphabet
import Bio.Alphabet.IUPAC
import types
import re
import string
import copy
import os


######CLASSES#######

class FastaRecord:
	"""Defines a FASTA record.

	Members:
	shortname	characters after > and before the first space
	long_name	the complete name excluding ">" and \\n or \\r
	sequence	sequence

	FastaRecord objects can be compared with other FastaRecord objects,
	SeqRecord.SeqRecord objects or plain strings using == and !=.
	Only the sequences are compared, and the comparison is case independent.
	Any other type compares as not equal.

	"""

	def __init__(self, _short = '', _long = '', _seq = '', _alphabet = Alphabet.IUPAC.ambiguous_dna):
		"""Store the three text fields.

		_alphabet is accepted so existing callers keep working, but it is
		neither stored nor used by this class.
		"""
		self.shortname = _short
		self.long_name = _long
		self.sequence = _seq

	def __str__(self):
		"""Return a three line, human readable summary of the record."""
		return " shortname: %s \n longname: %s \n sequence: %s \n" % (self.shortname, self.long_name, self.sequence)

	def __eq__(self, other):
		"""Return True when other carries the same sequence, ignoring case."""
		if isinstance(other, SeqRecord.SeqRecord):
			_seq = other.seq
			# use tostring() where the Seq object offers it, otherwise fall back to str()
			if hasattr(_seq, 'tostring'):
				other_sequence = _seq.tostring()
			else:
				other_sequence = str(_seq)
		elif isinstance(other, FastaRecord):
			other_sequence = other.sequence
		elif isinstance(other, str):
			other_sequence = other
		else:
			return False				#unsupported type: never equal

		return self.sequence.upper() == other_sequence.upper()

	def __ne__(self, other):
		"""Inverse of __eq__; without it Python 2 would not derive != from ==."""
		return not self.__eq__(other)

	def __len__(self):
		"""Return the length of the sequence."""
		return len(self.sequence)




class Loader:
	"""Loads a whole FASTA file into memory and splits it into FastaRecords.

	Creating an instance calls load() and, when load() reports success,
	split_fasta_string(). load() sends back
	1 (success), 0 (not FASTA), -1 (not readable);
	the file is opened and closed inside load().
	If you change filepath later, call load() and split_fasta_string() manually.

	variables:
	filepath = path of the file to read
	AllText = raw file content, held between load() and split_fasta_string()
	flist = list of FastaRecords
	fdict = dictionary filled by makedict(); key is the shortname, value is the sequence
	"""

	def __init__(self, _filepath):
		"""Read and parse the file found at _filepath.

		Raises TypeError if a file object is passed instead of a path.
		You can also set self.filepath directly after initialization,
		but then you must call load() and split_fasta_string() again.
		"""

		#anything that can be read from is a file object, not a path
		if hasattr(_filepath, 'read'):
			raise TypeError("send in a file path only!")

		self.filepath = _filepath
		self.AllText = ''
		self.flist = []
		self.fdict = {}

		#only parse when the file was really read and looks like FASTA
		if self.load() == 1:
			self.split_fasta_string()


	def __str__(self):
		"""Returns which file is currently set and how many records are held
		"""
		return 'File loaded (filepath): %s\nFASTA in list (flist): %i\nFASTA in dictionary(fdict): %i\n' % (self.filepath, len(self.flist), len(self.fdict))


	def load(self):
		"""Reads the complete file at self.filepath into self.AllText.

		   1 = success, 0 = not a FASTA file (no '>' found), -1 = file read error
		   The handle is always closed before returning, also if reading fails.
		"""

		try:
			with open(self.filepath, "r") as _handle:
				self.AllText = _handle.read()
		except IOError as e:
			print(e)
			return -1   #nothing was read, the user can try the Iterator class

		if ">" in self.AllText:
			print("loaded FASTA '%s' " % (self.filepath))
			return 1
		else:
			print("ValueError: file '%s' is not FASTA" % (self.filepath))
			return 0


	def split_fasta_string(self, alphabet = Alphabet.IUPAC.ambiguous_dna):
		"""Splits self.AllText and fills self.flist with FastaRecords.

		   This requires that load() was already called to fill self.AllText.
		   A record starts at a '>' that is the first non-blank character of a
		   line, so a '>' inside a description does not start a new record.
		   Characters that are not letters of alphabet are removed from the
		   sequence. Records without a name are skipped with a message.
		   Returns 1 when the text was parsed, 0 when there was no text.
		"""

		self.flist = []			#wipe clean if we used it before

		if self.AllText == '':
			print("no file was loaded; execute load() first")
			return 0

		recordStart = re.compile(r'(?:\A|(?<=[\r\f\n]))[ \t]*>')	#'>' opening a line
		regName = re.compile(r'([^\r\f\n]*)[\r\f\n]?')		#definition line plus its terminator
		#escape the letters so that e.g. '-' or ']' in an alphabet cannot break the character class
		regStr = re.compile('[^' + re.escape(alphabet.letters) + ']', re.IGNORECASE)

		chunks = recordStart.split(self.AllText)[1:]	#the first entry is whatever preceded the first record
		self.AllText = '' 				#free up some memory hopefully

		for chunk in chunks:
			m = regName.match(chunk)		#always matches, possibly with an empty name
			title = m.group(1).lstrip(" \t")	#remove leading allowed white spaces
			if title == '':
				print("skipped a FASTA record without a name")
				continue

			#everything behind the definition line is sequence; slicing (instead of
			#replacing the title text) cannot remove bases that happen to look like the title
			seq = chunk[m.end():]
			shortname = title.split(" ", 1)[0]
			self.flist.append(FastaRecord(shortname, title, regStr.sub('', seq)))

		print("loaded list")
		return 1


	def makedict(self, _alphabet = Alphabet.IUPAC.ambiguous_dna):
		"""Moves the records of flist into the fdict dictionary.

		The shortname is the key and the sequence is the value. The longname is
		discarded. If a shortname is already present in fdict, a running number
		(1, 2, ...) is appended to it until the key is unique. The user is
		responsible for making sure there are no duplicate names.
		flist is empty afterwards. _alphabet is not used.
		"""

		#reverse once so that records can be taken from the end in file order;
		#popping keeps the memory load down without the cost of pop(0)
		self.flist.reverse()

		while self.flist:
			rec = self.flist.pop()
			name = rec.shortname

			if name in self.fdict:
				#redundant entry name: add an integer to it so no entry is overwritten
				n = 1
				name = rec.shortname + str(n)
				while name in self.fdict:
					n += 1
					name = rec.shortname + str(n)

			self.fdict[name] = rec.sequence
		

	


class Iterator:
	"""This class provides a method for iterating through a FASTA file, e.g.

		for record in Iterator(path):
			...

	No 'proper' format check for FASTA conformity is made. A line whose first
	non-blank character is '>' starts a record; every line up to the next
	such line is sequence. The class uses the readline() function - therefore
	if a different line divider is used, you need to convert the file
	beforehand and save it as a separate file.

	Every step yields a new FastaRecord; the latest one is also kept in fRec.
	handle, EOF, currentline and _alphabet are meant for internal use only.
	"""

	def __init__(self, filepath, alphabet = Alphabet.IUPAC.ambiguous_dna):
		"""Open filepath and move to the first definition line.

		An IOError is printed, not raised; the iterator is then simply empty.
		"""
		self.handle = None
		self.fRec = FastaRecord()
		self.EOF = 0 			#result of the last readfile() call
		self.currentline = ''		#pending definition line, '' if there is none
		self._alphabet = alphabet

		try:
			self.handle = open(filepath)

			#skip any header lines, but stop at end of file so that a file
			#without any record cannot loop forever
			line = self.handle.readline()
			while line != '' and not self._is_header(line):
				line = self.handle.readline()

			self.currentline = line
			if line == '':
				self.close()

		except IOError as e:
			print(e)


	def _is_header(self, line):
		"""True if the first non-blank character of line is '>'."""
		return line.lstrip(" \t").startswith('>')


	def close(self):
		"""Close the file handle; safe to call more than once."""
		if self.handle is not None:
			self.handle.close()


	def next(self):
		"""Return the next FastaRecord or raise StopIteration at the end."""
		if self.currentline == '':		#no definition line pending = nothing left
			raise StopIteration

		self.EOF = self.readfile()
		return self.fRec

	__next__ = next				#iterator protocol name used by Python 3


	def __iter__(self):
		"""The instance is its own iterator."""
		return self


	def readfile(self):
		"""Read the record that belongs to the pending definition line into self.fRec.

		Returns 1 if another definition line was found behind the record and
		-1 if the end of the file was reached (the handle is closed then).
		Also returns -1, without touching self.fRec, if no record is pending.
		"""
		if self.currentline == '' or self.handle is None or self.handle.closed:
			return -1

		#escape the letters so that e.g. '-' or ']' in an alphabet cannot break the character class
		regStr = re.compile('[^' + re.escape(self._alphabet.letters) + ']', re.IGNORECASE)

		#process name: text behind the '>', without the line end and leading white space
		title = self.currentline.partition('>')[2].rstrip("\r\n").lstrip(" \t")
		record = FastaRecord()
		record.long_name = title
		record.shortname = title.split(" ", 1)[0]

		#get sequence, cleaned with the alphabet line by line
		parts = []
		line = self.handle.readline()
		while line != '' and not self._is_header(line):
			parts.append(regStr.sub('', line))
			line = self.handle.readline()

		record.sequence = ''.join(parts)
		self.fRec = record
		self.currentline = line			#next definition line, or '' at EOF

		if line == '':
			self.close()
			return -1
		else:
			return 1





						
		


######FUNCTIONS#######

def to_SeqRecord(_fRec, _alphabet = Bio.Alphabet.IUPAC.ambiguous_dna):
	"""Builds a SeqRecord instance from a FastaRecord instance.

	The sequence is wrapped in a Seq object using _alphabet, the shortname
	becomes the id and the long_name becomes the name. _fRec itself is left
	untouched. This is a module level function, no class instance is needed.
	"""

	_sequence = Seq.Seq(_fRec.sequence, _alphabet)
	return SeqRecord.SeqRecord(seq = _sequence, id = _fRec.shortname, name = _fRec.long_name)




def convert_fasta_to_dict(_fastaList = None):
	"""Converts a list of FastaRecords into a dictionary and returns it.

	The shortname is the key and the sequence is the value. If a shortname
	occurs more than once, a running number (1, 2, ...) is appended to the
	later ones until the key is unique, so no entry is overwritten.
	Without an argument (or with None) an empty dictionary is returned.
	"""
	_dict = {}

	if _fastaList is None:
		return _dict

	for _elem in _fastaList:
		name = _elem.shortname

		if name in _dict:		#redundant entry name: add an integer to it to keep both entries
			n = 1
			name = _elem.shortname + str(n)
			while name in _dict:
				n += 1
				name = _elem.shortname + str(n)

		_dict[name] = _elem.sequence

	return _dict


def convert_dict_to_fasta(_dict = None):
	"""Converts a dictionary into a list of FastaRecords and returns it.

	Every key becomes both the shortname and the long_name of a record and
	the value becomes its sequence. The records follow the iteration order
	of the dictionary. Without an argument (or with None) an empty list is
	returned.
	"""

	if _dict is None:
		return []

	return [FastaRecord(k, k, v) for k, v in _dict.items()]


def write_to_single_file(_fRecList, _filehandle):
	"""Writes a list of FastaRecords to a SINGLE text file.

	The file handle must have been opened for writing by the user; it is
	flushed but not closed. Each record is written as '>' + name, a line
	break, the sequence and a line break. The name is the long_name; the
	shortname is put in front of it only if the long_name does not already
	start with it, so the identifier is not written twice. An empty
	long_name is replaced by the shortname.
	"""

	for _rec in _fRecList:
		_short = _rec.shortname
		_title = _rec.long_name.rstrip('\r\n')		#a line end inside the name would split the header
		if _title == '':
			_title = _short
		elif _short != '' and _title != _short and not _title.startswith(_short + ' '):
			_title = _short + ' ' + _title

		_filehandle.write('>%s\n%s\n' % (_title, _rec.sequence))

	_filehandle.flush()
	
def write_to_files(_fRecList, _folderName):
	"""Writes a list of FastaRecords to MULTIPLE text files.

	Every record goes into its own file '<shortname>.fas' inside the existing
	folder _folderName; an existing file of that name is overwritten.
	Each file holds '>' + name, a line break, the sequence and a line break.
	The name is the long_name; the shortname is put in front of it only if
	the long_name does not already start with it, so the identifier is not
	written twice. An empty long_name is replaced by the shortname.
	"""

	for _rec in _fRecList:
		_short = _rec.shortname
		_title = _rec.long_name.rstrip('\r\n')		#a line end inside the name would split the header
		if _title == '':
			_title = _short
		elif _short != '' and _title != _short and not _title.startswith(_short + ' '):
			_title = _short + ' ' + _title

		#the with block closes the file also if writing fails
		with open(os.path.join(_folderName, _short + '.fas'), 'w') as _fhandle:
			_fhandle.write('>%s\n%s\n' % (_title, _rec.sequence))
	

def FASTA_names(_filepath):
	"""Returns a list of names from a FASTA file, using the path as input.

	A name is everything behind the '>' of a line whose first non-blank
	character is '>', without the line end (\\n or \\r). If you already have
	a list of FastaRecords or a FASTA dictionary you have the names already.
	success = list; no FASTA names = empty list; failure to read file = -1
	(the error is printed). The file is closed before returning.
	"""

	_list = []

	try:
		with open(_filepath) as handle:
			for line in handle:
				if line.lstrip(" \t").startswith('>'):
					#strip the line end instead of blindly cutting the last
					#character, which may belong to the name on the final line
					_list.append(line.partition('>')[2].rstrip('\r\n'))

		return _list

	except IOError as e:
		print(e)
		return -1

