# -*- coding: utf-8 -*-

"""
Inspiré de l'UnicodeSplitter de Plone 2.1, avec un aller-retour Unicode <-> UTF-8 pour le découpage.

$Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
$URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $

"""

# Python
import re
from htmlentitydefs import name2codepoint
from unicodedata import decomposition
from string import printable
import logging
from types import UnicodeType
# Module-level logger; _Utf8Utils.desacc uses it to report unicode input.
console = logging.getLogger('Utf8Splitter')

# Zope
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

# Word patterns: rx matches runs of word characters; rxGlob additionally
# accepts the '*' and '?' wildcards after the first word character.
rx = re.compile(r"\w+", re.UNICODE)
rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

# rtag matches a single markup tag; rent matches a named character entity and
# exposes its name as the 'entName' group.  Entity names may contain digits
# after the first letter (e.g. &frac12;, &sup2;, &there4;).
rtag = re.compile(r"<[^<>]*>")
rent = re.compile(r"&(?P<entName>[A-Za-z][A-Za-z0-9]*);")

# Lookup table of the characters of string.printable; isPrintable(c) tells
# whether c is one of them.
_printable = dict.fromkeys(printable, True)
isPrintable = _printable.__contains__

class Utf8Splitter:
	"""Plain-text UTF-8 word splitter.

	Each input string is decoded from UTF-8, cut into words with a
	Unicode-aware pattern, and the words are returned as UTF-8 byte strings.
	"""
	__implements__ = ISplitter

	def process(self, lst, wordpat=rx):
		"""Return the list of words found in the strings of ``lst``.

		lst     -- iterable of UTF-8 byte strings (unicode objects are
		           accepted as well and used as they are)
		wordpat -- compiled pattern whose ``findall`` yields the words

		The words are returned as UTF-8 encoded byte strings, in order of
		appearance.
		"""
		result = []
		for s in lst:
			if not isinstance(s, unicode):
				# bytes that are not valid UTF-8 are dropped, not fatal
				s = unicode(s, 'utf-8', errors='ignore')
			# else: already decoded; unicode() would raise TypeError
			# ("decoding Unicode is not supported") on such a value
			result.extend(wordpat.findall(s))
		return [r.encode('utf-8') for r in result]

	def processGlob(self, lst):
		"""Same as ``process`` but with ``rxGlob`` as the word pattern."""
		return self.process(lst, rxGlob)



class Utf8HTMLAwareSplitter :
	"""HTML-aware UTF-8 word splitter.

	Before the words are extracted, markup tags are replaced by a space and
	named character entities are replaced through ``_convertEnt``.
	"""
	__implements__ = ISplitter

	def process(self, lst, wordpat=rx):
		"""Return the list of words found in the HTML strings of ``lst``.

		lst     -- iterable of UTF-8 byte strings (unicode objects are
		           accepted as well and encoded to UTF-8 first)
		wordpat -- compiled pattern whose ``findall`` yields the words

		The words are returned as UTF-8 encoded byte strings, in order of
		appearance.
		"""
		result = []
		for s in lst:
			if isinstance(s, unicode):
				# Work on UTF-8 bytes throughout: _convertEnt returns UTF-8
				# byte strings, and substituting those into a unicode
				# subject (or calling .decode on a unicode object) goes
				# through an implicit ASCII conversion that fails on
				# non-ASCII text.
				s = s.encode('utf-8')
			s = rtag.sub(' ', s)
			s = rent.sub(_convertEnt, s)
			result.extend(wordpat.findall(s.decode('utf-8', 'ignore')))

		return [r.encode('utf-8') for r in result]

	def processGlob(self, lst):
		"""Same as ``process`` but with ``rxGlob`` as the word pattern."""
		return self.process(lst, rxGlob)



class DesaccUtf8Splitter(Utf8Splitter):
	"""Plain-text UTF-8 splitter that strips accents before splitting.
	"""
	def process(self, lst, wordpat=rx):
		"""Pass every string through ``_desacc`` and split the results
		with ``Utf8Splitter.process``.
		"""
		unaccented = []
		for item in lst:
			unaccented.append(_desacc(item))
		return Utf8Splitter.process(self, unaccented, wordpat)



class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
	"""HTML-aware UTF-8 splitter that strips accents before splitting.
	"""
	def process(self, lst, wordpat=rx):
		"""Return the words of the HTML strings in ``lst``, accents removed.

		For each string: tags are replaced by a space, named entities are
		converted with ``_convertEnt``, the result goes through ``_desacc``
		and is then matched against ``wordpat``.
		"""
		words = []
		for chunk in lst:
			# entities are converted before _desacc is applied
			plain = rent.sub(_convertEnt, rtag.sub(' ', chunk))
			words.extend(wordpat.findall(_desacc(plain)))

		return [w.encode('utf-8') for w in words]

class _Utf8Utils(object) :
	"""Entity conversion and accent removal helpers for UTF-8 text.

	The class is a singleton: every ``_Utf8Utils()`` call returns the same
	instance, which holds a per-character cache of computed decompositions.
	"""

	_singleton = None

	def __new__(cls) :
		if cls._singleton is None :
			cls._singleton = object.__new__(cls)
		return cls._singleton

	def __init__(self) :
		# __init__ runs on every _Utf8Utils() call, including when __new__
		# hands back the existing instance: only create the cache once so
		# that re-instantiating does not throw the cached entries away.
		if '_cache' not in self.__dict__ :
			self._cache = {}

	@staticmethod
	def convertEnt(m):
		"""Convert an HTML entity into its UTF-8 representation.

		``m`` is a match object with an ``entName`` group.  Names missing
		from ``name2codepoint`` are converted to a space (code point 32).
		"""
		return unichr(name2codepoint.get(m.group('entName'), 32)).encode('utf-8')

	def udesacc(self, uchaine) :
		"""Return the unicode string ``uchaine`` with every character
		replaced by its cached or freshly computed decomposition.
		"""
		cache = self._cache
		ret = []
		for uc in uchaine :
			deco = cache.get(uc)
			if deco is None :
				# Test against None, not truthiness: an empty string is a
				# valid cached result (character reduced to nothing).
				deco = self._recurseDecomposition(uc)
			ret.append(deco)

		return u''.join(ret)

	def desacc(self, chaine):
		"""Remove the accents from a UTF-8 string.

		``chaine`` is decoded from UTF-8 (invalid bytes are ignored),
		processed by ``udesacc`` and returned re-encoded in UTF-8.  A
		unicode value whose decoding raises UnicodeEncodeError is logged
		and processed as it is; any other value raising that error lets
		the exception propagate.
		"""
		try :
			uchaine = chaine.decode('utf-8', 'ignore')
		except UnicodeEncodeError :
			if not isinstance(chaine, UnicodeType) :
				raise
			console.warning('already unicode value passed to desacc: %r', chaine)
			uchaine = chaine
		return self.udesacc(uchaine).encode('utf-8')

	def _recurseDecomposition(self, uc):
		"""Compute, cache and return the replacement string for ``uc``.

		The Unicode decomposition of ``uc`` is expanded until only
		characters without a decomposition remain; of those, only the ones
		accepted by ``isPrintable`` are kept.  The result may therefore be
		an empty string.
		"""
		deco = decomposition(uc).split()
		fullDeco = []
		if deco :
			# deco is used as a stack: codes are popped from the end, so
			# the characters come out in reverse order and are flipped back
			# once the stack is empty.
			while deco :
				code = deco.pop()
				if code.startswith('<') :
					# decomposition type marker, not a code point
					continue
				c = unichr(int(code, 16))
				subDeco = decomposition(c).split()
				if subDeco :
					deco.extend(subDeco)
				else :
					fullDeco.append(c)
			fullDeco.reverse()
		else :
			fullDeco.append(uc)

		fullDeco = u''.join([c for c in fullDeco if isPrintable(c)])
		self._cache[uc] = fullDeco
		return fullDeco

# Shared helper instance, plus the module-level shortcuts to its bound
# methods that the splitter classes call.
Utf8Utils = _Utf8Utils()
_desacc, _convertEnt = Utf8Utils.desacc, Utf8Utils.convertEnt

# Register the four splitters in the 'Word Splitter' group.  Each registration
# is attempted on its own so that one name being already taken does not
# prevent the remaining splitters from being registered.
for _splitterName, _splitterClass in (
		('UTF-8 Whitespace splitter', Utf8Splitter),
		('UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter),
		('UTF-8 Whitespace splitter with accents removal', DesaccUtf8Splitter),
		('UTF-8 HTML Aware splitter with accents removal', DesaccUtf8HTMLAwareSplitter),
	):
	try:
		element_factory.registerFactory('Word Splitter', _splitterName, _splitterClass)
	except ValueError:
		# in case the splitter is already registered, ValueError is raised
		pass

# do not leave the loop variables behind in the module namespace
del _splitterName, _splitterClass


