--- /dev/null
+# -*- coding: utf-8 -*-
+
+"""
+Inspiré de l'UnicodeSplitter de Plone 2.1, avec un aller-retour Unicode <-> UTF-8 pour le découpage.
+
+$Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
+$URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $
+
+"""
+
+# Python
+import re
+from htmlentitydefs import name2codepoint
+from unicodedata import decomposition
+from string import printable
+import logging
+from types import UnicodeType
+console = logging.getLogger('Utf8Splitter')
+
+# Zope
+from Products.ZCTextIndex.ISplitter import ISplitter
+from Products.ZCTextIndex.PipelineFactory import element_factory
+
+# Word tokenizer: runs of "word" characters as defined by \w under re.UNICODE.
+rx = re.compile(r"\w+", re.UNICODE)
+# Glob-aware tokenizer: a word must start with at least one word character,
+# after which '*' and '?' wildcards are kept as part of the word.
+rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)
+
+# Matches a '<' ... '>' span that contains no other angle bracket.
+rtag = re.compile(r"<[^<>]*>")
+# Matches named entities made of ASCII letters only, e.g. "&eacute;";
+# numeric references such as "&#233;" are not matched.
+rent = re.compile(r"&(?P<entName>[A-Za-z]+);")
+
+# Lookup table keyed by every character of string.printable; isPrintable(c)
+# is the table's bound has_key method, i.e. a membership test.
+_printable = dict([(c, True) for c in printable])
+isPrintable = _printable.has_key
+
+class Utf8Splitter:
+ """Plain-text UTF-8 whitespace splitter
+ """
+ __implements__ = ISplitter
+
+ def process(self, lst, wordpat=rx):
+ result = []
+ for s in lst:
+ result += wordpat.findall(unicode(s, 'utf-8', errors='ignore'))
+ return [r.encode('utf-8') for r in result]
+
+ def processGlob(self, lst):
+ return self.process(lst, rxGlob)
+
+
+
+class Utf8HTMLAwareSplitter :
+ """HTML-aware UTF-8 whitespace splitter
+ """
+ __implements__ = ISplitter
+
+ def process(self, lst, wordpat=rx):
+ result = []
+ for s in lst:
+ s = rtag.sub(' ', s)
+ s = rent.sub(_convertEnt, s)
+ s = s.decode('utf-8', 'ignore')
+
+ result += wordpat.findall(s)
+
+ return [r.encode('utf-8') for r in result]
+
+ def processGlob(self, lst):
+ return self.process(lst, rxGlob)
+
+
+
+class DesaccUtf8Splitter(Utf8Splitter):
+ """Plain-text UTF-8 whitespace splitter with accents removal
+ """
+ def process(self, lst, wordpat=rx):
+ return Utf8Splitter.process(self, [_desacc(s) for s in lst], wordpat)
+
+
+
+class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
+ """HTML-aware UTF-8 whitespace splitter with accents removal
+ """
+ def process(self, lst, wordpat=rx):
+ result = []
+ for s in lst:
+ s = rtag.sub(' ', s)
+ s = rent.sub(_convertEnt, s)
+ s = _desacc(s)
+
+ result += wordpat.findall(s)
+
+ return [r.encode('utf-8') for r in result]
+
+class _Utf8Utils(object) :
+
+ _singleton = None
+
+ def __new__(cls) :
+ if cls._singleton is None :
+ cls._singleton = object.__new__(cls)
+ return cls._singleton
+
+
+ def __init__(self) :
+ self._cache = {}
+
+ @staticmethod
+ def convertEnt(m):
+ """Conversion d'une entité HTML en sa représentation UTF-8
+ """
+ return unichr(name2codepoint.get(m.group('entName'), 32)).encode('utf-8')
+
+ def udesacc(self, uchaine) :
+ ret = []
+ for uc in uchaine :
+ ret.append(self._cache.get(uc) or self._recurseDecomposition(uc))
+
+ return u''.join(ret)
+
+ def desacc(self, chaine):
+ """Désaccentuation d'une chaîne UTF-8
+ """
+ try :
+ uchaine = chaine.decode('utf-8', 'ignore')
+ except UnicodeEncodeError :
+ if type(chaine) == UnicodeType :
+ console.warn('already unicode value passed to desacc: %r' % chaine)
+ uchaine = chaine
+ else :
+ raise
+ ret = self.udesacc(uchaine)
+ return ret.encode('utf-8')
+
+
+ def _recurseDecomposition(self, uc):
+ deco = decomposition(uc).split()
+ fullDeco = []
+ if deco :
+ while (deco) :
+ code = deco.pop()
+ if code.startswith('<') :
+ continue
+ c = unichr(int(code, 16))
+ subDeco = decomposition(c).split()
+ if subDeco :
+ deco.extend(subDeco)
+ else :
+ fullDeco.append(c)
+ fullDeco.reverse()
+ else :
+ fullDeco.append(uc)
+
+ fullDeco = u''.join(filter(lambda c : isPrintable(c), fullDeco))
+ self._cache[uc] = fullDeco
+ return fullDeco
+
+# Module-wide helper instance; _Utf8Utils() always returns the same object.
+Utf8Utils = _Utf8Utils()
+
+# Short aliases used by the splitter classes defined earlier in the module.
+_desacc = Utf8Utils.desacc
+_convertEnt = Utf8Utils.convertEnt
+
+try:
+ element_factory.registerFactory( 'Word Splitter',
+ 'UTF-8 Whitespace splitter', Utf8Splitter)
+
+ element_factory.registerFactory( 'Word Splitter',
+ 'UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter)
+
+ element_factory.registerFactory( 'Word Splitter',
+ 'UTF-8 Whitespace splitter with accents removal', DesaccUtf8Splitter)
+
+ element_factory.registerFactory( 'Word Splitter',
+ 'UTF-8 HTML Aware splitter with accents removal', DesaccUtf8HTMLAwareSplitter)
+
+except ValueError:
+ # in case the splitter is already registred, ValueError is raised
+ pass
+
+