# -*- coding: utf-8 -*-

"""
Inspired by the UnicodeSplitter of Plone 2.1, with a Unicode <-> UTF-8
round trip for the splitting.

$Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
$URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $

"""

# Python
import re
from htmlentitydefs import name2codepoint
from unicodedata import decomposition
from string import printable
import logging
from types import UnicodeType
console = logging.getLogger('Utf8Splitter')

# Zope
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

# Word patterns; rxGlob additionally accepts the '*' and '?' wildcards
# after the first word character.
rx = re.compile(r"\w+", re.UNICODE)
rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

# Markup patterns: any <...> tag, and named character entities (&name;).
# The group name 'entName' is the one read back by _Utf8Utils.convertEnt.
rtag = re.compile(r"<[^<>]*>")
rent = re.compile(r"&(?P<entName>[A-Za-z]+);")

# Membership test for the characters of string.printable.
_printable = dict([(c, True) for c in printable])
isPrintable = _printable.has_key


class Utf8Splitter:
    """Plain-text UTF-8 whitespace splitter
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Split every UTF-8 string of lst into words.

        Each string is decoded from UTF-8 (undecodable bytes are ignored),
        matched against wordpat, and the matches are returned as a flat
        list of UTF-8 encoded strings.
        """
        result = []
        for s in lst:
            result += wordpat.findall(unicode(s, 'utf-8', errors='ignore'))
        return [r.encode('utf-8') for r in result]

    def processGlob(self, lst):
        """Same as process(), using the wildcard-aware pattern rxGlob."""
        return self.process(lst, rxGlob)


class Utf8HTMLAwareSplitter:
    """HTML-aware UTF-8 whitespace splitter
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Split every UTF-8 string of lst into words, ignoring markup.

        Tags are replaced by a space and named entities by their UTF-8
        representation before the string is decoded and matched against
        wordpat. Returns a flat list of UTF-8 encoded strings.
        """
        result = []
        for s in lst:
            s = rtag.sub(' ', s)
            s = rent.sub(_convertEnt, s)
            s = s.decode('utf-8', 'ignore')

            result += wordpat.findall(s)

        return [r.encode('utf-8') for r in result]

    def processGlob(self, lst):
        """Same as process(), using the wildcard-aware pattern rxGlob."""
        return self.process(lst, rxGlob)


class DesaccUtf8Splitter(Utf8Splitter):
    """Plain-text UTF-8 whitespace splitter with accents removal
    """
    def process(self, lst, wordpat=rx):
        """Remove accents from every string of lst, then split as
        Utf8Splitter.process does."""
        return Utf8Splitter.process(self, [_desacc(s) for s in lst], wordpat)


class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
    """HTML-aware UTF-8 whitespace splitter with accents removal
    """
    def process(self, lst, wordpat=rx):
        """Strip markup and accents from every string of lst, then split.

        Tags are replaced by a space and named entities by their UTF-8
        representation; the string is then passed through _desacc and
        matched against wordpat. Returns a flat list of strings.
        """
        result = []
        for s in lst:
            s = rtag.sub(' ', s)
            s = rent.sub(_convertEnt, s)
            # _desacc already returns an encoded string made only of
            # string.printable characters, hence no decode step here.
            s = _desacc(s)

            result += wordpat.findall(s)

        return [r.encode('utf-8') for r in result]


class _Utf8Utils(object):
    """Singleton holding the entity conversion and accent removal helpers.

    Accent removal results are memoized per character in self._cache.
    """

    _singleton = None

    def __new__(cls):
        if cls._singleton is None:
            cls._singleton = object.__new__(cls)
        return cls._singleton

    def __init__(self):
        # __init__ runs on every _Utf8Utils() call even though __new__
        # hands back the same object: create the cache only once so that
        # a later instantiation does not throw the memoized results away.
        if not hasattr(self, '_cache'):
            self._cache = {}

    @staticmethod
    def convertEnt(m):
        """Convert an HTML entity to its UTF-8 representation.

        m is a match object of the rent pattern. Unknown entity names are
        converted to a space (code point 32).
        """
        return unichr(name2codepoint.get(m.group('entName'), 32)).encode('utf-8')

    def udesacc(self, uchaine):
        """Return the unicode string uchaine with accents removed.

        Each character is replaced by its memoized replacement, computed
        by _recurseDecomposition on first encounter.
        """
        cache = self._cache
        ret = []
        for uc in uchaine:
            # Real key lookup: a cached replacement may be the empty
            # string (character dropped), which must count as a hit.
            try:
                ret.append(cache[uc])
            except KeyError:
                ret.append(self._recurseDecomposition(uc))

        return u''.join(ret)

    def desacc(self, chaine):
        """Remove accents from a UTF-8 string.

        Returns a UTF-8 encoded string. A unicode value whose decoding
        fails with UnicodeEncodeError is processed as is, after a warning
        is logged; for any other type that error is re-raised.
        """
        try:
            uchaine = chaine.decode('utf-8', 'ignore')
        except UnicodeEncodeError:
            if isinstance(chaine, UnicodeType):
                console.warning('already unicode value passed to desacc: %r', chaine)
                uchaine = chaine
            else:
                raise
        ret = self.udesacc(uchaine)
        return ret.encode('utf-8')

    def _recurseDecomposition(self, uc):
        """Compute, memoize and return the replacement of character uc.

        The Unicode decomposition of uc is expanded until only characters
        without decomposition remain; formatting tags such as '<compat>'
        are skipped. Only characters of string.printable are kept, so the
        result may be the empty string.
        """
        deco = decomposition(uc).split()
        fullDeco = []
        if deco:
            # Codes are consumed from the end of the stack; the final
            # reverse() restores the original left-to-right order.
            while deco:
                code = deco.pop()
                if code.startswith('<'):
                    continue
                c = unichr(int(code, 16))
                subDeco = decomposition(c).split()
                if subDeco:
                    deco.extend(subDeco)
                else:
                    fullDeco.append(c)
            fullDeco.reverse()
        else:
            fullDeco.append(uc)

        fullDeco = u''.join([c for c in fullDeco if isPrintable(c)])
        self._cache[uc] = fullDeco
        return fullDeco


Utf8Utils = _Utf8Utils()

_desacc = Utf8Utils.desacc
_convertEnt = Utf8Utils.convertEnt


def _registerSplitter(name, factory):
    """Register factory as a 'Word Splitter' pipeline element called name.

    Each registration is guarded on its own, so that one splitter being
    already registered does not prevent the registration of the others.
    """
    try:
        element_factory.registerFactory('Word Splitter', name, factory)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass


_registerSplitter('UTF-8 Whitespace splitter', Utf8Splitter)
_registerSplitter('UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter)
_registerSplitter('UTF-8 Whitespace splitter with accents removal',
                  DesaccUtf8Splitter)
_registerSplitter('UTF-8 HTML Aware splitter with accents removal',
                  DesaccUtf8HTMLAwareSplitter)