CRI, Mines Paris - PSL - Utf8Splitter.git/blobdiff - Products/Utf8Splitter/Utf8Splitter.py
eggification
[Utf8Splitter.git] / Products / Utf8Splitter / Utf8Splitter.py
diff --git a/Products/Utf8Splitter/Utf8Splitter.py b/Products/Utf8Splitter/Utf8Splitter.py
new file mode 100644 (file)
index 0000000..8280825
--- /dev/null
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+
+"""
+Inspiré de l'UnicodeSplitter de Plone 2.1, avec un aller-retour Unicode <-> UTF-8 pour le découpage.
+
+$Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
+$URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $
+
+"""
+
+# Python
+import re
+from htmlentitydefs import name2codepoint
+from unicodedata import decomposition
+from string import printable
+import logging
+from types import UnicodeType
+console = logging.getLogger('Utf8Splitter')
+
+# Zope
+from Products.ZCTextIndex.ISplitter import ISplitter
+from Products.ZCTextIndex.PipelineFactory import element_factory
+
+rx        = re.compile(r"\w+", re.UNICODE)  # a word: a run of Unicode word characters
+rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)  # a word that may continue with '*' / '?' wildcards
+
+rtag = re.compile(r"<[^<>]*>")  # everything from '<' to the next '>' (replaced by a space by the HTML splitters)
+rent = re.compile(r"&(?P<entName>[A-Za-z]+);")  # a named character entity; its name is captured as 'entName'
+
+_printable = dict([(c, True) for c in printable])  # lookup table over the characters of string.printable
+isPrintable = _printable.has_key  # Python 2 dict membership test, used as a predicate when removing accents
+
+class Utf8Splitter:
+       """Plain-text splitter: UTF-8 byte strings in, UTF-8 encoded words out.
+       """
+       __implements__ = ISplitter
+
+       def process(self, lst, wordpat=rx):
+               """Return the words of lst matched by wordpat; undecodable bytes are ignored."""
+               texts = [unicode(chunk, 'utf-8', errors='ignore') for chunk in lst]
+               return [w.encode('utf-8') for t in texts for w in wordpat.findall(t)]
+
+       def processGlob(self, lst):
+               """Same as process(), using the wildcard-aware pattern rxGlob."""
+               return self.process(lst, rxGlob)
+
+
+
+class Utf8HTMLAwareSplitter :
+       """HTML-aware splitter: UTF-8 markup in, UTF-8 encoded words out.
+       """
+       __implements__ = ISplitter
+
+       def process(self, lst, wordpat=rx):
+               """Return the words of lst matched by wordpat, markup removed."""
+               words = []
+               for chunk in lst:
+                       # Each tag becomes one space and each named entity is replaced
+                       # through _convertEnt before the bytes are decoded.
+                       text = rent.sub(_convertEnt, rtag.sub(' ', chunk))
+                       words.extend(wordpat.findall(text.decode('utf-8', 'ignore')))
+               return [w.encode('utf-8') for w in words]
+
+       def processGlob(self, lst):
+               """Same as process(), using the wildcard-aware pattern rxGlob."""
+               return self.process(lst, rxGlob)
+
+
+
+class DesaccUtf8Splitter(Utf8Splitter):
+       """Plain-text UTF-8 splitter that removes accents before splitting
+       """
+       def process(self, lst, wordpat=rx):
+               return Utf8Splitter.process(self, map(_desacc, lst), wordpat)
+
+
+
+class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
+       """HTML-aware splitter that also removes accents before matching words.
+       """
+       def process(self, lst, wordpat=rx):
+               """Return the unaccented words of lst matched by wordpat."""
+               words = []
+               for chunk in lst:
+                       # Same markup clean-up as the parent class: tags, then entities.
+                       text = rent.sub(_convertEnt, rtag.sub(' ', chunk))
+                       # The pattern is applied to the string returned by _desacc.
+                       words.extend(wordpat.findall(_desacc(text)))
+
+               return [w.encode('utf-8') for w in words]
+
+class _Utf8Utils(object) :
+       """Singleton holding the entity conversion and accent removal helpers."""
+       _singleton = None
+
+       def __new__(cls) :
+               # The cache is created together with the unique instance: calling
+               # the class again returns that instance without emptying its cache.
+               if cls._singleton is None :
+                       cls._singleton = object.__new__(cls)
+                       cls._singleton._cache = {}
+               return cls._singleton
+
+       @staticmethod
+       def convertEnt(m):
+               """Return the UTF-8 character for the entity in match m (a space if unknown)."""
+               return unichr(name2codepoint.get(m.group('entName'), 32)).encode('utf-8')
+
+       def udesacc(self, uchaine) :
+               """Return unicode uchaine with each character replaced by its decomposition."""
+               ret = []
+               for uc in uchaine :
+                       cached = self._cache.get(uc)
+                       if cached is None :
+                               # Test against None: u'' is a valid cached result.
+                               cached = self._recurseDecomposition(uc)
+                       ret.append(cached)
+               return u''.join(ret)
+
+       def desacc(self, chaine):
+               """Remove accents from a UTF-8 string and return it UTF-8 encoded."""
+               try :
+                       uchaine = chaine.decode('utf-8', 'ignore')
+               except UnicodeEncodeError :
+                       # Only expected for a unicode argument; anything else is re-raised.
+                       if not isinstance(chaine, UnicodeType) :
+                               raise
+                       console.warning('already unicode value passed to desacc: %r', chaine)
+                       uchaine = chaine
+               return self.udesacc(uchaine).encode('utf-8')
+
+       def _recurseDecomposition(self, uc):
+               """Fully decompose uc, keep its printable characters and cache the result."""
+               deco = decomposition(uc).split()
+               fullDeco = []
+               if deco :
+                       while deco :
+                               code = deco.pop()
+                               if code.startswith('<') :
+                                       continue  # not a hexadecimal code point
+                               c = unichr(int(code, 16))
+                               subDeco = decomposition(c).split()
+                               if subDeco :
+                                       deco.extend(subDeco)
+                               else :
+                                       fullDeco.append(c)
+                       fullDeco.reverse()  # codes were taken from the end of the list
+               else :
+                       fullDeco.append(uc)
+
+               fullDeco = u''.join(filter(isPrintable, fullDeco))
+               self._cache[uc] = fullDeco
+               return fullDeco
+
+Utf8Utils = _Utf8Utils()  # the module-wide helper instance
+
+_desacc =  Utf8Utils.desacc  # accent removal, called by the Desacc* splitters
+_convertEnt =  Utf8Utils.convertEnt  # entity replacement callback passed to rent.sub
+
+try:
+       # All four splitters are registered in the 'Word Splitter' group,
+       # in the order of the table below. As with four consecutive calls,
+       # the first ValueError skips the remaining registrations.
+       for _label, _factory in (
+                       ('UTF-8 Whitespace splitter', Utf8Splitter),
+                       ('UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter),
+                       ('UTF-8 Whitespace splitter with accents removal', DesaccUtf8Splitter),
+                       ('UTF-8 HTML Aware splitter with accents removal', DesaccUtf8HTMLAwareSplitter),
+       ) :
+               element_factory.registerFactory('Word Splitter', _label, _factory)
+
+except ValueError:
+       # in case the splitter is already registered, ValueError is raised
+       # and deliberately ignored
+       pass
+
+