href
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 51 additions & 0 deletions b/‎README.md‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎pyuca/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎pyuca/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pyuca/pyuca.py‎
Lines changed: 134 additions & 0 deletions b/‎pyuca/pyuca.py‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 26 additions & 0 deletions b/‎setup.py‎
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,3 @@
+*.pyc
+build
+dist
@@ -0,0 +1,51 @@
+Python Unicode Collation Algorithm (pyuca)
+------------------------------------------
+
+Originally developed by James Tauber, this module provides a limited way of
+sorting Unicode strings in the way humans expect.
+
+I stumbled on this module while looking for a sorting solution for a Plone module.
+While pyuca is not as thorough as the full UCA, it sorts better than the default
+sorted function in Python, and it does so without having to rely on the locale module,
+which is not very useful in a webserver environment as it isn't thread-safe.
+
+In fact, the nice thing about pyuca is that it does not need to know about the
+language of the text (unlike locale). It simply provides a sort function relying 
+on the Default Unicode Collation Element Table.
+
+I decided to put the module up on GitHub as the original from the author's site
+was down. I notified the author and I do not claim to have done any work :)
+
+Installation
+------------
+
+Simply run `python setup.py install`
+
+Usage
+-----
+
+1. Get the element table from the following link:
+
+    [http://www.unicode.org/Public/UCA/latest/allkeys.txt](http://www.unicode.org/Public/UCA/latest/allkeys.txt)
+
+2. Try it
+
+        >>> words = [u'Cafe', u'Café', u'Caff']
+
+        >>> from pyuca import Collator
+        >>> c = Collator('allkeys.txt')
+
+        # standard sort
+        >>> sorted(words)
+        [u'Cafe', u'Caff', u'Café']
+
+        # pyuca sort
+        >>> sorted(words, key=c.sort_key)
+        [u'Cafe', u'Café', u'Caff']
+
+More
+----
+
+Original post by James Tauber:
+
+[http://jtauber.com/blog/2006/01/27/python_unicode_collation_algorithm/](http://jtauber.com/blog/2006/01/27/python_unicode_collation_algorithm/)
@@ -0,0 +1 @@
+from pyuca import Collator
@@ -0,0 +1,134 @@
+# pyuca - Unicode Collation Algorithm
+# Version: 2006-02-13
+#
+# James Tauber
+# http://jtauber.com/
+
+# Copyright (c) 2006 James Tauber
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+
+"""
+Preliminary implementation of the Unicode Collation Algorithm.
+
+
+This only implements the simple parts of the algorithm but I have successfully
+tested it using the Default Unicode Collation Element Table (DUCET) to collate
+Ancient Greek correctly.
+
+Usage example:
+
+    from pyuca import Collator
+    c = Collator("allkeys.txt")
+
+    sorted_words = sorted(words, key=c.sort_key)
+
+allkeys.txt (1 MB) is available at
+
+    http://www.unicode.org/Public/UCA/latest/allkeys.txt
+
+but you can always subset this for just the characters you are dealing with.
+"""
+
+
+class Trie(object):
+    """Prefix tree mapping sequences of hashable parts to values.
+
+    Each node is a two-item list ``[value, children]``: ``value`` is the
+    object stored for the key that ends at the node (``None`` when no key
+    ends there) and ``children`` maps the next key part to its child node.
+    """
+
+    def __init__(self):
+        self.root = [None, {}]
+
+    def add(self, key, value):
+        """Store ``value`` under ``key``, replacing any previous value.
+
+        ``key`` is an iterable of hashable parts.
+        """
+        curr_node = self.root
+        for part in key:
+            curr_node = curr_node[1].setdefault(part, [None, {}])
+        curr_node[0] = value
+
+    def find_prefix(self, key):
+        """Return ``(value, remainder)`` for the longest stored prefix.
+
+        ``value`` is what was stored for the longest prefix of ``key`` that
+        has a value, and ``remainder`` is the part of ``key`` that follows
+        that prefix.  When no prefix of ``key`` has a value, nothing is
+        consumed and the result is ``(None, key[0:])``.
+
+        ``key`` must be a sliceable sequence of hashable parts.
+        """
+        curr_node = self.root
+        # Remember the deepest node that actually holds a value.  The walk
+        # may continue through nodes that exist only as part of a longer
+        # key; stopping on one of those must not hide a shorter prefix
+        # that did match.
+        best_value = curr_node[0]
+        best_length = 0
+        depth = 0
+        for part in key:
+            children = curr_node[1]
+            if part not in children:
+                break
+            curr_node = children[part]
+            depth += 1
+            if curr_node[0] is not None:
+                best_value = curr_node[0]
+                best_length = depth
+        # Slice once at the end instead of re-slicing on every step.
+        return (best_value, key[best_length:])
+
+
+class Collator(object):
+    """Build sort keys from a Unicode collation element table.
+
+    The table file is expected in the ``allkeys.txt`` layout: one entry per
+    line of the form ``<hex code points> ; [<alt><w1>.<w2>...]...``, where
+    ``#`` and ``%`` start comments and lines starting with ``@`` are
+    directives, which are ignored.
+    """
+
+    def __init__(self, filename):
+        """Create a collator from the table stored in ``filename``."""
+        self.table = Trie()
+        self.load(filename)
+
+    def load(self, filename):
+        """Parse ``filename`` and add every entry to ``self.table``.
+
+        Each entry maps a list of integer code points to a list of
+        ``(alt, weights)`` tuples: ``alt`` is the character right after the
+        opening bracket and ``weights`` is the list of hexadecimal weight
+        strings between the brackets.  Blank lines, comments, directives
+        and lines without a ``;`` or without code points are skipped.
+        """
+        table_file = open(filename)
+        try:
+            for line in table_file:
+                # Cut comments first; what is left is blank, a directive
+                # or an entry.
+                line = line.split("#", 1)[0].split("%", 1)[0].strip()
+                if not line or line.startswith("@"):
+                    continue
+                if ";" not in line:
+                    continue
+
+                char_part, element_part = line.split(";", 1)
+                integer_points = [int(ch, 16) for ch in char_part.split()]
+                if not integer_points:
+                    # An empty key would attach a value to the trie root.
+                    continue
+
+                coll_elements = []
+                begin = element_part.find("[")
+                while begin != -1:
+                    end = element_part.find("]", begin)
+                    if end == -1:
+                        # Unterminated element: ignore the broken tail.
+                        break
+                    alt = element_part[begin + 1]
+                    weights = element_part[begin + 2:end].split(".")
+                    coll_elements.append((alt, weights))
+                    begin = element_part.find("[", end)
+
+                self.table.add(integer_points, coll_elements)
+        finally:
+            # Close the table even when a line fails to parse.
+            table_file.close()
+
+    def sort_key(self, string):
+        """Return a tuple of integers usable as a sort key for ``string``.
+
+        The key lists the non-zero weights of all collation elements level
+        by level, for four levels, with a ``0`` separating the levels.
+        Elements that carry fewer than four weights contribute nothing to
+        the levels they lack.
+
+        Raises ``ValueError`` when the table has no entry for some part of
+        ``string``; the exception argument is the list of code points (as
+        hex strings) that ``find_prefix`` left unconsumed.
+        """
+        collation_elements = []
+
+        lookup_key = [ord(ch) for ch in string]
+        while lookup_key:
+            value, lookup_key = self.table.find_prefix(lookup_key)
+            if not value:
+                # There is no fallback for code points missing from the
+                # table, so the lookup failure is reported to the caller.
+                raise ValueError([hex(point) for point in lookup_key])
+            collation_elements.extend(value)
+
+        sort_key = []
+
+        for level in range(4):
+            if level:
+                sort_key.append(0)  # level separator
+            for element in collation_elements:
+                weights = element[1]
+                if level >= len(weights):
+                    # Table has fewer weight levels than we iterate over.
+                    continue
+                ce_l = int(weights[level], 16)
+                if ce_l:
+                    sort_key.append(ce_l)
+
+        return tuple(sort_key)
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+"""Distutils installation script for the pyuca package."""
+
+import os
+import sys
+
+from distutils.core import setup
+
+# Compare the version tuple, not the version string: string comparison is
+# lexicographic and would rank e.g. '2.10' below '2.2.3'.
+if sys.version_info < (2, 2, 3):
+    # Compatibility shim from the distutils documentation for interpreters
+    # whose DistributionMetadata lacks these two fields.
+    from distutils.dist import DistributionMetadata
+    DistributionMetadata.classifiers = None
+    DistributionMetadata.download_url = None
+
+# Read the README that sits next to this script so that installation does
+# not depend on the current directory, and close the file once it is read.
+readme_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "README.md")
+readme_file = open(readme_path)
+try:
+    long_description = readme_file.read()
+finally:
+    readme_file.close()
+
+setup(name = 'pyuca',
+      version = '1.0',
+      description = 'Python Unicode Collation Algorithm (originally developed by James Tauber)',
+      long_description=long_description,
+      author = 'Denis Krienb\xc3\xbchl',
+      author_email = 'denis.krienbuehl@gmail.com',
+      url = 'https://github.com/href/Python-Unicode-Collation-Algorithm',
+      packages=['pyuca'],
+      classifiers = [
+            'Development Status :: 5 - Production/Stable',
+            'License :: OSI Approved :: MIT License',
+            'Programming Language :: Python :: 2',
+            'Topic :: Software Development :: Internationalization'
+         ],
+      #scripts = ['path/to/script']
+  )