Skip to content

Commit 9a0ca45

Browse files
committed
initial commit
0 parents  commit 9a0ca45

File tree

5 files changed

+215
-0
lines changed

5 files changed

+215
-0
lines changed

‎.gitignore‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*.pyc
2+
build
3+
dist

‎README.md‎

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
Python Unicode Collation Algorithm (pyuca)
2+
------------------------------------------
3+
4+
Originally developed by James Tauber, this module provides a limited way of
5+
sorting unicode strings in the way humans expect it.
6+
7+
I stumbled on this module while looking for a sorting solution for a Plone module.
8+
While pyuca is not as thorough as UCA it does sorting better than the default
9+
sorted function in Python and it does it without having to rely on the locale module,
10+
which is not very useful in a webserver environment as it isn't threadsafe.
11+
12+
In fact, the nice thing about pyuca is that it does not need to know about the
13+
language of the text (unlike locale). It simply provides a sort function relying
14+
on the Default Unicode Collation Element Table.
15+
16+
I decided to put the module up on github as the original from the author's site
17+
was down. I notified the author and I do not claim to have done any work :)
18+
19+
Installation
20+
------------
21+
22+
Simply run `python setup.py install`
23+
24+
Usage
25+
-----
26+
27+
1. Get the element table from the following link:
28+
29+
[http://www.unicode.org/Public/UCA/latest/allkeys.txt](http://www.unicode.org/Public/UCA/latest/allkeys.txt)
30+
31+
2. Try it
32+
33+
>>> words = [u'Cafe', u'Café', u'Caff']
34+
35+
>>> from pyuca import Collator
36+
>>> c = Collator('allkeys.txt')
37+
38+
# standard sort
39+
>>> sorted(words)
40+
[u'Cafe', u'Caff', u'Café']
41+
42+
# pyuca sort
43+
>>> sorted(words, key=c.sort_key)
44+
[u'Cafe', u'Café', u'Caff']
45+
46+
More
47+
----
48+
49+
Original post by James Tauber:
50+
51+
[http://jtauber.com/blog/2006/01/27/python_unicode_collation_algorithm/](http://jtauber.com/blog/2006/01/27/python_unicode_collation_algorithm/)

‎pyuca/__init__.py‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Re-export Collator at package level. The import is explicitly relative so
# that it resolves to the sibling module ``pyuca/pyuca.py`` rather than to this
# package itself when absolute-import semantics are in effect.
from .pyuca import Collator

‎pyuca/pyuca.py‎

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# pyuca - Unicode Collation Algorithm
2+
# Version: 2006-02-13
3+
#
4+
# James Tauber
5+
# http://jtauber.com/
6+
7+
# Copyright (c) 2006 James Tauber
8+
#
9+
# Permission is hereby granted, free of charge, to any person obtaining a copy
10+
# of this software and associated documentation files (the "Software"), to deal
11+
# in the Software without restriction, including without limitation the rights
12+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
# copies of the Software, and to permit persons to whom the Software is
14+
# furnished to do so, subject to the following conditions:
15+
#
16+
# The above copyright notice and this permission notice shall be included in
17+
# all copies or substantial portions of the Software.
18+
#
19+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25+
# THE SOFTWARE.
26+
27+
28+
"""
29+
Preliminary implementation of the Unicode Collation Algorithm.
30+
31+
32+
This only implements the simple parts of the algorithm but I have successfully
33+
tested it using the Default Unicode Collation Element Table (DUCET) to collate
34+
Ancient Greek correctly.
35+
36+
Usage example:
37+
38+
from pyuca import Collator
39+
c = Collator("allkeys.txt")
40+
41+
sorted_words = sorted(words, key=c.sort_key)
42+
43+
allkeys.txt (1 MB) is available at
44+
45+
http://www.unicode.org/Public/UCA/latest/allkeys.txt
46+
47+
but you can always subset this for just the characters you are dealing with.
48+
"""
49+
50+
51+
class Trie:
    """Prefix tree mapping sequences of hashable parts to values.

    Every node is a two-item list ``[value, children]``. ``value`` is the
    payload stored for the key that ends at the node (``None`` when no key
    ends there) and ``children`` is a dict from the next part to its node.
    """

    def __init__(self):
        self.root = [None, {}]

    def add(self, key, value):
        """Store *value* under *key*, creating intermediate nodes as needed."""
        node = self.root
        for part in key:
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Look up the longest prefix of *key* that has a stored value.

        Returns a tuple ``(value, remainder)`` where ``remainder`` is the
        part of *key* following the matched prefix. When no prefix of *key*
        carries a value, the root's value (``None`` unless an empty key was
        added) is returned and nothing is consumed.
        """
        node = self.root
        best_value = node[0]
        best_length = 0
        for depth, part in enumerate(key, 1):
            children = node[1]
            if part not in children:
                break
            node = children[part]
            # Intermediate nodes created for longer keys hold None; only a
            # node that actually stores a value counts as a match, so that
            # e.g. "ab?" falls back to the entry for "a" when only "a" and
            # "abc" are stored.
            if node[0] is not None:
                best_value = node[0]
                best_length = depth
        # One slice at the end instead of re-slicing on every step.
        return (best_value, key[best_length:])
71+
72+
73+
class Collator:
    """Builds sort keys from a collation element table file.

    The table (for example the ``allkeys.txt`` file mentioned in the module
    docstring) is parsed once at construction time into ``self.table``, a
    trie keyed by lists of integer code points. ``sort_key`` then turns a
    string into a tuple of integers usable as a ``key=`` for sorting.
    """

    def __init__(self, filename):
        """Create a collator and load the table found at *filename*."""
        self.table = Trie()
        self.load(filename)

    def load(self, filename):
        """Parse the table file *filename* and add its entries to the table.

        Each data line has the form ``<hex code points> ; [<elements>]...``.
        Everything after ``#`` or ``%`` is a comment; blank lines and lines
        starting with ``@`` are skipped. Every bracketed element is stored
        as ``(alt, weights)`` where ``alt`` is the first character inside
        the brackets and ``weights`` the ``.``-separated hex strings that
        follow it.
        """
        with open(filename) as table_file:
            for raw_line in table_file:
                # Cut comments with split() rather than find(): find()
                # returns -1 when the marker is absent, which would slice
                # off the last real character of an unterminated line.
                line = raw_line.split("#", 1)[0].split("%", 1)[0].strip()
                if not line or line.startswith("@"):
                    continue

                code_points, _, rest = line.partition(";")
                elements = []
                start = rest.find("[")
                while start != -1:
                    stop = rest.find("]", start)
                    if stop == -1:
                        # Unterminated element: nothing more to parse.
                        break
                    body = rest[start + 1:stop]
                    elements.append((body[0], body[1:].split(".")))
                    start = rest.find("[", stop + 1)

                integer_points = [int(token, 16) for token in code_points.split()]
                self.table.add(integer_points, elements)

    def sort_key(self, string):
        """Return the sort key of *string* as a tuple of integers.

        The string is consumed left to right using the table's prefix
        lookup. The key lists the non-zero weights of all matched elements
        level by level (up to four levels), with a ``0`` separating
        consecutive levels.

        Raises:
            ValueError: if the table lookup yields no elements at some
                position; the argument is the list of hex code points the
                lookup reported as remaining.
        """
        collation_elements = []

        lookup_key = [ord(ch) for ch in string]
        while lookup_key:
            value, lookup_key = self.table.find_prefix(lookup_key)
            if not value:
                # @@@
                raise ValueError([hex(point) for point in lookup_key])
            collation_elements.extend(value)

        sort_key = []

        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for _alt, weights in collation_elements:
                # Tables may carry fewer than four weights per element;
                # a missing level simply contributes nothing.
                if level < len(weights):
                    weight = int(weights[level], 16)
                    if weight:
                        sort_key.append(weight)

        return tuple(sort_key)

‎setup.py‎

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/usr/bin/env python
"""Distutils packaging script for pyuca (``python setup.py install``)."""

import sys
from distutils.core import setup

# Compatibility shim from the distutils documentation: interpreters older
# than 2.2.3 ship a DistributionMetadata without these two attributes, so
# they are added before setup() is called. The check uses version_info
# because comparing sys.version as a string orders versions lexically.
if sys.version_info < (2, 2, 3):
    from distutils.dist import DistributionMetadata
    DistributionMetadata.classifiers = None
    DistributionMetadata.download_url = None

# Read the README with an explicit close so the handle is not leaked.
# try/finally (rather than ``with``) keeps the script parseable on the old
# interpreters the shim above caters for.
readme_file = open("README.md")
try:
    readme_text = readme_file.read()
finally:
    readme_file.close()

setup(
    name='pyuca',
    version='1.0',
    description='Python Unicode Collation Algorithm (originally developed by James Tauber)',
    long_description=readme_text,
    # UTF-8 encoded byte string (Python 2 str) for the author's name.
    author='Denis Krienb\xc3\xbchl',
    author_email='denis.krienbuehl@gmail.com',
    url='https://github.com/href/Python-Unicode-Collation-Algorithm',
    packages=['pyuca'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 2',
        'Topic :: Software Development :: Internationalization'
    ],
)

0 commit comments

Comments
 (0)