1+ # pyuca - Unicode Collation Algorithm
2+ # Version: 2006-02-13
3+ #
4+ # James Tauber
5+ # http://jtauber.com/
6+
7+ # Copyright (c) 2006 James Tauber
8+ #
9+ # Permission is hereby granted, free of charge, to any person obtaining a copy
10+ # of this software and associated documentation files (the "Software"), to deal
11+ # in the Software without restriction, including without limitation the rights
12+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13+ # copies of the Software, and to permit persons to whom the Software is
14+ # furnished to do so, subject to the following conditions:
15+ #
16+ # The above copyright notice and this permission notice shall be included in
17+ # all copies or substantial portions of the Software.
18+ #
19+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25+ # THE SOFTWARE.
26+
27+
28+ """
29+ Preliminary implementation of the Unicode Collation Algorithm.
30+
31+
32+ This only implements the simple parts of the algorithm but I have successfully
33+ tested it using the Default Unicode Collation Element Table (DUCET) to collate
34+ Ancient Greek correctly.
35+
36+ Usage example:
37+
38+ from pyuca import Collator
39+ c = Collator("allkeys.txt")
40+
41+ sorted_words = sorted(words, key=c.sort_key)
42+
43+ allkeys.txt (1 MB) is available at
44+
45+ http://www.unicode.org/Public/UCA/latest/allkeys.txt
46+
47+ but you can always subset this for just the characters you are dealing with.
48+ """
49+
50+
class Trie:
    """Prefix tree mapping key sequences to values.

    Every node is a two-item list ``[value, children]``: ``value`` is the
    payload stored for the key that ends at the node (``None`` when no key
    ends there) and ``children`` maps the next key part to its child node.
    """

    def __init__(self):
        self.root = [None, {}]

    def add(self, key, value):
        """Store *value* under *key*, replacing any value already stored.

        key   -- iterable of hashable parts
        value -- payload to associate with the key; ``None`` is
                 indistinguishable from "no value" when looking up
        """
        node = self.root
        for part in key:
            # Create the child node on first use, then descend into it.
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Return ``(value, remainder)`` for the longest stored prefix of *key*.

        key -- sliceable sequence of parts (list, tuple or string)

        ``value`` is the payload of the longest prefix of *key* that has a
        value, and ``remainder`` is the part of *key* after that prefix.
        When no prefix of *key* has a value the result is ``(None, key)``
        (or the root's value, if something was stored under the empty key).
        """
        node = self.root
        best_value = node[0]
        best_length = 0
        consumed = 0
        for part in key:
            children = node[1]
            if part not in children:
                break
            node = children[part]
            consumed += 1
            # Only nodes that actually hold a value count as a match; an
            # intermediate node on the way to a longer key must not hide a
            # shorter key that did match.
            if node[0] is not None:
                best_value = node[0]
                best_length = consumed
        # Slice once at the end instead of re-slicing on every step.
        return (best_value, key[best_length:])
71+
72+
class Collator:
    """Builds sort keys for strings from a collation element table file.

    The table is read once at construction time into ``self.table`` and is
    then consulted by ``sort_key``.
    """

    def __init__(self, filename):
        self.table = Trie()
        self.load(filename)

    def load(self, filename):
        """Read the collation entries in the file *filename* into the table.

        Blank lines, comments (introduced by ``#`` or ``%``) and lines
        starting with ``@`` are skipped.  Every other line is expected to
        look like::

            <hex code points> ; [<flag><hex>.<hex>...][...]...

        Each entry is stored under the list of its integer code points as a
        list of ``(flag, [hex weight strings])`` tuples, one per bracketed
        collation element.

        Raises ValueError for an entry line without ``;``, with an
        unterminated ``[`` or with a code point that is not hexadecimal.
        """
        table_file = open(filename)
        # try/finally rather than ``with`` so the file is always closed while
        # the module stays usable on interpreters without the with statement.
        try:
            for line in table_file:
                # Cut trailing comments by splitting; this is also correct
                # for a final line that has no newline and no comment.
                line = line.split("#", 1)[0].split("%", 1)[0].strip()
                if not line or line.startswith("@"):
                    continue

                semicolon = line.find(";")
                if semicolon == -1:
                    raise ValueError("malformed collation entry: %r" % line)
                code_points = [int(cp, 16) for cp in line[:semicolon].split()]

                fields = line[semicolon + 1:]
                elements = []
                start = fields.find("[")
                while start != -1:
                    end = fields.find("]", start)
                    if end == -1:
                        raise ValueError(
                            "unterminated collation element: %r" % line)
                    element = fields[start:end + 1]
                    # element[1] is the one-character flag after "[" ("." or
                    # "*" in DUCET files); the rest, up to "]", is the
                    # dot-separated list of hexadecimal weights.
                    elements.append((element[1], element[2:-1].split(".")))
                    start = fields.find("[", end + 1)

                self.table.add(code_points, elements)
        finally:
            table_file.close()

    def sort_key(self, string):
        """Return the sort key of *string* as a tuple of integers.

        The string is split into the longest code point sequences known to
        the table and their collation elements are concatenated.  The key
        then lists, level by level for four levels, every non-zero weight of
        those elements, with a single 0 separating consecutive levels.
        Elements that carry fewer than four weights simply contribute
        nothing to the levels they lack.

        Raises ValueError, carrying the not yet collated code points as hex
        strings, when the table has no entry for the next code point.
        """
        collation_elements = []

        lookup_key = [ord(ch) for ch in string]
        while lookup_key:
            value, remainder = self.table.find_prefix(lookup_key)
            # No entry at all, or an entry that consumed nothing (which would
            # otherwise make this loop spin forever).
            if not value or len(remainder) >= len(lookup_key):
                # @@@ unknown code points are not handled yet
                raise ValueError([hex(cp) for cp in lookup_key])
            collation_elements.extend(value)
            lookup_key = remainder

        sort_key = []

        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for element in collation_elements:
                weights = element[1]
                if level < len(weights):
                    weight = int(weights[level], 16)
                    if weight:
                        sort_key.append(weight)

        return tuple(sort_key)
0 commit comments