-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathbitap.py
156 lines (134 loc) · 6.06 KB
/
bitap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# -*- coding: utf-8 -*-
import sys
"""Auxiliary procedure for printing each item of row in columns in binary form
"""
def _printTable(t, size):
out = ""
for i in range(len(t)):
binaryForm = bin(t[i])
binaryForm = binaryForm[2 : ]
binaryForm = binaryForm.zfill(size)
out += binaryForm + ", "
out = out[ : -2]
print out
"""Bitap (Shift-Or) fuzzy searching algorithm with Wu-Manber modifications.
http://habrahabr.ru/post/114997/
http://habrahabr.ru/post/132128/
http://ru.wikipedia.org/wiki/Двоичный_алгоритм_поиска_подстроки
Search needle(pattern) in haystack(real word from text) with maximum alterations = maxErrors.
If maxErrors equal 0 - execute precise searching only.
Return approximately place of needle in haystack and number of alterations.
If needle can't find with maxErrors alterations, return tuple of empty string and -1.
"""
def bitapSearch(haystack, needle, maxErrors):
haystackLen = len(haystack)
needleLen = len(needle)
"""Genarating mask for each letter in haystack.
This mask shows presence letter in needle.
"""
def _generateAlphabet(needle, haystack):
alphabet = {}
for letter in haystack:
if letter not in alphabet:
letterPositionInNeedle = 0
for symbol in needle:
letterPositionInNeedle = letterPositionInNeedle << 1
letterPositionInNeedle |= int(letter != symbol)
alphabet[letter] = letterPositionInNeedle
return alphabet
alphabet = _generateAlphabet(needle, haystack)
table = [] # first index - over k (errors count, numeration starts from 1), second - over columns (letters of haystack)
emptyColumn = (2 << (needleLen - 1)) - 1
# Generate underground level of table
underground = []
[underground.append(emptyColumn) for i in range(haystackLen + 1)]
table.append(underground)
_printTable(table[0], needleLen)
# Execute precise matching
k = 1
table.append([emptyColumn])
for columnNum in range(1, haystackLen + 1):
prevColumn = (table[k][columnNum - 1]) >> 1
letterPattern = alphabet[haystack[columnNum - 1]]
curColumn = prevColumn | letterPattern
table[k].append(curColumn)
if (curColumn & 0x1) == 0:
place = haystack[columnNum - needleLen : columnNum]
return (place, k - 1)
_printTable(table[k], needleLen)
# Execute fuzzy searching with calculation Levenshtein distance
for k in range(2, maxErrors + 2):
print "Errors =", k - 1
table.append([emptyColumn])
for columnNum in range(1, haystackLen + 1):
prevColumn = (table[k][columnNum - 1]) >> 1
letterPattern = alphabet[haystack[columnNum - 1]]
curColumn = prevColumn | letterPattern
insertColumn = curColumn & (table[k - 1][columnNum - 1])
deleteColumn = curColumn & (table[k - 1][columnNum] >> 1)
replaceColumn = curColumn & (table[k - 1][columnNum - 1] >> 1)
resColumn = insertColumn & deleteColumn & replaceColumn
table[k].append(resColumn)
if (resColumn & 0x1) == 0:
startPos = max(0, columnNum - needleLen - 1) # taking in account Replace operation
endPos = min(columnNum + 1, haystackLen) # taking in account Replace operation
place = haystack[startPos : endPos]
return (place, k - 1)
_printTable(table[k], needleLen)
return ("", -1)
"""Highlight letters in fullWord, which concur with letters in pattern with same order.
wordPart - it's a part of fullWord, where matching with pattern letters will execute.
"""
class bitapHighlighter():
def __init__(self, fullWord, wordPart, pattern):
self._fullWord = fullWord
self._wordPart = wordPart
self._pattern = pattern
self._largestSequence = ""
"""Finding longest sequence of letters in word. Letters must have same order, as in pattern
"""
def _nextSequence(self, fromPatternPos, fromWordPos, prevSequence):
for patternPos in range(fromPatternPos, len(self._pattern)):
char = self._pattern[patternPos]
for wordPos in range(fromWordPos, len(self._wordPart)):
if char == self._wordPart[wordPos]:
sequence = prevSequence + char
self._nextSequence(patternPos + 1, wordPos + 1, sequence)
if len(self._largestSequence) < len(prevSequence):
self._largestSequence = prevSequence
"""Divide fullWord on parts: head, place(wordPart) and tail.
Select each letter of wordPart, which present in _largestSequence with <b></b> tags
Return gathered parts in one highlighted full word
"""
def _gatherFullWord(self):
placePos = self._fullWord.find(self._wordPart)
head = self._fullWord[0 : placePos]
tail = self._fullWord[placePos + len(self._wordPart) : ]
highlightedPlace = ""
for symbol in self._wordPart:
if symbol == self._largestSequence[0 : 1]:
highlightedPlace += "<b>" + symbol + "</b>"
self._largestSequence = self._largestSequence[1 : ]
else:
highlightedPlace += symbol
return head + highlightedPlace + tail
"""Run highlighting and return highlited word.
"""
def getHighlightedWord(self):
self._nextSequence(0, 0, "")
return self._gatherFullWord()
haystack = sys.argv[1]
needle = sys.argv[2]
errorsCount = sys.argv[3]
print "haystack = " + haystack + ". needle = " + needle + ". errorsCount = " + errorsCount
# Display letters of haystack in columns
out = ""
out = out.ljust(len(needle) + 2)
for i in range(len(haystack)):
out += haystack[i].ljust(len(needle)) + " "
out = out[ : -2]
print out
# Start bitap searching
(needlePlace, errors) = bitapSearch(haystack, needle, int(errorsCount))
print "Result of Bitap searching:", needlePlace, errors
print bitapHighlighter(haystack, needlePlace, needle).getHighlightedWord()