144 lines
4.0 KiB
Python
Executable File
144 lines
4.0 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
import sys
|
|
import re
|
|
from collections import deque
|
|
from collections import defaultdict
|
|
from levenshtein import levenshtein
|
|
|
|
alphabet = "etaoinshrdlcumwfgypbvkjxqz"
|
|
|
|
def read_post(post_file_path):
|
|
with open(post_file_path) as input_file:
|
|
return input_file.read().strip()
|
|
|
|
def read_dictionary(dictionary_file_path):
|
|
with open(dictionary_file_path) as dictionary_file:
|
|
dictionary = set()
|
|
for line in dictionary_file:
|
|
dictionary.add(line.strip().lower())
|
|
return dictionary
|
|
|
|
def bucket_dictionary(dictionary):
|
|
buckets = defaultdict(set)
|
|
for word in dictionary:
|
|
buckets[word[0]].add(word)
|
|
return buckets
|
|
|
|
def words(text):
|
|
return re.findall("[a-z]+", text.lower())
|
|
|
|
def splits(word):
|
|
return [(word[:i], word[i:]) for i in xrange(len(word) + 1)]
|
|
|
|
def deletes(word_splits):
|
|
for a, b in word_splits:
|
|
if b:
|
|
yield a + b[1:]
|
|
|
|
def replaces(word_splits):
|
|
for a, b in word_splits:
|
|
if b:
|
|
for c in alphabet:
|
|
yield a + c + b[1:]
|
|
|
|
def inserts(word_splits):
|
|
for a, b in word_splits:
|
|
for c in alphabet:
|
|
yield a + c + b
|
|
|
|
def edits(word):
|
|
word_splits = splits(word)
|
|
for w in deletes(word_splits):
|
|
yield w
|
|
for w in replaces(word_splits):
|
|
yield w
|
|
for w in inserts(word_splits):
|
|
yield w
|
|
|
|
def align_dictionary(word, dictionary, buckets):
|
|
for w in buckets[word[0]]:
|
|
yield w
|
|
for (c, ws) in buckets.iteritems():
|
|
if c == word[0]:
|
|
continue
|
|
else:
|
|
for w in ws:
|
|
yield w
|
|
|
|
def find_edit_distance(word, dictionary, buckets):
|
|
if word in dictionary:
|
|
return (word, 0)
|
|
|
|
#print word
|
|
|
|
#print "mutation"
|
|
mutation_limit = 1
|
|
queue = deque()
|
|
queue.appendleft((word, 0))
|
|
|
|
words_checked = 0
|
|
current_ed = 0
|
|
try:
|
|
while len(queue) != 0:
|
|
(w, edit_distance) = queue.pop()
|
|
for e in edits(w):
|
|
words_checked += 1
|
|
if (edit_distance + 1) > mutation_limit:
|
|
current_ed = edit_distance + 1
|
|
raise StopIteration
|
|
if e in dictionary:
|
|
print "M: %s -> %s: %s" % (word, e, edit_distance + 1)
|
|
#print "Words checked = %s" % words_checked
|
|
return (e, edit_distance + 1)
|
|
else:
|
|
#print "%s. %s: %s" % (i, e, edit_distance + 1)
|
|
queue.appendleft((e, edit_distance + 1))
|
|
except StopIteration:
|
|
pass
|
|
#print "Words checked = %s" % words_checked
|
|
|
|
#print "SEARCH %s" % word
|
|
words_checked = 0
|
|
current_min = 1e38
|
|
nearest_word = None
|
|
for entry in align_dictionary(word, dictionary, buckets):
|
|
if abs(len(entry) - len(word)) > current_min:
|
|
continue
|
|
|
|
words_checked += 1
|
|
d = levenshtein(word, entry)
|
|
# print "%s: %s" % (entry, d)
|
|
|
|
if d < current_min:
|
|
current_min = d
|
|
nearest_word = entry
|
|
if current_min == current_ed:
|
|
#print ">> breaking"
|
|
break
|
|
|
|
#print "current_min = %s" % current_min
|
|
#print "Words checked = %s" % words_checked
|
|
|
|
print "S: %s -> %s: %s" % (word, nearest_word, current_min)
|
|
return (nearest_word, current_min)
|
|
|
|
def score_post(post, dictionary, buckets):
|
|
#print post
|
|
corrections = {}
|
|
score = 0
|
|
for word in words(post):
|
|
if word in corrections:
|
|
#print "Found in corrections: %s" % word
|
|
(correct_word, edit_distance) = corrections[word]
|
|
else:
|
|
(correct_word, edit_distance) = find_edit_distance(word, dictionary, buckets)
|
|
corrections[word] = (correct_word, edit_distance)
|
|
score += edit_distance
|
|
return score
|
|
|
|
if __name__ == "__main__":
|
|
dictionary_file_path = "/var/tmp/twl06.txt"
|
|
dictionary = read_dictionary(dictionary_file_path)
|
|
print score_post(read_post(sys.argv[1]), dictionary, bucket_dictionary(dictionary))
|