
Initial commit

Abhinav Sarkar 9 years ago
commit 5b4f5b37b8

breathalyzer/Makefile (+8, -0)

@@ -0,0 +1,8 @@
+levenshtein.so: levenshtein_distance.c levenshtein.c
+	gcc -c -fPIC levenshtein_distance.c
+	gcc -c -fPIC -I/usr/include/python2.5/ levenshtein.c
+	gcc -shared levenshtein_distance.o levenshtein.o -o levenshtein.so
+	rm -f *.o
+
+clean:
+	rm -f *.o *.so *.pyc
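Note: the rule above links the hand-written C distance function with levenshtein.c (presumably the Cython output for levenshtein.pyx, further down) into a CPython extension module against the Python 2.5 headers. A minimal sanity check of the resulting module, not part of the commit, assuming levenshtein.so has been built and is importable (Python 2):

    from levenshtein import levenshtein

    # "kitten" -> "sitting" needs exactly three single-character edits.
    print levenshtein("kitten", "sitting")   # 3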

breathalyzer/breathalyzer (+143, -0)

@@ -0,0 +1,143 @@
+#!/usr/bin/python
+
+import sys
+import re
+from collections import deque
+from collections import defaultdict
+from levenshtein import levenshtein
+
+alphabet = "etaoinshrdlcumwfgypbvkjxqz"
+
+def read_post(post_file_path):
+    with open(post_file_path) as input_file:
+        return input_file.read().strip()
+
+def read_dictionary(dictionary_file_path):
+    with open(dictionary_file_path) as dictionary_file:
+        dictionary = set()
+        for line in dictionary_file:
+            dictionary.add(line.strip().lower())
+        return dictionary
+
+def bucket_dictionary(dictionary):
+    buckets = defaultdict(set)
+    for word in dictionary:
+        buckets[word[0]].add(word)
+    return buckets
+
+def words(text):
+    return re.findall("[a-z]+", text.lower())
+
+def splits(word):
+    return [(word[:i], word[i:]) for i in xrange(len(word) + 1)]
+
+def deletes(word_splits):
+    for a, b in word_splits:
+        if b:
+            yield a + b[1:]
+
+def replaces(word_splits):
+    for a, b in word_splits:
+        if b:
+            for c in alphabet:
+                yield a + c + b[1:]
+
+def inserts(word_splits):
+    for a, b in word_splits:
+        for c in alphabet:
+            yield a + c + b
+
+def edits(word):
+    word_splits = splits(word)
+    for w in deletes(word_splits):
+        yield w
+    for w in replaces(word_splits):
+        yield w
+    for w in inserts(word_splits):
+        yield w
+
+def align_dictionary(word, dictionary, buckets):
+    for w in buckets[word[0]]:
+        yield w
+    for (c, ws) in buckets.iteritems():
+        if c == word[0]:
+            continue
+        else:
+            for w in ws:
+                yield w
+
+def find_edit_distance(word, dictionary, buckets):
+    if word in dictionary:
+        return (word, 0)
+
+    #print word
+
+    #print "mutation"
+    mutation_limit = 1
+    queue = deque()
+    queue.appendleft((word, 0))
+
+    words_checked = 0
+    current_ed = 0
+    try:
+        while len(queue) != 0:
+            (w, edit_distance) = queue.pop()
+            for e in edits(w):
+                words_checked += 1
+                if (edit_distance + 1) > mutation_limit:
+                    current_ed = edit_distance + 1
+                    raise StopIteration
+                if e in dictionary:
+                    print "M: %s -> %s: %s" % (word, e, edit_distance + 1)
+                    #print "Words checked = %s" % words_checked
+                    return (e, edit_distance + 1)
+                else:
+                    #print "%s. %s: %s" % (i, e, edit_distance + 1)
+                    queue.appendleft((e, edit_distance + 1))
+    except StopIteration:
+        pass
+    #print "Words checked = %s" % words_checked
+
+    #print "SEARCH %s" % word
+    words_checked = 0
+    current_min = 1e38
+    nearest_word = None
+    for entry in align_dictionary(word, dictionary, buckets):
+        if abs(len(entry) - len(word)) > current_min:
+            continue
+
+        words_checked += 1
+        d = levenshtein(word, entry)
+        # print "%s: %s" % (entry, d)
+
+        if d < current_min:
+            current_min = d
+            nearest_word = entry
+        if current_min == current_ed:
+            #print ">> breaking"
+            break
+
+        #print "current_min = %s" % current_min
+    #print "Words checked = %s" % words_checked
+
+    print "S: %s -> %s: %s" % (word, nearest_word, current_min)
+    return (nearest_word, current_min)
+
+def score_post(post, dictionary, buckets):
+    #print post
+    corrections = {}
+    score = 0
+    for word in words(post):
+        if word in corrections:
+            #print "Found in corrections: %s" % word
+            (correct_word, edit_distance) = corrections[word]
+        else:
+            (correct_word, edit_distance) = find_edit_distance(word, dictionary, buckets)
+            corrections[word] = (correct_word, edit_distance)
+        score += edit_distance
+    return score
+
+if __name__ == "__main__":
+    dictionary_file_path = "/var/tmp/twl06.txt"
+    dictionary = read_dictionary(dictionary_file_path)
+    print score_post(read_post(sys.argv[1]), dictionary, bucket_dictionary(dictionary))
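The script corrects each word in two phases: a breadth-first walk over single-character edits (deletes, replaces, inserts) up to mutation_limit, then, if no dictionary word is reached, a scan over the dictionary with the compiled Levenshtein function, starting from the bucket that shares the word's first letter. As a rough feel for the branching factor, a small check of the helpers above, run in a Python 2 session with these functions in scope (not part of the commit):

    candidates = list(edits("cat"))
    # 3 deletes + 3*26 replaces + 4*26 inserts = 185 candidates, duplicates included
    # (replacing a letter with itself just regenerates "cat").
    print len(candidates)          # 185
    print "at" in candidates       # True: the leading "c" was deleted
    print "cart" in candidates     # True: an "r" was inserted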

breathalyzer/breathalyzer.py (+1, -0)

@@ -0,0 +1 @@
+breathalyzer

breathalyzer/data/11.in (+1, -0)

@@ -0,0 +1 @@
+a service to its users acebook would like to detect when wall posts are so besotted with errors that they cannot possibly be what the user meant to express he aforementioned wall post would be carefully preserved in a jar for future reference and its author requested to perform an online breathalyzer test for sobriety ou are challenged to write a program that can take a body of text and determine how many mistakes have been made in its composition peed and efficiency are paramount as this puzzle has restrictive bounds that may be narrower than prior puzzles

breathalyzer/data/187.in (+1, -0)

@@ -0,0 +1 @@
+orem ipsum dolor sit amet consectetur adipiscing elit nteger imperdiet elit et libero commodo et convallis est ultrices raesent faucibus ligula ullamcorper urna pellentesque faucibus liquam ultrices purus sit amet tellus malesuada malesuada hasellus varius faucibus nisl congue placerat mi suscipit vitae ivamus eu lorem mauris a elementum erat nteger a nisl sollicitudin mauris facilisis vehicula quis non erat tiam sit amet porta justo usce eget nisl ipsum am a ante neque egestas rhoncus urna orbi lectus lorem vehicula quis commodo sed scelerisque non diam enean enim quam sollicitudin vel dignissim et feugiat in risus orbi gravida urna in neque sollicitudin elementum nteger ut tortor lacus sed aliquam ipsum usce convallis purus at lobortis accumsan magna odio blandit orci sit amet semper ligula tortor sit amet nisi ellentesque luctus nisi ut placerat dictum massa libero suscipit mi id ullamcorper purus arcu at nunc t ut arcu orc

breathalyzer/data/in.in (+35, -0)

@@ -0,0 +1,35 @@
+accepttaqluqmtwon
lqkubnapffsut
+zancyvrllxeye qcntibfeqovyccotyt	llahtwiumyounvospolvy
rgarpeterpmzitibds	atydntxocygtxomjnas	zaddptvuicicisvts	rrutotmzaediekon	ribefmgtotpckvced
+izbesthrodtsdume	bpvewijpzthehrts billlfoavjogodk
+mblaczkdbiyrdz	tjblianhdnteodshbses bvuleapxldcy
edrslulrjcwackndet fratvlzmirng
+ouafbrohguegocs	bumbwnbgarliesxt	ovcrvselrtwoxcad
+catchpkrsshcqhas ceeilbednlyrxs
+cknvreplloiupdi cdjzhawkzcxgens ccyaqenssokzkhs
cslassuoifbuuhications uclozioimfaxkes	tofayouloatxevs
+cwlpcdyothsjnark
cdobevrmlocipnqe	mqlafuthqysumanfise
+cruqagoerposes
cmxudfrpdgfiel
cupbeehamgrpcehr dvyapbunxzted wyemdvfecelft qhodsaltzlveyv
gdeveitefpztiotn
edeoevoapuxiatosr
+ddoopoggonvcer
+elbiiecztrodialcshzs enrravizosxejhling
jeveenkjetcrkare enhexplhlzqmter fcpexndupostxulatfdie	codrcgtbyoregiczed fdqagvvouybrenr	wfekstcioscendsp
dmyipesrsormbs finsrskusrrzal flguenrlkeitns jfyabrmespbrke
+xcjggupiaeesr
+qgnlmauktjhiesot	kgyelaifonpas pgperriemgaztqircs fbggczcmaeanium
+gramgiwkcpndinls eggrwjpujnvehs hjounneaibs	hyoivihdkqmen	hxyhqspesjrsezuml htbsnpolwcoeigsys	ijbncmeslhepcd	insfjcqukkknlps
+intelfylecjtbnuzally jintietligentmaol
kreybepings
lrmxiwqaksh	iljmwkaayerbkings	ztlaeciej	juliairmbdums
+llonghefaktdded
ltauolragteidvwe	gyxriefstms xoxmmalrioriefocs
+makuezqaaiticicazhs	evgmaatmzjqboiks
ixgglcgeqax
amesoqtchjbzilsiza
+miadsmeiijhkicaly
+vqmiljlyirtanddiacn
+zmqiscartrdioaafwjge
gdclgyvbs hmhoamduljajue mhonmrecbxwkvucaally	krottcixzzb
gbmwuucyklvurktk	crfmhuflfdgukardl cmrualpisnerpnske	wnkijgifgtits	vsimnhoncokital
+qfgotxnpossesjsion ormpitfeimised
+joutbbrsazlqenez
+opouvxytlcomtpmyete
+jfriotuvtggparwn outkeypxobrd cojqvvzerfaqpadt botjvhiekuflzlew	ahonrmeryhrfoe wpkalbqpasttorhnzy
pyirnpkdtes psevnxtvrwsatoehs	phstdimjoxdmtnic
lhotxoidequecktor bdziropsciva	pmitomauncu
+plsctfnejvnks	ohplaeyrobwcerciopid	polymorpujicllymrlnu
prainevkehfjd	bplftearloxuobd	jcpquvjnagglvs iapauissyfooqhqters
+qhuscawdsdeatnat	jvujmjueurfe hexyraciyt
+radvuoteqmgemtetrwic rdrekfashqqiocivg
+xreuscfldlgs	vefjuecrskkubeisher
zruoinmearvisumwewed qeeijiepaerinv
rdeniyvftiziwns revelwgptorivn
+yresirqtdyenfcmies
+ruyensmebting
brerohkpbkiest rlojibtarirekihs
+vrufevbl rummabgemtzrs sacrramerlnstajl fsazprjmdlllohd
+sscvurmbeencvuxs
sspavatmzbxding qalcpeotoffh	mssejglvtesjom selsxlvnbeswsuntess
sapkfslnwigeb
tsgiheluxwfifheieys escrjhtettdlandc
mhunnnisnrrg
sofiveiwrsuidve
swbdwfmeeys	sklnugobgerzs sxdqneagahmnkg
+xsgrtosryhtyeilliweng
+syoubstitahueygivt sugararogbgusspt	tadracpoeopts teldsptffdy	tqwehstwihfyimkkg
ztqozrmtowxricihd
sjtkihdsenxtotgs
rtdriprarotputfe vltrahazjdrqdikkous
+uoniotmpaaqohabrly
kunhsiksllflybulmly wuettqekrqrofst vansliakzzidj vgzpluiisvejs	wvhuxeeelchnairbs cwyikreedqawrwork

breathalyzer/data/input (+1, -0)

@@ -0,0 +1 @@
+tihs sententcnes  iss nout    varrry goud

breathalyzer/data/lol.in (+1, -0)

@@ -0,0 +1 @@
+az a serviec ta eets uzrz, facebook wud lieks ta detect wehn wall postz iz sow besotteded wif errorz dat dey noes kan possibly b whut teh uzr meaned ta express teh aforementioneded wall post wud b kaerfoolly preserveded in a jar foar futurz reference adn eets author requesteded ta perform a onlien brefalyzah test foar sobriety ur challengeded ta riet a program dat kan taek a body uv text adn deturmien hao menny mistakez haz bein maeded in eets composishun speed adn efficiency iz paramount az dis puzzle haz restrictiev boundz dat may b narrowah than prior puzzlez

breathalyzer/data/spelling-tests.pl (+84, -0)

@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+use warnings;
+# cmdline args are, in order:
+$input=shift || "/var/tmp/twl06.txt"; #just a source for garbled input words, not necessarily the actual dict
+$maxdist=shift || 0; # applies this many edits, but edit dist will be <= this bound (may accidentally edit back closer to another dict word)
+$prob_dict_to_input=shift || .001; # portion of dict that is kept on average
+$seed=shift || 187;
+$pre=shift || ""; # prepend to every output word
+$suf=shift || ""; # append to every output word
+$also_orig=shift || 0; # if true => also generate original word^N times no spaces
+$first_char=shift || "a"; # try setting to "`", which is 1 less than a if you want to check non-alphabetic inputs.  anything outside range gets set to $first_char
+
+$last_char="z";
+
+srand($seed);
+$first_ord=ord $first_char;
+$n_ord=ord($last_char) - $first_ord + 1;
+$last_ord=$first_ord+$n_ord-1;
+
+sub clamp_ord {
+    my ($o)=@_;
+    ($o<$first_ord || $o >$last_ord) ? $first_ord : $o
+}
+sub clamp_str {
+    my ($s)=@_;
+#    pack("C*",map { clamp_ord($_) } unpack("C*",$s));
+    $s
+}
+sub maybe {
+    rand() <.2 ? 1 : 0
+}
+sub rand_ws {
+    &maybe ? " " : &maybe ? "\t" : &maybe ? "\n" : &maybe ? "\r" : &rand_ws
+}
+sub keep {
+    rand() < $prob_dict_to_input
+}
+sub rand_i {
+    int(rand $_[0]);
+}
+sub rand_ord {
+    &rand_i($n_ord) + $first_ord
+}
+sub rand_alph {
+    chr(&rand_ord)
+}
+sub rand_edit { # note: may produce an empty word, which will disappear in parsing
+    my ($s)=@_;
+    my ($i)=&rand_i(length $s);
+    substr($s,$i,&maybe)=&maybe?"":&rand_alph;
+    $s
+}
+sub rand_editn {
+    my ($s)=@_;
+    $s=&rand_edit($s) for(1..$maxdist);
+    $s
+}
+open(DICT,'<',$input) || die "no $input";
+$Nin=0;
+@words=();
+$prelen=0;
+while(<DICT>) {
+    while(/(\S+)/g) {
+        $word=clamp_str(lc($1));
+        ++$Nin;
+        if (&keep) { # choose first: consistent subset of words given seed
+            push @words,$word;
+        }
+    }
+}
+@words=("empty") unless scalar @words;
+$N=scalar @words;
+$postlen=0;
+for my $word (@words) {
+    $prelen+=length $word;
+    $wr=&rand_editn($word);
+    $postlen+=length $wr;
+    print $pre,$wr,$suf,&rand_ws;
+    $wx=$word x $also_orig;
+    print $wx,&rand_ws if $also_orig;
+}
+$avgpre=$prelen/$N;
+$avgpost=$postlen/$N;
+print STDERR "\n$N of $Nin possible words selected, $maxdist edits applied to each (avg length $avgpre => $avgpost).  Max total possible edit dist=".$N*$maxdist."\n";
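For example, a run like perl spelling-tests.pl /var/tmp/twl06.txt 2 .001 42 > garbled.in (arguments chosen here purely for illustration) would keep roughly one word in a thousand from the word list, apply up to two random single-character edits to each, write the garbled words separated by random whitespace to stdout, and print a summary (words kept, average lengths before and after, maximum possible total edit distance) to stderr.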

breathalyzer/data/twl06.txt (+178691, -0): file diff suppressed because it is too large


breathalyzer/levenshtein.c (+1299, -0): file diff suppressed because it is too large


breathalyzer/levenshtein.pyx (+5, -0)

@@ -0,0 +1,5 @@
+cdef extern from "levenshtein_distance.h":
+    int levenshtein_distance(char *s, char *t)
+
+def levenshtein(char* s1, char* s2):
+    return levenshtein_distance(s1, s2)
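The checked-in levenshtein.c (diff suppressed above) appears to be the Cython output for this .pyx file, which the Makefile then compiles by hand. An equivalent way to regenerate and build the extension would be a small distutils script; a sketch only, assuming Cython is installed (no setup.py exists in this commit):

    # setup.py: build the levenshtein extension straight from the .pyx source
    from distutils.core import setup
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    setup(
        cmdclass={"build_ext": build_ext},
        ext_modules=[Extension("levenshtein",
                               ["levenshtein.pyx", "levenshtein_distance.c"])],
    )

Running python setup.py build_ext --inplace would then drop levenshtein.so next to the breathalyzer script.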

breathalyzer/levenshtein_distance.c (+47, -0)

@@ -0,0 +1,47 @@
+#include <stdlib.h>
+#include <malloc.h>
+#include <string.h>
+
+/*Compute levenshtein distance between s and t*/
+int levenshtein_distance(char *s, char *t) {
+  //Step 1
+  int i, j, k, n, m, cost, *d, distance;
+  n = strlen(s);
+  m = strlen(t);
+  if (n != 0 && m != 0) {
+    d = malloc((sizeof(int)) * (m + 1) * (n + 1));
+    m++;
+    n++;
+    //Step 2
+    for (k = 0; k < n; k++)
+      d[k] = k;
+    for (k = 0; k < m; k++)
+      d[k*n] = k;
+    //Step 3 and 4
+    for (i = 1; i < n; i++)
+      for (j = 1; j < m; j++) {
+        //Step 5
+        if (s[i-1] == t[j-1])
+          cost = 0;
+        else
+          cost = 1;
+        //Step 6
+        d[j*n+i] = minimum(d[(j-1)*n + i] + 1, d[j*n+i-1] + 1, d[(j-1)*n + i - 1] + cost);
+      }
+    distance = d[n*m-1];
+    free(d);
+    return distance;
+  }
+  else
+    return -1; //a negative return value means that one or both strings are empty.
+}
+
+/*Gets the minimum of three values*/
+int minimum(int a, int b, int c) {
+  int min = a;
+  if (b < min)
+    min = b;
+  if (c < min)
+    min = c;
+  return min;
+}
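The function fills a flat (m+1) x (n+1) table with the usual dynamic-programming recurrence: each cell takes the cheapest of a deletion, an insertion, or a substitution (free when the characters match), and the answer is the last cell. A pure-Python rendering of the same recurrence, handy for cross-checking the extension (a sketch, not part of the commit):

    def levenshtein_py(s, t):
        n, m = len(s), len(t)
        if n == 0 or m == 0:
            return -1  # mirror the C code's convention for empty strings
        # d[j][i] is the distance between s[:i] and t[:j]
        d = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(n + 1):
            d[0][i] = i
        for j in range(m + 1):
            d[j][0] = j
        for j in range(1, m + 1):
            for i in range(1, n + 1):
                cost = 0 if s[i - 1] == t[j - 1] else 1
                d[j][i] = min(d[j - 1][i] + 1,         # insertion
                              d[j][i - 1] + 1,         # deletion
                              d[j - 1][i - 1] + cost)  # substitution (or match)
        return d[m][n]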

breathalyzer/levenshtein_distance.h (+3, -0)

@@ -0,0 +1,3 @@
+/* This file was automatically generated.  Do not edit! */
+int minimum(int a,int b,int c);
+int levenshtein_distance(char *s,char *t);

hoppity/Makefile (+2, -0)

@@ -0,0 +1,2 @@
+hoppity: hoppity.hs Makefile
+	ghc -O2 -o $@ $<
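Here $@ and $< are make's automatic variables for the target and the first prerequisite, so the recipe expands to ghc -O2 -o hoppity hoppity.hs.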

hoppity/hoppity.hs (+21, -0)

@@ -0,0 +1,21 @@
+module Main (main) where
+
+import System.Environment
+import Control.Monad
+import Data.Maybe
+
+hoppityfy :: Int -> Maybe String
+hoppityfy n
+  | n `mod` 3 == 0 && n `mod` 5 == 0 = Just "Hop"
+  | n `mod` 3 == 0 = Just "Hoppity"
+  | n `mod` 5 == 0 = Just "Hophop"
+  | otherwise = Nothing
+
+main :: IO ()
+main = do
+  [filename] <- getArgs
+  content <- readFile filename
+  let n = (read content) :: Int
+  let hops = catMaybes . map hoppityfy $ [1 .. n]
+  forM_ hops $ \h -> putStr . (++ "\n") $ h
+
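As a quick check of the guards above: for an input file containing 15, the program prints one word per line for 3, 5, 6, 9, 10, 12 and 15, namely Hoppity, Hophop, Hoppity, Hoppity, Hophop, Hoppity, Hop.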

liarliar/input (+12, -0)

@@ -0,0 +1,12 @@
+5
+Stephen   1
+Tommaso
+Tommaso   1
+Galileo
+Isaac     1
+Tommaso
+Galileo   1
+Tommaso
+George    2
+Isaac
+Stephen

liarliar/liarliar (+45, -0)

@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+import sys
+from collections import defaultdict
+from itertools import imap
+
+graph = defaultdict(set)
+
+input_file = open(sys.argv[1])
+no_of_veterans = input_file.readline().strip()
+
+for i in xrange(int(no_of_veterans)):
+    (veteran, no_of_liars) = input_file.readline().strip().split()
+    for j in xrange(int(no_of_liars)):
+        liar = input_file.readline().strip()
+        graph[veteran].add(liar)
+        graph[liar].add(veteran)
+
+input_file.close()
+
+def visit_node(node, mode, partition, seen_nodes):
+    #print "!%s" % node
+    if node not in seen_nodes:
+        partition[node] = mode
+        #print ">%s:%s" %  (node, mode)
+        seen_nodes.add(node)
+        for child_node in graph[node]:
+            visit_node(child_node, not mode, partition, seen_nodes)
+
+seen_nodes = set()
+partition = dict()
+for veteran in graph.keys():
+    #print  "^%s" % veteran
+    if veteran not in seen_nodes:
+        visit_node(veteran, True, partition, seen_nodes)
+
+def quantify(iterable, pred=bool):
+    "Count how many times the predicate is true"
+    return sum(imap(pred, iterable))
+
+#print graph
+#print partition
+truthers = quantify(partition.iteritems(), lambda (k,v):v)
+liars = len(partition) - truthers
+print "%s %s" % (max(truthers, liars), min(truthers, liars))
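The script two-colours the accusation graph with a depth-first search and prints the sizes of the two colour classes, largest first. On the bundled liarliar/input the graph is connected (edges Stephen-Tommaso, Tommaso-Galileo, Isaac-Tommaso, George-Isaac, George-Stephen), so the split is forced up to swapping sides: {Stephen, Galileo, Isaac} against {Tommaso, George}, and the output is 3 2.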
