
Initial commit

Abhinav Sarkar 9 years ago
commit 5b4f5b37b8

breathalyzer/Makefile (+8, -0)

@@ -0,0 +1,8 @@
+levenshtein.so: levenshtein_distance.c levenshtein.c
+	gcc -c -fPIC levenshtein_distance.c
+	gcc -c -fPIC -I/usr/include/python2.5/ levenshtein.c
+	gcc -shared levenshtein_distance.o levenshtein.o -o levenshtein.so
+	rm -f *.o
+
+clean:
+	rm -f *.o *.so *.pyc
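Note: the rule above links the hand-written C distance function with levenshtein.c (presumably the Cython output for levenshtein.pyx, further down) into a CPython extension module against the Python 2.5 headers. A minimal sanity check of the resulting module, not part of the commit, assuming levenshtein.so has been built and is importable (Python 2):

    from levenshtein import levenshtein

    # "kitten" -> "sitting" needs exactly three single-character edits.
    print levenshtein("kitten", "sitting")   # 3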

breathalyzer/breathalyzer (+143, -0)

@@ -0,0 +1,143 @@
+#!/usr/bin/python
+
+import sys
+import re
+from collections import deque
+from collections import defaultdict
+from levenshtein import levenshtein
+
+alphabet = "etaoinshrdlcumwfgypbvkjxqz"
+
+def read_post(post_file_path):
+    with open(post_file_path) as input_file:
+        return input_file.read().strip()
+
+def read_dictionary(dictionary_file_path):
+    with open(dictionary_file_path) as dictionary_file:
+        dictionary = set()
+        for line in dictionary_file:
+            dictionary.add(line.strip().lower())
+        return dictionary
+
+def bucket_dictionary(dictionary):
+    buckets = defaultdict(set)
+    for word in dictionary:
+        buckets[word[0]].add(word)
+    return buckets
+
+def words(text):
+    return re.findall("[a-z]+", text.lower())
+
+def splits(word):
+    return [(word[:i], word[i:]) for i in xrange(len(word) + 1)]
+
+def deletes(word_splits):
+    for a, b in word_splits:
+        if b:
+            yield a + b[1:]
+
+def replaces(word_splits):
+    for a, b in word_splits:
+        if b:
+            for c in alphabet:
+                yield a + c + b[1:]
+
+def inserts(word_splits):
+    for a, b in word_splits:
+        for c in alphabet:
+            yield a + c + b
+
+def edits(word):
+    word_splits = splits(word)
+    for w in deletes(word_splits):
+        yield w
+    for w in replaces(word_splits):
+        yield w
+    for w in inserts(word_splits):
+        yield w
+
+def align_dictionary(word, dictionary, buckets):
+    for w in buckets[word[0]]:
+        yield w
+    for (c, ws) in buckets.iteritems():
+        if c == word[0]:
+            continue
+        else:
+            for w in ws:
+                yield w
+
+def find_edit_distance(word, dictionary, buckets):
+    if word in dictionary:
+        return (word, 0)
+
+    #print word
+
+    #print "mutation"
+    mutation_limit = 1
+    queue = deque()
+    queue.appendleft((word, 0))
+
+    words_checked = 0
+    current_ed = 0
+    try:
+        while len(queue) != 0:
+            (w, edit_distance) = queue.pop()
+            for e in edits(w):
+                words_checked += 1
+                if (edit_distance + 1) > mutation_limit:
+                    current_ed = edit_distance + 1
+                    raise StopIteration
+                if e in dictionary:
+                    print "M: %s -> %s: %s" % (word, e, edit_distance + 1)
+                    #print "Words checked = %s" % words_checked
+                    return (e, edit_distance + 1)
+                else:
+                    #print "%s. %s: %s" % (i, e, edit_distance + 1)
+                    queue.appendleft((e, edit_distance + 1))
+    except StopIteration:
+        pass
+    #print "Words checked = %s" % words_checked
+
+    #print "SEARCH %s" % word
+    words_checked = 0
+    current_min = 1e38
+    nearest_word = None
+    for entry in align_dictionary(word, dictionary, buckets):
+        if abs(len(entry) - len(word)) > current_min:
+            continue
+
+        words_checked += 1
+        d = levenshtein(word, entry)
+        # print "%s: %s" % (entry, d)
+
+        if d < current_min:
+            current_min = d
+            nearest_word = entry
+        if current_min == current_ed:
+            #print ">> breaking"
+            break
+
+        #print "current_min = %s" % current_min
+    #print "Words checked = %s" % words_checked
+
+    print "S: %s -> %s: %s" % (word, nearest_word, current_min)
+    return (nearest_word, current_min)
+
+def score_post(post, dictionary, buckets):
+    #print post
+    corrections = {}
+    score = 0
+    for word in words(post):
+        if word in corrections:
+            #print "Found in corrections: %s" % word
+            (correct_word, edit_distance) = corrections[word]
+        else:
+            (correct_word, edit_distance) = find_edit_distance(word, dictionary, buckets)
+            corrections[word] = (correct_word, edit_distance)
+        score += edit_distance
+    return score
+
+if __name__ == "__main__":
+    dictionary_file_path = "/var/tmp/twl06.txt"
+    dictionary = read_dictionary(dictionary_file_path)
+    print score_post(read_post(sys.argv[1]), dictionary, bucket_dictionary(dictionary))
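The script corrects each word in two phases: a breadth-first walk over single-character edits (deletes, replaces, inserts) up to mutation_limit, then, if no dictionary word is reached, a scan over the dictionary with the compiled Levenshtein function, starting from the bucket that shares the word's first letter. As a rough feel for the branching factor, a small check of the helpers above, run in a Python 2 session with these functions in scope (not part of the commit):

    candidates = list(edits("cat"))
    # 3 deletes + 3*26 replaces + 4*26 inserts = 185 candidates, duplicates included
    # (replacing a letter with itself just regenerates "cat").
    print len(candidates)          # 185
    print "at" in candidates       # True: the leading "c" was deleted
    print "cart" in candidates     # True: an "r" was inserted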

breathalyzer/breathalyzer.py (+1, -0)

@@ -0,0 +1 @@
+breathalyzer

breathalyzer/data/11.in (+1, -0)

@@ -0,0 +1 @@
+a service to its users acebook would like to detect when wall posts are so besotted with errors that they cannot possibly be what the user meant to express he aforementioned wall post would be carefully preserved in a jar for future reference and its author requested to perform an online breathalyzer test for sobriety ou are challenged to write a program that can take a body of text and determine how many mistakes have been made in its composition peed and efficiency are paramount as this puzzle has restrictive bounds that may be narrower than prior puzzles

breathalyzer/data/187.in (+1, -0)

@@ -0,0 +1 @@
+orem ipsum dolor sit amet consectetur adipiscing elit nteger imperdiet elit et libero commodo et convallis est ultrices raesent faucibus ligula ullamcorper urna pellentesque faucibus liquam ultrices purus sit amet tellus malesuada malesuada hasellus varius faucibus nisl congue placerat mi suscipit vitae ivamus eu lorem mauris a elementum erat nteger a nisl sollicitudin mauris facilisis vehicula quis non erat tiam sit amet porta justo usce eget nisl ipsum am a ante neque egestas rhoncus urna orbi lectus lorem vehicula quis commodo sed scelerisque non diam enean enim quam sollicitudin vel dignissim et feugiat in risus orbi gravida urna in neque sollicitudin elementum nteger ut tortor lacus sed aliquam ipsum usce convallis purus at lobortis accumsan magna odio blandit orci sit amet semper ligula tortor sit amet nisi ellentesque luctus nisi ut placerat dictum massa libero suscipit mi id ullamcorper purus arcu at nunc t ut arcu orc

breathalyzer/data/in.in (+35, -0)

@@ -0,0 +1,35 @@
+accepttaqluqmtwon
lqkubnapffsut
+zancyvrllxeye qcntibfeqovyccotyt	llahtwiumyounvospolvy
rgarpeterpmzitibds	atydntxocygtxomjnas	zaddptvuicicisvts	rrutotmzaediekon	ribefmgtotpckvced
+izbesthrodtsdume	bpvewijpzthehrts billlfoavjogodk
+mblaczkdbiyrdz	tjblianhdnteodshbses bvuleapxldcy
edrslulrjcwackndet fratvlzmirng
+ouafbrohguegocs	bumbwnbgarliesxt	ovcrvselrtwoxcad
+catchpkrsshcqhas ceeilbednlyrxs
+cknvreplloiupdi cdjzhawkzcxgens ccyaqenssokzkhs
cslassuoifbuuhications uclozioimfaxkes	tofayouloatxevs
+cwlpcdyothsjnark
cdobevrmlocipnqe	mqlafuthqysumanfise
+cruqagoerposes
cmxudfrpdgfiel
cupbeehamgrpcehr dvyapbunxzted wyemdvfecelft qhodsaltzlveyv
gdeveitefpztiotn
edeoevoapuxiatosr
+ddoopoggonvcer
+elbiiecztrodialcshzs enrravizosxejhling
jeveenkjetcrkare enhexplhlzqmter fcpexndupostxulatfdie	codrcgtbyoregiczed fdqagvvouybrenr	wfekstcioscendsp
dmyipesrsormbs finsrskusrrzal flguenrlkeitns jfyabrmespbrke
+xcjggupiaeesr
+qgnlmauktjhiesot	kgyelaifonpas pgperriemgaztqircs fbggczcmaeanium
+gramgiwkcpndinls eggrwjpujnvehs hjounneaibs	hyoivihdkqmen	hxyhqspesjrsezuml htbsnpolwcoeigsys	ijbncmeslhepcd	insfjcqukkknlps
+intelfylecjtbnuzally jintietligentmaol
kreybepings
lrmxiwqaksh	iljmwkaayerbkings	ztlaeciej	juliairmbdums
+llonghefaktdded
ltauolragteidvwe	gyxriefstms xoxmmalrioriefocs
+makuezqaaiticicazhs	evgmaatmzjqboiks
ixgglcgeqax
amesoqtchjbzilsiza
+miadsmeiijhkicaly
+vqmiljlyirtanddiacn
+zmqiscartrdioaafwjge
gdclgyvbs hmhoamduljajue mhonmrecbxwkvucaally	krottcixzzb
gbmwuucyklvurktk	crfmhuflfdgukardl cmrualpisnerpnske	wnkijgifgtits	vsimnhoncokital
+qfgotxnpossesjsion ormpitfeimised
+joutbbrsazlqenez
+opouvxytlcomtpmyete
+jfriotuvtggparwn outkeypxobrd cojqvvzerfaqpadt botjvhiekuflzlew	ahonrmeryhrfoe wpkalbqpasttorhnzy
pyirnpkdtes psevnxtvrwsatoehs	phstdimjoxdmtnic
lhotxoidequecktor bdziropsciva	pmitomauncu
+plsctfnejvnks	ohplaeyrobwcerciopid	polymorpujicllymrlnu
prainevkehfjd	bplftearloxuobd	jcpquvjnagglvs iapauissyfooqhqters
+qhuscawdsdeatnat	jvujmjueurfe hexyraciyt
+radvuoteqmgemtetrwic rdrekfashqqiocivg
+xreuscfldlgs	vefjuecrskkubeisher
zruoinmearvisumwewed qeeijiepaerinv
rdeniyvftiziwns revelwgptorivn
+yresirqtdyenfcmies
+ruyensmebting
brerohkpbkiest rlojibtarirekihs
+vrufevbl rummabgemtzrs sacrramerlnstajl fsazprjmdlllohd
+sscvurmbeencvuxs
sspavatmzbxding qalcpeotoffh	mssejglvtesjom selsxlvnbeswsuntess
sapkfslnwigeb
tsgiheluxwfifheieys escrjhtettdlandc
mhunnnisnrrg
sofiveiwrsuidve
swbdwfmeeys	sklnugobgerzs sxdqneagahmnkg
+xsgrtosryhtyeilliweng
+syoubstitahueygivt sugararogbgusspt	tadracpoeopts teldsptffdy	tqwehstwihfyimkkg
ztqozrmtowxricihd
sjtkihdsenxtotgs
rtdriprarotputfe vltrahazjdrqdikkous
+uoniotmpaaqohabrly
kunhsiksllflybulmly wuettqekrqrofst vansliakzzidj vgzpluiisvejs	wvhuxeeelchnairbs cwyikreedqawrwork

breathalyzer/data/input (+1, -0)

@@ -0,0 +1 @@
+tihs sententcnes  iss nout    varrry goud

breathalyzer/data/lol.in (+1, -0)

@@ -0,0 +1 @@
+az a serviec ta eets uzrz, facebook wud lieks ta detect wehn wall postz iz sow besotteded wif errorz dat dey noes kan possibly b whut teh uzr meaned ta express teh aforementioneded wall post wud b kaerfoolly preserveded in a jar foar futurz reference adn eets author requesteded ta perform a onlien brefalyzah test foar sobriety ur challengeded ta riet a program dat kan taek a body uv text adn deturmien hao menny mistakez haz bein maeded in eets composishun speed adn efficiency iz paramount az dis puzzle haz restrictiev boundz dat may b narrowah than prior puzzlez

breathalyzer/data/spelling-tests.pl (+84, -0)

@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+use warnings;
+# cmdline args are, in order:
+$input=shift || "/var/tmp/twl06.txt"; #just a source for garbled input words, not necessarily the actual dict
+$maxdist=shift || 0; # applies this many edits, but edit dist will be <= this bound (may accidentally edit back closer to another dict word)
+$prob_dict_to_input=shift || .001; # portion of dict that is kept on average
+$seed=shift || 187;
+$pre=shift || ""; # prepend to every output word
+$suf=shift || ""; # append to every output word
+$also_orig=shift || 0; # if true => also generate original word^N times no spaces
+$first_char=shift || "a"; # try setting to "`", which is 1 less than a if you want to check non-alphabetic inputs.  anything outside range gets set to $first_char
+
+$last_char="z";
+
+srand($seed);
+$first_ord=ord $first_char;
+$n_ord=ord($last_char) - $first_ord + 1;
+$last_ord=$first_ord+$n_ord-1;
+
+sub clamp_ord {
+    my ($o)=@_;
+    ($o<$first_ord || $o >$last_ord) ? $first_ord : $o
+}
+sub clamp_str {
+    my ($s)=@_;
+#    pack("C*",map { clamp_ord($_) } unpack("C*",$s));
+    $s
+}
+sub maybe {
+    rand() <.2 ? 1 : 0
+}
+sub rand_ws {
+    &maybe ? " " : &maybe ? "\t" : &maybe ? "\n" : &maybe ? "\r" : &rand_ws
+}
+sub keep {
+    rand() < $prob_dict_to_input
+}
+sub rand_i {
+    int(rand $_[0]);
+}
+sub rand_ord {
+    &rand_i($n_ord) + $first_ord
+}
+sub rand_alph {
+    chr(&rand_ord)
+}
+sub rand_edit { # note: may produce an empty word, which will disappear in parsing
+    my ($s)=@_;
+    my ($i)=&rand_i(length $s);
+    substr($s,$i,&maybe)=&maybe?"":&rand_alph;
+    $s
+}
+sub rand_editn {
+    my ($s)=@_;
+    $s=&rand_edit($s) for(1..$maxdist);
+    $s
+}
+open(DICT,'<',$input) || die "no $input";
+$Nin=0;
+@words=();
+$prelen=0;
+while(<DICT>) {
+    while(/(\S+)/g) {
+        $word=clamp_str(lc($1));
+        ++$Nin;
+        if (&keep) { # choose first: consistent subset of words given seed
+            push @words,$word;
+        }
+    }
+}
+@words=("empty") unless scalar @words;
+$N=scalar @words;
+$postlen=0;
+for my $word (@words) {
+    $prelen+=length $word;
+    $wr=&rand_editn($word);
+    $postlen+=length $wr;
+    print $pre,$wr,$suf,&rand_ws;
+    $wx=$word x $also_orig;
+    print $wx,&rand_ws if $also_orig;
+}
+$avgpre=$prelen/$N;
+$avgpost=$postlen/$N;
+print STDERR "\n$N of $Nin possible words selected, $maxdist edits applied to each (avg length $avgpre => $avgpost).  Max total possible edit dist=".$N*$maxdist."\n";
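For example, a run like perl spelling-tests.pl /var/tmp/twl06.txt 2 .001 42 > garbled.in (arguments chosen here purely for illustration) would keep roughly one word in a thousand from the word list, apply up to two random single-character edits to each, write the garbled words separated by random whitespace to stdout, and print a summary (words kept, average lengths before and after, maximum possible total edit distance) to stderr.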

breathalyzer/data/twl06.txt (+178691, -0): file diff suppressed because it is too large


breathalyzer/levenshtein.c (+1299, -0): file diff suppressed because it is too large


breathalyzer/levenshtein.pyx (+5, -0)

@@ -0,0 +1,5 @@
+cdef extern from "levenshtein_distance.h":
+    int levenshtein_distance(char *s, char *t)
+
+def levenshtein(char* s1, char* s2):
+    return levenshtein_distance(s1, s2)
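The checked-in levenshtein.c (diff suppressed above) appears to be the Cython output for this .pyx file, which the Makefile then compiles by hand. An equivalent way to regenerate and build the extension would be a small distutils script; a sketch only, assuming Cython is installed (no setup.py exists in this commit):

    # setup.py: build the levenshtein extension straight from the .pyx source
    from distutils.core import setup
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    setup(
        cmdclass={"build_ext": build_ext},
        ext_modules=[Extension("levenshtein",
                               ["levenshtein.pyx", "levenshtein_distance.c"])],
    )

Running python setup.py build_ext --inplace would then drop levenshtein.so next to the breathalyzer script.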

breathalyzer/levenshtein_distance.c (+47, -0)

@@ -0,0 +1,47 @@
+#include <stdlib.h>
+#include <malloc.h>
+#include <string.h>
+
+/*Compute levenshtein distance between s and t*/
+int levenshtein_distance(char *s, char *t) {
+  //Step 1
+  int i, j, k, n, m, cost, *d, distance;
+  n = strlen(s);
+  m = strlen(t);
+  if (n != 0 && m != 0) {
+    d = malloc((sizeof(int)) * (m + 1) * (n + 1));
+    m++;
+    n++;
+    //Step 2
+    for (k = 0; k < n; k++)
+      d[k] = k;
+    for (k = 0; k < m; k++)
+      d[k*n] = k;
+    //Step 3 and 4
+    for (i = 1; i < n; i++)
+      for (j = 1; j < m; j++) {
+        //Step 5
+        if (s[i-1] == t[j-1])
+          cost = 0;
+        else
+          cost = 1;
+        //Step 6
+        d[j*n+i] = minimum(d[(j-1)*n + i] + 1, d[j*n+i-1] + 1, d[(j-1)*n + i - 1] + cost);
+      }
+    distance = d[n*m-1];
+    free(d);
+    return distance;
+  }
+  else
+    return -1; //a negative return value means that one or both strings are empty.
+}
+
+/*Gets the minimum of three values*/
+int minimum(int a, int b, int c) {
+  int min = a;
+  if (b < min)
+    min = b;
+  if (c < min)
+    min = c;
+  return min;
+}
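The function fills a flat (m+1) x (n+1) table with the usual dynamic-programming recurrence: each cell takes the cheapest of a deletion, an insertion, or a substitution (free when the characters match), and the answer is the last cell. A pure-Python rendering of the same recurrence, handy for cross-checking the extension (a sketch, not part of the commit):

    def levenshtein_py(s, t):
        n, m = len(s), len(t)
        if n == 0 or m == 0:
            return -1  # mirror the C code's convention for empty strings
        # d[j][i] is the distance between s[:i] and t[:j]
        d = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(n + 1):
            d[0][i] = i
        for j in range(m + 1):
            d[j][0] = j
        for j in range(1, m + 1):
            for i in range(1, n + 1):
                cost = 0 if s[i - 1] == t[j - 1] else 1
                d[j][i] = min(d[j - 1][i] + 1,         # insertion
                              d[j][i - 1] + 1,         # deletion
                              d[j - 1][i - 1] + cost)  # substitution (or match)
        return d[m][n]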

breathalyzer/levenshtein_distance.h (+3, -0)

@@ -0,0 +1,3 @@
+/* This file was automatically generated.  Do not edit! */
+int minimum(int a,int b,int c);
+int levenshtein_distance(char *s,char *t);

hoppity/Makefile (+2, -0)

@@ -0,0 +1,2 @@
+hoppity: hoppity.hs Makefile
+	ghc -O2 -o $@ $<
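Here $@ and $< are make's automatic variables for the target and the first prerequisite, so the recipe expands to ghc -O2 -o hoppity hoppity.hs.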

hoppity/hoppity.hs (+21, -0)

@@ -0,0 +1,21 @@
+module Main (main) where
+
+import System.Environment
+import Control.Monad
+import Data.Maybe
+
+hoppityfy :: Int -> Maybe String
+hoppityfy n
+  | n `mod` 3 == 0 && n `mod` 5 == 0 = Just "Hop"
+  | n `mod` 3 == 0 = Just "Hoppity"
+  | n `mod` 5 == 0 = Just "Hophop"
+  | otherwise = Nothing
+
+main :: IO ()
+main = do
+  [filename] <- getArgs
+  content <- readFile filename
+  let n = (read content) :: Int
+  let hops = catMaybes . map hoppityfy $ [1 .. n]
+  forM_ hops $ \h -> putStr . (++ "\n") $ h
+
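As a quick check of the guards above: for an input file containing 15, the program prints one word per line for 3, 5, 6, 9, 10, 12 and 15, namely Hoppity, Hophop, Hoppity, Hoppity, Hophop, Hoppity, Hop.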

liarliar/input (+12, -0)

@@ -0,0 +1,12 @@
+5
+Stephen   1
+Tommaso
+Tommaso   1
+Galileo
+Isaac     1
+Tommaso
+Galileo   1
+Tommaso
+George    2
+Isaac
+Stephen

liarliar/liarliar (+45, -0)

@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+import sys
+from collections import defaultdict
+from itertools import imap
+
+graph = defaultdict(set)
+
+input_file = open(sys.argv[1])
+no_of_veterans = input_file.readline().strip()
+
+for i in xrange(int(no_of_veterans)):
+    (veteran, no_of_liars) = input_file.readline().strip().split()
+    for j in xrange(int(no_of_liars)):
+        liar = input_file.readline().strip()
+        graph[veteran].add(liar)
+        graph[liar].add(veteran)
+
+input_file.close()
+
+def visit_node(node, mode, partition, seen_nodes):
+    #print "!%s" % node
+    if node not in seen_nodes:
+        partition[node] = mode
+        #print ">%s:%s" %  (node, mode)
+        seen_nodes.add(node)
+        for child_node in graph[node]:
+            visit_node(child_node, not mode, partition, seen_nodes)
+
+seen_nodes = set()
+partition = dict()
+for veteran in graph.keys():
+    #print  "^%s" % veteran
+    if veteran not in seen_nodes:
+        visit_node(veteran, True, partition, seen_nodes)
+
+def quantify(iterable, pred=bool):
+    "Count how many times the predicate is true"
+    return sum(imap(pred, iterable))
+
+#print graph
+#print partition
+truthers = quantify(partition.iteritems(), lambda (k,v):v)
+liars = len(partition) - truthers
+print "%s %s" % (max(truthers, liars), min(truthers, liars))
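The script two-colours the accusation graph with a depth-first search and prints the sizes of the two colour classes, largest first. On the bundled liarliar/input the graph is connected (edges Stephen-Tommaso, Tommaso-Galileo, Isaac-Tommaso, George-Isaac, George-Stephen), so the split is forced up to swapping sides: {Stephen, Galileo, Isaac} against {Tommaso, George}, and the output is 3 2.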
