Initial commit

This commit is contained in:
Abhinav Sarkar 2011-06-27 16:39:38 +05:30
commit 5b4f5b37b8
18 changed files with 180400 additions and 0 deletions

8
breathalyzer/Makefile Normal file
View File

@ -0,0 +1,8 @@
# Build the levenshtein CPython extension module from the C sources.
# Fix: levenshtein_distance.c must also be compiled with -fPIC -- linking a
# non-position-independent object into a shared library fails at link time
# on x86-64 with a relocation error.
levenshtein.so: levenshtein_distance.c levenshtein.c
	gcc -c -fPIC levenshtein_distance.c
	gcc -c -fPIC -I/usr/include/python2.5/ levenshtein.c
	gcc -shared levenshtein_distance.o levenshtein.o -o levenshtein.so
	rm -f *.o

# Remove every build product (objects, shared library, compiled Python files).
clean:
	rm -f *.o *.so *.pyc

143
breathalyzer/breathalyzer Executable file
View File

@ -0,0 +1,143 @@
#!/usr/bin/python
import sys
import re
from collections import deque
from collections import defaultdict
from levenshtein import levenshtein
# Candidate letters for generating mutations, ordered by (approximate)
# English letter frequency ("etaoin shrdlu...") so likelier substitutions
# are tried first.
alphabet = "etaoinshrdlcumwfgypbvkjxqz"
def read_post(post_file_path):
    """Return the text of the wall post at post_file_path, stripped of
    leading/trailing whitespace."""
    with open(post_file_path) as post_file:
        contents = post_file.read()
    return contents.strip()
def read_dictionary(dictionary_file_path):
    """Load the word list (one word per line) as a set of lower-cased words."""
    with open(dictionary_file_path) as dictionary_file:
        return {line.strip().lower() for line in dictionary_file}
def bucket_dictionary(dictionary):
    """Index the dictionary by first letter: letter -> set of words."""
    buckets = defaultdict(set)
    for entry in dictionary:
        buckets[entry[0]].add(entry)
    return buckets
def words(text):
    """Split text into its maximal runs of letters, lower-cased."""
    lowered = text.lower()
    return re.findall("[a-z]+", lowered)
def splits(word):
    """All (head, tail) partitions of word, including empty head and tail."""
    pairs = []
    for cut in range(len(word) + 1):
        pairs.append((word[:cut], word[cut:]))
    return pairs
def deletes(word_splits):
    """Yield each word obtained by removing one character."""
    for head, tail in word_splits:
        if not tail:
            continue
        yield head + tail[1:]
def replaces(word_splits):
    """Yield each word obtained by substituting one character with a letter
    from the module-level alphabet."""
    for head, tail in word_splits:
        if not tail:
            continue
        rest = tail[1:]
        for letter in alphabet:
            yield head + letter + rest
def inserts(word_splits):
    """Yield each word obtained by inserting one letter from the
    module-level alphabet at any position."""
    for head, tail in word_splits:
        for letter in alphabet:
            yield head + letter + tail
def edits(word):
    """Yield every word exactly one edit away from word, in the order
    deletions, substitutions, insertions."""
    word_splits = splits(word)
    for mutate in (deletes, replaces, inserts):
        for candidate in mutate(word_splits):
            yield candidate
def align_dictionary(word, dictionary, buckets):
    """Yield all dictionary words, starting with the bucket that shares
    word's first letter (those are likelier to be close matches)."""
    first = word[0]
    for candidate in buckets[first]:
        yield candidate
    for letter, bucket in buckets.items():
        if letter == first:
            continue
        for candidate in bucket:
            yield candidate
def find_edit_distance(word, dictionary, buckets):
    """Return (closest_dictionary_word, edit_distance) for word.

    Phase 1 (mutation search): breadth-first search over single-edit
    mutations of word, at most mutation_limit edits deep; returns the
    first mutation found in the dictionary.
    Phase 2 (dictionary scan): if phase 1 found nothing, scan the whole
    dictionary (same-first-letter bucket first) with the C levenshtein()
    and keep the entry with the smallest distance.
    """
    if word in dictionary:
        return (word, 0)
    #print word
    #print "mutation"
    # Phase 1 explores mutations at most this many edits deep.
    mutation_limit = 1
    queue = deque()
    queue.appendleft((word, 0))
    words_checked = 0
    current_ed = 0
    try:
        while len(queue) != 0:
            (w, edit_distance) = queue.pop()
            for e in edits(w):
                words_checked += 1
                if (edit_distance + 1) > mutation_limit:
                    # Past the depth limit: record the depth reached and
                    # abort both loops at once via StopIteration.
                    current_ed = edit_distance + 1
                    raise StopIteration
                if e in dictionary:
                    print "M: %s -> %s: %s" % (word, e, edit_distance + 1)
                    #print "Words checked = %s" % words_checked
                    return (e, edit_distance + 1)
                else:
                    #print "%s. %s: %s" % (i, e, edit_distance + 1)
                    queue.appendleft((e, edit_distance + 1))
    except StopIteration:
        pass
    #print "Words checked = %s" % words_checked
    #print "SEARCH %s" % word
    words_checked = 0
    current_min = 1e38  # effectively +infinity
    nearest_word = None
    for entry in align_dictionary(word, dictionary, buckets):
        # The length difference is a lower bound on the edit distance, so
        # entries that cannot beat the current best are skipped cheaply.
        if abs(len(entry) - len(word)) > current_min:
            continue
        words_checked += 1
        d = levenshtein(word, entry)
        # print "%s: %s" % (entry, d)
        if d < current_min:
            current_min = d
            nearest_word = entry
            # Phase 1 proved no word is within mutation_limit edits, so
            # current_ed is the best achievable distance; stop on reaching it.
            if current_min == current_ed:
                #print ">> breaking"
                break
    #print "current_min = %s" % current_min
    #print "Words checked = %s" % words_checked
    print "S: %s -> %s: %s" % (word, nearest_word, current_min)
    return (nearest_word, current_min)
def score_post(post, dictionary, buckets):
    """Total number of single-character edits needed to turn every word of
    post into a dictionary word; repeated words are looked up once."""
    #print post
    corrections = {}
    score = 0
    for word in words(post):
        try:
            correct_word, edit_distance = corrections[word]
        except KeyError:
            correct_word, edit_distance = find_edit_distance(word, dictionary, buckets)
            corrections[word] = (correct_word, edit_distance)
        score += edit_distance
    return score
if __name__ == "__main__":
    # Dictionary location is fixed by the puzzle environment.
    dictionary_file_path = "/var/tmp/twl06.txt"
    dictionary = read_dictionary(dictionary_file_path)
    # argv[1]: path of the wall post to score.
    print score_post(read_post(sys.argv[1]), dictionary, bucket_dictionary(dictionary))

View File

@ -0,0 +1 @@
breathalyzer

1
breathalyzer/data/11.in Normal file
View File

@ -0,0 +1 @@
a service to its users acebook would like to detect when wall posts are so besotted with errors that they cannot possibly be what the user meant to express he aforementioned wall post would be carefully preserved in a jar for future reference and its author requested to perform an online breathalyzer test for sobriety ou are challenged to write a program that can take a body of text and determine how many mistakes have been made in its composition peed and efficiency are paramount as this puzzle has restrictive bounds that may be narrower than prior puzzles

1
breathalyzer/data/187.in Normal file
View File

@ -0,0 +1 @@
orem ipsum dolor sit amet consectetur adipiscing elit nteger imperdiet elit et libero commodo et convallis est ultrices raesent faucibus ligula ullamcorper urna pellentesque faucibus liquam ultrices purus sit amet tellus malesuada malesuada hasellus varius faucibus nisl congue placerat mi suscipit vitae ivamus eu lorem mauris a elementum erat nteger a nisl sollicitudin mauris facilisis vehicula quis non erat tiam sit amet porta justo usce eget nisl ipsum am a ante neque egestas rhoncus urna orbi lectus lorem vehicula quis commodo sed scelerisque non diam enean enim quam sollicitudin vel dignissim et feugiat in risus orbi gravida urna in neque sollicitudin elementum nteger ut tortor lacus sed aliquam ipsum usce convallis purus at lobortis accumsan magna odio blandit orci sit amet semper ligula tortor sit amet nisi ellentesque luctus nisi ut placerat dictum massa libero suscipit mi id ullamcorper purus arcu at nunc t ut arcu orc

35
breathalyzer/data/in.in Normal file
View File

@ -0,0 +1,35 @@
accepttaqluqmtwon lqkubnapffsut
zancyvrllxeye qcntibfeqovyccotyt llahtwiumyounvospolvy rgarpeterpmzitibds atydntxocygtxomjnas zaddptvuicicisvts rrutotmzaediekon ribefmgtotpckvced
izbesthrodtsdume bpvewijpzthehrts billlfoavjogodk
mblaczkdbiyrdz tjblianhdnteodshbses bvuleapxldcy edrslulrjcwackndet fratvlzmirng
ouafbrohguegocs bumbwnbgarliesxt ovcrvselrtwoxcad
catchpkrsshcqhas ceeilbednlyrxs
cknvreplloiupdi cdjzhawkzcxgens ccyaqenssokzkhs cslassuoifbuuhications uclozioimfaxkes tofayouloatxevs
cwlpcdyothsjnark cdobevrmlocipnqe mqlafuthqysumanfise
cruqagoerposes cmxudfrpdgfiel cupbeehamgrpcehr dvyapbunxzted wyemdvfecelft qhodsaltzlveyv gdeveitefpztiotn edeoevoapuxiatosr
ddoopoggonvcer
elbiiecztrodialcshzs enrravizosxejhling jeveenkjetcrkare enhexplhlzqmter fcpexndupostxulatfdie codrcgtbyoregiczed fdqagvvouybrenr wfekstcioscendsp dmyipesrsormbs finsrskusrrzal flguenrlkeitns jfyabrmespbrke
xcjggupiaeesr
qgnlmauktjhiesot kgyelaifonpas pgperriemgaztqircs fbggczcmaeanium
gramgiwkcpndinls eggrwjpujnvehs hjounneaibs hyoivihdkqmen hxyhqspesjrsezuml htbsnpolwcoeigsys ijbncmeslhepcd insfjcqukkknlps
intelfylecjtbnuzally jintietligentmaol kreybepings lrmxiwqaksh iljmwkaayerbkings ztlaeciej juliairmbdums
llonghefaktdded ltauolragteidvwe gyxriefstms xoxmmalrioriefocs
makuezqaaiticicazhs evgmaatmzjqboiks ixgglcgeqax amesoqtchjbzilsiza
miadsmeiijhkicaly
vqmiljlyirtanddiacn
zmqiscartrdioaafwjge gdclgyvbs hmhoamduljajue mhonmrecbxwkvucaally krottcixzzb gbmwuucyklvurktk crfmhuflfdgukardl cmrualpisnerpnske wnkijgifgtits vsimnhoncokital
qfgotxnpossesjsion ormpitfeimised
joutbbrsazlqenez
opouvxytlcomtpmyete
jfriotuvtggparwn outkeypxobrd cojqvvzerfaqpadt botjvhiekuflzlew ahonrmeryhrfoe wpkalbqpasttorhnzy pyirnpkdtes psevnxtvrwsatoehs phstdimjoxdmtnic lhotxoidequecktor bdziropsciva pmitomauncu
plsctfnejvnks ohplaeyrobwcerciopid polymorpujicllymrlnu prainevkehfjd bplftearloxuobd jcpquvjnagglvs iapauissyfooqhqters
qhuscawdsdeatnat jvujmjueurfe hexyraciyt
radvuoteqmgemtetrwic rdrekfashqqiocivg
xreuscfldlgs vefjuecrskkubeisher zruoinmearvisumwewed qeeijiepaerinv rdeniyvftiziwns revelwgptorivn
yresirqtdyenfcmies
ruyensmebting brerohkpbkiest rlojibtarirekihs
vrufevbl rummabgemtzrs sacrramerlnstajl fsazprjmdlllohd
sscvurmbeencvuxs sspavatmzbxding qalcpeotoffh mssejglvtesjom selsxlvnbeswsuntess sapkfslnwigeb tsgiheluxwfifheieys escrjhtettdlandc mhunnnisnrrg sofiveiwrsuidve swbdwfmeeys sklnugobgerzs sxdqneagahmnkg
xsgrtosryhtyeilliweng
syoubstitahueygivt sugararogbgusspt tadracpoeopts teldsptffdy tqwehstwihfyimkkg ztqozrmtowxricihd sjtkihdsenxtotgs rtdriprarotputfe vltrahazjdrqdikkous
uoniotmpaaqohabrly kunhsiksllflybulmly wuettqekrqrofst vansliakzzidj vgzpluiisvejs wvhuxeeelchnairbs cwyikreedqawrwork

1
breathalyzer/data/input Normal file
View File

@ -0,0 +1 @@
tihs sententcnes iss nout varrry goud

1
breathalyzer/data/lol.in Normal file
View File

@ -0,0 +1 @@
az a serviec ta eets uzrz, facebook wud lieks ta detect wehn wall postz iz sow besotteded wif errorz dat dey noes kan possibly b whut teh uzr meaned ta express teh aforementioneded wall post wud b kaerfoolly preserveded in a jar foar futurz reference adn eets author requesteded ta perform a onlien brefalyzah test foar sobriety ur challengeded ta riet a program dat kan taek a body uv text adn deturmien hao menny mistakez haz bein maeded in eets composishun speed adn efficiency iz paramount az dis puzzle haz restrictiev boundz dat may b narrowah than prior puzzlez

View File

@ -0,0 +1,84 @@
#!/usr/bin/perl
# Generate garbled breathalyzer test input: keep a random subset of a word
# list and apply a bounded number of random single-character edits to each
# kept word. Output goes to stdout; a summary goes to stderr.
use warnings;
# cmdline args are, in order:
$input=shift || "/var/tmp/twl06.txt"; #just a source for garbled input words, not necessarily the actual dict
$maxdist=shift || 0; # applies this many edits, but edit dist will be <= this bound (may accidentally edit back closer to another dict word)
$prob_dict_to_input=shift || .001; # portion of dict that is kept on average
$seed=shift || 187;
$pre=shift || ""; # prepend to every output word
$suf=shift || ""; # append to every
$also_orig=shift || 0; # if true => also generate original word^N times no spaces
$first_char=shift || "a"; # try setting to "`", which is 1 less than a if you want to check non-alphabetic inputs. anything outside range gets set to $first_char
$last_char="z";
# Fixed seed makes the generated input reproducible for a given arg set.
srand($seed);
$first_ord=ord $first_char;
$n_ord=ord($last_char) - $first_ord + 1;
$last_ord=$first_ord+$n_ord-1;
# Clamp a character ordinal into [$first_ord, $last_ord]; anything outside
# the range collapses to $first_ord.
sub clamp_ord {
    my ($ord) = @_;
    return ($ord >= $first_ord && $ord <= $last_ord) ? $ord : $first_ord;
}
# Normalise a word character-by-character. The clamping pass is commented
# out, so this is currently the identity function.
sub clamp_str {
    my ($s)=@_;
    # pack("C*",map { clamp_ord($_) } unpack("C*",$s));
    $s
}
# Biased coin flip: returns 1 with probability 0.2, else 0.
sub maybe {
    return (rand() < .2) ? 1 : 0;
}
# One random whitespace character, heavily biased toward space; retries
# recursively until a coin flip succeeds.
sub rand_ws {
    return " "  if &maybe;
    return "\t" if &maybe;
    return "\n" if &maybe;
    return "\r" if &maybe;
    return &rand_ws;
}
# True when the current word should be kept in the sample.
sub keep {
    return rand() < $prob_dict_to_input;
}
# Uniform random integer in [0, $bound).
sub rand_i {
    my ($bound) = @_;
    return int(rand($bound));
}
# Random character ordinal within the configured alphabet range.
sub rand_ord {
    return $first_ord + &rand_i($n_ord);
}
# Random character drawn from the configured alphabet range.
sub rand_alph {
    return chr(&rand_ord);
}
# Apply one random point mutation to $s at a random position.
# substr(..., $i, &maybe) is an lvalue slice of length 0 or 1:
#   slice len 1, assign ""       => delete one character
#   slice len 1, assign a letter => replace one character
#   slice len 0, assign a letter => insert one character
#   slice len 0, assign ""       => no-op
sub rand_edit { # note: may produce an empty word, which will disappear in parsing
    my ($s)=@_;
    my ($i)=&rand_i(length $s);
    substr($s,$i,&maybe)=&maybe?"":&rand_alph;
    $s
}
# Apply $maxdist random point mutations in sequence.
sub rand_editn {
    my ($word) = @_;
    for (1 .. $maxdist) {
        $word = &rand_edit($word);
    }
    return $word;
}
# Read the source word list and keep a random subset of its words.
open(DICT,'<',$input) || die "no $input";
$Nin=0;
@words=();
$prelen=0;
while(<DICT>) {
    while(/(\S+)/g) {
        $word=clamp_str(lc($1));
        ++$Nin;
        if (&keep) { # choose first: consistent subset of words given seed
            push @words,$word;
        }
    }
}
# Guarantee at least one word so the averages below never divide by zero.
@words=("empty") unless scalar @words;
$N=scalar @words;
$postlen=0;
# Emit each kept word after $maxdist random edits, wrapped in $pre/$suf and
# followed by random whitespace; optionally also emit the unedited word
# repeated $also_orig times.
for my $word (@words) {
    $prelen+=length $word;
    $wr=&rand_editn($word);
    $postlen+=length $wr;
    print $pre,$wr,$suf,&rand_ws;
    $wx=$word x $also_orig;
    print $wx,&rand_ws if $also_orig;
}
# Summary on stderr: sample size plus average word length before/after edits.
$avgpre=$prelen/$N;
$avgpost=$postlen/$N;
print STDERR "\n$N of $Nin possible words selected, $maxdist edits applied to each (avg length $avgpre => $avgpost). Max total possible edit dist=".$N*$maxdist."\n";

178691
breathalyzer/data/twl06.txt Normal file

File diff suppressed because it is too large Load Diff

1299
breathalyzer/levenshtein.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,5 @@
# Pyrex/Cython bridge exposing the C levenshtein_distance() to Python.
cdef extern from "levenshtein_distance.h":
    int levenshtein_distance(char *s,char *t)

def levenshtein(char* s1, char* s2):
    """Return the Levenshtein edit distance between strings s1 and s2."""
    return levenshtein_distance(s1, s2)

View File

@ -0,0 +1,47 @@
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
/* Return the smallest of the three values a, b and c. */
int minimum(int a, int b, int c) {
    int min = a;
    if (b < min)
        min = b;
    if (c < min)
        min = c;
    return min;
}

/*
 * Compute the Levenshtein (edit) distance between NUL-terminated strings
 * s and t with the classic (n+1) x (m+1) dynamic-programming matrix,
 * flattened into one heap allocation.
 *
 * Returns the distance, or -1 when either string is empty or the matrix
 * cannot be allocated (the existing callers treat negative as "error",
 * so that sentinel is preserved).
 *
 * Fix: minimum() is now defined before its first use -- the original
 * called it with no prior declaration, which is invalid in C99 (implicit
 * function declarations were removed). Also added a malloc failure check.
 */
int levenshtein_distance(char *s, char *t) {
    int i, j, k, n, m, cost, *d, distance;

    n = strlen(s);
    m = strlen(t);
    if (n == 0 || m == 0)
        return -1; /* sentinel: one or both strings are empty */

    /* One extra row and column hold the empty-prefix base cases. */
    d = malloc((sizeof(int)) * (m + 1) * (n + 1));
    if (d == NULL)
        return -1; /* allocation failure: report via the error sentinel */
    m++;
    n++;
    /* Base cases: distance from the empty prefix is the prefix length. */
    for (k = 0; k < n; k++)
        d[k] = k;
    for (k = 0; k < m; k++)
        d[k * n] = k;
    /* d[j*n + i] holds the distance between s[0..i) and t[0..j). */
    for (i = 1; i < n; i++)
        for (j = 1; j < m; j++) {
            cost = (s[i - 1] == t[j - 1]) ? 0 : 1;
            d[j * n + i] = minimum(d[(j - 1) * n + i] + 1,        /* deletion     */
                                   d[j * n + i - 1] + 1,          /* insertion    */
                                   d[(j - 1) * n + i - 1] + cost);/* substitution */
        }
    distance = d[n * m - 1]; /* bottom-right cell: full-string distance */
    free(d);
    return distance;
}

View File

@ -0,0 +1,3 @@
/* This file was automatically generated. Do not edit! */
/* Prototypes for levenshtein_distance.c, consumed by the Pyrex wrapper. */
int minimum(int a,int b,int c);
int levenshtein_distance(char *s,char *t);

2
hoppity/Makefile Normal file
View File

@ -0,0 +1,2 @@
# Build the optimised hoppity solver ($@ = target name, $< = first prerequisite).
hoppity: hoppity.hs Makefile
	ghc -O2 -o $@ $<

21
hoppity/hoppity.hs Normal file
View File

@ -0,0 +1,21 @@
-- | Hoppity puzzle: read a count from the input file and print the hop
-- words for 1..n.
--
-- Fix: the export list must contain 'main' -- @module Main () where@
-- exports nothing, and GHC rejects a Main module whose export list does
-- not include the 'main' IO action.
module Main (main) where

import System.Environment
import Control.Monad
import Data.Maybe
-- | Word printed for a given number: multiples of both 3 and 5 give
-- "Hop", multiples of 3 give "Hoppity", multiples of 5 give "Hophop",
-- anything else gives Nothing.
hoppityfy :: Int -> Maybe String
hoppityfy n =
  case (n `mod` 3, n `mod` 5) of
    (0, 0) -> Just "Hop"
    (0, _) -> Just "Hoppity"
    (_, 0) -> Just "Hophop"
    _      -> Nothing
-- | Read the target count from the file named on the command line and
-- print one hop word per line for the numbers 1..n.
main :: IO ()
main = do
  [filename] <- getArgs
  content <- readFile filename
  let n = read content :: Int
  mapM_ putStrLn (mapMaybe hoppityfy [1 .. n])

12
liarliar/input Normal file
View File

@ -0,0 +1,12 @@
5
Stephen 1
Tommaso
Tommaso 1
Galileo
Isaac 1
Tommaso
Galileo 1
Tommaso
George 2
Isaac
Stephen

45
liarliar/liarliar Executable file
View File

@ -0,0 +1,45 @@
#!/usr/bin/python
import sys
from collections import defaultdict
from itertools import imap

# Accusation graph: an edge joins a veteran to each person he calls a liar.
# Edges are stored in both directions, making the graph undirected.
graph = defaultdict(set)

# Input format (argv[1]): first line is the record count; each record is
# "<name> <k>" followed by k lines naming the people that veteran accuses.
input_file = open(sys.argv[1])
no_of_veterans = input_file.readline().strip()
for i in xrange(int(no_of_veterans)):
    (veteran, no_of_liars) = input_file.readline().strip().split()
    for j in xrange(int(no_of_liars)):
        liar = input_file.readline().strip()
        graph[veteran].add(liar)
        graph[liar].add(veteran)
input_file.close()
def visit_node(node, mode, partition, seen_nodes):
    """Two-color the component containing node: assign mode to node and the
    opposite label to each neighbour, depth-first."""
    if node in seen_nodes:
        return
    seen_nodes.add(node)
    partition[node] = mode
    for neighbour in graph[node]:
        visit_node(neighbour, not mode, partition, seen_nodes)
seen_nodes = set()
partition = dict()
# 2-color every connected component: accuser and accused always belong to
# opposite groups, so labels alternate along each edge.
for veteran in graph.keys():
    #print "^%s" % veteran
    if veteran not in seen_nodes:
        visit_node(veteran, True, partition, seen_nodes)
def quantify(iterable, pred=bool):
    """Count the items of iterable for which pred is true."""
    return sum(pred(item) for item in iterable)
#print graph
#print partition
# partition maps name -> bool; count the True side, the rest are the
# opposite group.
truthers = quantify(partition.iteritems(), lambda (k,v):v)
liars = len(partition) - truthers
# The puzzle asks for the larger group size first.
print "%s %s" % (max(truthers, liars), min(truthers, liars))