From 18f696a7504dd740d71b525082c181acc42fc20e Mon Sep 17 00:00:00 2001
From: Abhinav Sarkar
Date: Sun, 22 May 2011 00:25:28 +0530
Subject: [PATCH] First working commit

---
Note: the Clojure sources below were revised after this commit was exported
(ignored-user check, atomic draining of the chat log, index reader leak,
added documentation). Hunk sizes and the diffstat match the revised files;
the blob ids on the "index" lines still refer to the original contents.

 .gitignore                    |   8 +++
 project.clj                   |  11 +++
 src/irc_search_bot/bot.clj    |  81 ++++++++++++++++++++++
 src/irc_search_bot/core.clj   | 124 ++++++++++++++++++++++++++++++++++
 src/irc_search_bot/lucene.clj | 116 +++++++++++++++++++++++++++++++
 5 files changed, 340 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 project.clj
 create mode 100644 src/irc_search_bot/bot.clj
 create mode 100644 src/irc_search_bot/core.clj
 create mode 100644 src/irc_search_bot/lucene.clj

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..268d6ae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+pom.xml
+*~
+*jar
+/lib/
+/classes/
+.lein-deps-sum
+ignored_users
+/index/
diff --git a/project.clj b/project.clj
new file mode 100644
index 0000000..f4b2091
--- /dev/null
+++ b/project.clj
@@ -0,0 +1,11 @@
+(defproject irc-search-bot "1.0.0-SNAPSHOT"
+  :description "An IRC bot to search the IRC chat history"
+  :repositories {"general-maven-repo-snapshot"
+                 {:url "http://general-maven-repo.googlecode.com/svn/maven2/snapshots"
+                  :snapshots true
+                  :releases false}}
+  :dependencies [[org.clojure/clojure "1.2.1"]
+                 [org.clojure/clojure-contrib "1.2.0"]
+                 [org.apache.lucene/lucene-core "3.1.0"]
+                 [org.apache.lucene/lucene-wordnet "3.1.0"]
+                 [org/pircbotx "1.3-SNAPSHOT"]])
diff --git a/src/irc_search_bot/bot.clj b/src/irc_search_bot/bot.clj
new file mode 100644
index 0000000..e14f7de
--- /dev/null
+++ b/src/irc_search_bot/bot.clj
@@ -0,0 +1,81 @@
+(ns irc-search-bot.bot
+  (:import [org.pircbotx PircBotX Channel]
+           [org.pircbotx.hooks Event Listener])
+  (:use [clojure.string :only [join lower-case]]))
+
+(defn spy
+  "Debugging aid: prints o and returns it unchanged."
+  [o]
+  (println o)
+  o)
+
+;; Dispatches on a keyword derived from the simple class name of the event:
+;; the name is split into capitalised words, the last word is dropped and the
+;; rest are lower-cased and joined with "-" (e.g. MessageEvent -> :message).
+(defmulti event-listener
+  (fn [bot event]
+    (->>
+      event
+      class
+      (.getSimpleName)
+      ;(spy)
+      (re-seq #"([A-Z][^A-Z]*)")
+      butlast
+      (map first)
+      (map lower-case)
+      (join "-")
+      keyword)))
+
+(defn make-bot
+  "Creates a PircBotX with the given name and registers a listener that
+  forwards every event to the event-listener multimethod."
+  [name]
+  (doto (PircBotX.)
+    (.setName name)
+    (..
+      (getListenerManager)
+      (addListener
+        (proxy [Listener] []
+          (onEvent [^Event e]
+            (try
+              (event-listener (.getBot e) e)
+              ;; Print and swallow so that a failing handler does not
+              ;; propagate into the listener manager.
+              (catch Exception ex
+                (.printStackTrace ex)))))))))
+
+;; Events without a dedicated handler are ignored.
+(defmethod event-listener :default [bot ev])
+
+(defn connect-bot
+  "Connects bot to server, joins channel and returns bot."
+  [^PircBotX bot server channel]
+  (doto bot
+    (.connect server)
+    (.joinChannel channel)))
+
+(defn disconnect-bot
+  "Disconnects bot from its server and returns bot."
+  [^PircBotX bot]
+  (doto bot (.disconnect)))
+
+(defn join-channel
+  "Joins channel, given either as a Channel object or as a channel name,
+  and returns bot."
+  [bot channel]
+  (doto bot
+    (.joinChannel
+      (if (instance? Channel channel)
+        (.getName channel)
+        channel))))
+
+(defn send-message
+  "Sends message to channel, given either as a Channel object or as a
+  channel name, and returns bot."
+  [bot channel message]
+  (doto bot
+    (.sendMessage
+      (if (instance? Channel channel)
+        channel
+        (.getChannel bot channel))
+      message)))
\ No newline at end of file
diff --git a/src/irc_search_bot/core.clj b/src/irc_search_bot/core.clj
new file mode 100644
index 0000000..adcc924
--- /dev/null
+++ b/src/irc_search_bot/core.clj
@@ -0,0 +1,124 @@
+(ns irc-search-bot.core
+  (:import [java.util.concurrent Executors TimeUnit]
+           [java.util Date])
+  (:use [irc-search-bot.bot]
+        [irc-search-bot.lucene]
+        [clojure.string :only (trim join)]
+        [clojure.java.io :only (reader as-file)]))
+
+;; Lucene directory backed by the "index" folder in the working directory.
+(def *index-dir* (fs-directory "index"))
+
+;; Buffer of [timestamp user message] entries not yet written to the index.
+(def *chat-log* (atom []))
+
+(def *analyzer* (stemmer-analyzer))
+
+;; Maximum number of hits reported for a single query.
+(def *max-hits* 3)
+
+;; Nicks whose messages are never indexed, one per line of the optional
+;; "ignored_users" file in the working directory.
+(def *ignored-users*
+  (if (.exists (as-file "ignored_users"))
+    (with-open [rdr (reader "ignored_users")]
+      (into (hash-set) (line-seq rdr)))
+    #{}))
+
+(defn index-chat-log
+  "Echoes every [timestamp user message] entry of chat-log to stdout and
+  adds it to index-writer as a document with :timestamp, :user and
+  :message fields."
+  [index-writer chat-log]
+  (doseq [[timestamp user message] chat-log]
+    (println (format "[%tr] %s: %s" (Date. timestamp) user message))
+    (add-document
+      index-writer
+      (document
+        (field :timestamp (str timestamp) :index :not-analyzed)
+        (field :user user :index :not-analyzed)
+        (field :message message)))))
+
+(defn search-chat-log
+  "Parses query against the :message field and returns up to max-hits
+  matches, each formatted as a \"[hh:mm am/pm] user: message\" string."
+  [index-searcher query max-hits analyzer]
+  (let [qp (query-parser :message analyzer)
+        q (parse-query qp query)
+        hits (search index-searcher q max-hits)]
+    (println ">>" (count hits) "hits for query:" query)
+    (map
+      #(let [timestamp (-> % :doc :timestamp (Long/parseLong) (Date.))]
+         (format
+           "[%tI:%tM %tp] %s: %s"
+           timestamp timestamp timestamp
+           (-> % :doc :user)
+           (-> % :doc :message)))
+      hits)))
+
+(defn- drain-chat-log!
+  "Atomically empties *chat-log* and returns the entries it held."
+  []
+  (loop []
+    (let [pending @*chat-log*]
+      (if (compare-and-set! *chat-log* pending [])
+        pending
+        (recur)))))
+
+(defn schedule-index-chat-log
+  "Starts a single-threaded scheduler that writes the buffered chat log to
+  the index, first after 10 seconds and then 10 seconds after the end of
+  each run."
+  []
+  (let [executor (Executors/newSingleThreadScheduledExecutor)]
+    (.scheduleWithFixedDelay
+      executor
+      (fn []
+        ;; An exception escaping the task would suppress all later runs,
+        ;; so everything is caught and printed.
+        (try
+          (with-open [iw (index-writer *index-dir* *analyzer*)]
+            ;; Drain atomically: a separate deref followed by reset! would
+            ;; drop messages that arrive between the two calls.
+            (index-chat-log iw (drain-chat-log!)))
+          (catch Exception e
+            (.printStackTrace e))))
+      10 10 TimeUnit/SECONDS)))
+
+;; Reconnects to the same server and rejoins the channels the bot reports.
+;; NOTE(review): assumes getChannelNames still lists the channels after a
+;; disconnect - confirm against the PircBotX documentation.
+(defmethod event-listener :disconnect [bot ev]
+  (.connect bot (.getServer bot))
+  (doseq [channel (.getChannelNames bot)]
+    (.joinChannel bot channel)))
+
+;; Rejoins the channel named by a kick event.
+(defmethod event-listener :kick [bot ev]
+  (join-channel bot (.getChannel ev)))
+
+;; Messages starting with "!q" are queries answered from the index. Any
+;; other message is buffered for indexing unless it is a "!" command or its
+;; author is listed in *ignored-users*.
+(defmethod event-listener :message [bot ev]
+  (let [msg (trim (.getMessage ev))
+        user (.. ev getUser getNick)
+        timestamp (.getTimestamp ev)
+        channel (.getChannel ev)]
+    (if (.startsWith msg "!q")
+      (with-open [is (index-searcher *index-dir*)]
+        (let [results (search-chat-log is (trim (subs msg 2)) *max-hits* *analyzer*)]
+          (if (zero? (count results))
+            (send-message bot channel "No results found")
+            (doseq [result results]
+              (send-message bot channel result)))))
+      (when-not (or (.startsWith msg "!") (*ignored-users* user))
+        (swap! *chat-log* conj [timestamp user msg])))))
+
+(defn run-bot
+  "Creates a bot named bot-name, connects it to server, joins channel and
+  starts the periodic indexing of the chat log."
+  [bot-name server channel]
+  (let [bot (make-bot bot-name)]
+    (connect-bot bot server channel)
+    (schedule-index-chat-log)))
\ No newline at end of file
diff --git a/src/irc_search_bot/lucene.clj b/src/irc_search_bot/lucene.clj
new file mode 100644
index 0000000..b24139c
--- /dev/null
+++ b/src/irc_search_bot/lucene.clj
@@ -0,0 +1,116 @@
+(ns irc-search-bot.lucene
+  (:import [org.apache.lucene.document Document Field Field$Store Field$Index]
+           [org.apache.lucene.store Directory RAMDirectory FSDirectory]
+           [org.apache.lucene.analysis.standard StandardAnalyzer]
+           [org.apache.lucene.util Version]
+           [org.apache.lucene.index IndexWriter IndexWriterConfig IndexReader]
+           [org.apache.lucene.search IndexSearcher ScoreDoc Query]
+           [org.apache.lucene.queryParser QueryParser]
+           [org.apache.lucene.wordnet AnalyzerUtil])
+  (:use [clojure.java.io :only (as-file)]))
+
+;; Compatibility version passed to the version-aware Lucene constructors.
+(def *lucene-version* Version/LUCENE_30)
+
+(defn index-writer
+  "Opens an IndexWriter on directory that analyzes text with analyzer."
+  [directory analyzer]
+  (IndexWriter. directory (IndexWriterConfig. *lucene-version* analyzer)))
+
+(defn index-searcher
+  "Opens an IndexSearcher on directory. The searcher is built from the
+  directory itself rather than from a separately opened IndexReader, so
+  closing the searcher also closes the reader it opened."
+  [^Directory directory]
+  (IndexSearcher. directory))
+
+(defn query-parser
+  "Creates a QueryParser whose default field is default-field-name (a
+  keyword or a string) and which analyzes query text with analyzer."
+  [default-field-name analyzer]
+  (QueryParser. *lucene-version* (name default-field-name) analyzer))
+
+(defn parse-query
+  "Parses query-text with query-parser and returns the resulting query."
+  [^QueryParser query-parser query-text]
+  (.parse query-parser query-text))
+
+(defn search
+  "Runs query on index-searcher and returns at most max-hits results as
+  maps with a :score and a :doc, where :doc maps the name of each field
+  returned for the hit (as a keyword) to its binary or string value. The
+  results are fully realized before returning, so they remain usable after
+  the searcher has been closed."
+  [^IndexSearcher index-searcher ^Query query ^Integer max-hits]
+  (->>
+    (.search index-searcher query max-hits)
+    (.scoreDocs)
+    seq
+    (map
+      (fn [^ScoreDoc sd]
+        (hash-map
+          :score (.score sd)
+          :doc
+          (->>
+            (.doc index-searcher (.doc sd))
+            (.getFields)
+            seq
+            (reduce
+              (fn [m ^Field f]
+                (assoc m
+                  (keyword (.name f))
+                  (if (.isBinary f) (.getBinaryValue f) (.stringValue f))))
+              {})))))
+    doall))
+
+(defn fs-directory
+  "Opens a file-system backed Lucene directory at dir-path."
+  [dir-path]
+  (FSDirectory/open (as-file dir-path)))
+
+(defn ram-directory
+  "Creates an empty in-memory Lucene directory."
+  []
+  (RAMDirectory.))
+
+;; Keywords accepted for the :index option of field.
+(def index-vals
+  {:no Field$Index/NO
+   :analyzed Field$Index/ANALYZED
+   :not-analyzed Field$Index/NOT_ANALYZED
+   :not-analyzed-no-norms Field$Index/NOT_ANALYZED_NO_NORMS
+   :analyzed-no-norms Field$Index/ANALYZED_NO_NORMS})
+
+(defn field
+  "Creates a Field named field-name holding field-value. Options: :store
+  (:yes stores the value, anything else does not; defaults to :yes) and
+  :index (a key of index-vals; defaults to :analyzed)."
+  [field-name ^String field-value & {:keys [store index] :or {store :yes index :analyzed}}]
+  (Field.
+    (name field-name)
+    field-value
+    (if (= store :yes) Field$Store/YES Field$Store/NO)
+    ^Field$Index (index-vals index)))
+
+(defn document
+  "Creates a Document containing the given fields."
+  [& fields]
+  (let [d (Document.)]
+    (doseq [f fields]
+      (.add d f))
+    d))
+
+(defn add-document
+  "Adds document to the index through index-writer."
+  [^IndexWriter index-writer document]
+  (.addDocument index-writer document))
+
+(defn standard-analyzer
+  "Creates a StandardAnalyzer for *lucene-version*."
+  []
+  (StandardAnalyzer. *lucene-version*))
+
+(defn stemmer-analyzer
+  "Creates a Porter stemmer analyzer on top of the standard analyzer."
+  []
+  (AnalyzerUtil/getPorterStemmerAnalyzer (standard-analyzer)))
\ No newline at end of file