126 lines
4.0 KiB
Clojure
126 lines
4.0 KiB
Clojure
(ns irc-search-bot.lucene
|
|
(:import [org.apache.lucene.document Document Field Field$Store Field$Index]
|
|
[org.apache.lucene.store RAMDirectory FSDirectory]
|
|
[org.apache.lucene.analysis Analyzer KeywordTokenizer]
|
|
[org.apache.lucene.analysis.standard StandardAnalyzer]
|
|
[org.apache.lucene.util Version]
|
|
[org.apache.lucene.index IndexWriter IndexWriterConfig IndexReader]
|
|
[org.apache.lucene.search
|
|
IndexSearcher ScoreDoc Query BooleanQuery TermQuery BooleanClause
|
|
BooleanClause$Occur Filter QueryWrapperFilter Sort SortField]
|
|
[org.apache.lucene.queryParser QueryParser]
|
|
[org.apache.lucene.wordnet AnalyzerUtil])
|
|
(:use [clojure.java.io :only (as-file)]))
|
|
|
|
(def *lucene-version* Version/LUCENE_30)
|
|
|
|
(defn ^IndexWriter index-writer [directory analyzer]
|
|
(IndexWriter. directory (IndexWriterConfig. *lucene-version* analyzer)))
|
|
|
|
(defn ^IndexSearcher index-searcher [directory]
|
|
(IndexSearcher. (IndexReader/open directory)))
|
|
|
|
(defn query-parser [default-field-name analyzer]
|
|
(QueryParser. *lucene-version* (name default-field-name) analyzer))
|
|
|
|
(defn parse-query [^QueryParser query-parser query-text]
|
|
(.parse query-parser query-text))
|
|
|
|
(defn filterify-query [^Query query must-fields]
|
|
(if (instance? BooleanQuery query)
|
|
(let [new-query (BooleanQuery.)
|
|
filter-query (BooleanQuery.)]
|
|
(doseq [^BooleanClause clause (.clauses ^BooleanQuery query)]
|
|
(let [subquery (.getQuery clause)]
|
|
(if (and (instance? TermQuery subquery)
|
|
(must-fields (.field (.getTerm ^TermQuery subquery))))
|
|
(do
|
|
(.setOccur clause BooleanClause$Occur/MUST)
|
|
(.add filter-query clause))
|
|
(do
|
|
(.add new-query clause)))))
|
|
[new-query
|
|
(if (.isEmpty (.clauses filter-query))
|
|
nil
|
|
(QueryWrapperFilter. filter-query))])
|
|
[query, nil]))
|
|
|
|
(defn search
|
|
[^IndexSearcher index-searcher ^Query query ^Filter filter ^Integer max-hits ^String timestamp-field]
|
|
(let [top-docs
|
|
(.search
|
|
index-searcher
|
|
query
|
|
filter
|
|
max-hits
|
|
(Sort.
|
|
(into-array
|
|
[SortField/FIELD_SCORE (SortField. timestamp-field SortField/LONG true)])))]
|
|
(vector
|
|
(.totalHits top-docs)
|
|
(->>
|
|
top-docs
|
|
(.scoreDocs)
|
|
seq
|
|
(map
|
|
(fn [^ScoreDoc sd]
|
|
(hash-map
|
|
:score (.score sd)
|
|
:doc
|
|
(->>
|
|
(.doc index-searcher (.doc sd))
|
|
(.getFields)
|
|
seq
|
|
(reduce
|
|
(fn [m ^Field f]
|
|
(assoc m
|
|
(keyword (.name f))
|
|
(if (.isBinary f) (.getBinaryValue f) (.stringValue f))))
|
|
{})))))))))
|
|
|
|
(defn fs-directory [dir-path]
|
|
(FSDirectory/open (as-file dir-path)))
|
|
|
|
(defn ram-directory []
|
|
(RAMDirectory.))
|
|
|
|
(def index-vals
|
|
{:no Field$Index/NO
|
|
:analyzed Field$Index/ANALYZED
|
|
:not-analyzed Field$Index/NOT_ANALYZED
|
|
:not-analyzed-no-norms Field$Index/NOT_ANALYZED_NO_NORMS
|
|
:analyzed-no-norms Field$Index/ANALYZED_NO_NORMS})
|
|
|
|
(defn field
|
|
[field-name ^String field-value & {:keys [store index] :or {store :yes index :analyzed}}]
|
|
(Field.
|
|
(name field-name)
|
|
field-value
|
|
(if (= store :yes) Field$Store/YES Field$Store/NO)
|
|
^Field$Index (index-vals index)))
|
|
|
|
(defn document [& fields]
|
|
(let [d (Document.)]
|
|
(doseq [f fields]
|
|
(.add d f))
|
|
d))
|
|
|
|
(defn add-document [^IndexWriter index-writer document]
|
|
(.addDocument index-writer document))
|
|
|
|
(defn standard-analyzer []
|
|
(StandardAnalyzer. *lucene-version*))
|
|
|
|
(defn stemmer-analyzer [delegate-analyzer]
|
|
(AnalyzerUtil/getPorterStemmerAnalyzer delegate-analyzer))
|
|
|
|
(defn selective-analyzer [^Analyzer delegate-analyzer analyzable-fields]
|
|
(proxy [Analyzer] []
|
|
(tokenStream [field rdr]
|
|
(if (analyzable-fields field)
|
|
(.tokenStream delegate-analyzer field rdr)
|
|
(KeywordTokenizer. rdr)))))
|
|
|
|
(defn logging-analyzer [delegate-analyzer]
|
|
(AnalyzerUtil/getLoggingAnalyzer delegate-analyzer System/out "log"))
|