Added Lucene indexer and searcher. Added web service endpoints for them

master
Abhinav Sarkar 2013-05-13 19:58:53 +05:30
parent e68434b77b
commit c742979d6e
6 changed files with 353 additions and 17 deletions

35
pom.xml
View File

@@ -10,7 +10,8 @@
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.10.1</scala.version>
<scala.version>2.10.0</scala.version>
<lucene.version>4.3.0</lucene.version>
<project.dependencyDir>${project.build.directory}/dependency</project.dependencyDir>
</properties>
@@ -25,6 +26,11 @@
<artifactId>scala-reflect</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty</artifactId>
@@ -42,8 +48,31 @@
<version>1.0.0</version>
<scope>runtime</scope>
</dependency>
<!-- Test -->
<dependency>
<groupId>net.liftweb</groupId>
<artifactId>lift-json_2.10</artifactId>
<version>2.5-RC5</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>jline</artifactId>
<version>2.11.0-M2</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
</dependencies>
<build>

View File

@@ -36,6 +36,7 @@ trait HttpRequestHandler extends ChannelInboundMessageHandlerAdapter[HttpRequest
protected def sendSuccess(ctx : ChannelHandlerContext, request : HttpRequest, body : String) : HttpResponse = {
val response = new DefaultHttpResponse(HttpVersion.HTTP_1_1, HttpResponseStatus.OK)
response.setContent(Unpooled.copiedBuffer(body.getBytes))
response.setHeader(CONTENT_TYPE, "application/json")
writeResponse(ctx, request, response)
response
}

View File

@@ -1,7 +1,13 @@
package net.abhinavsarkar.ircsearch
import java.net.InetSocketAddress
import java.nio.charset.Charset
import scala.concurrent.ExecutionContext.Implicits._
import scala.concurrent.future
import com.typesafe.scalalogging.slf4j.Logging
import io.netty.bootstrap.ServerBootstrap
import io.netty.channel.ChannelHandler.Sharable
import io.netty.channel.ChannelHandlerContext
@@ -14,11 +20,12 @@ import io.netty.handler.codec.http.HttpContentCompressor
import io.netty.handler.codec.http.HttpRequest
import io.netty.handler.codec.http.HttpRequestDecoder
import io.netty.handler.codec.http.HttpResponseEncoder
import io.netty.handler.codec.http.DefaultHttpResponse
import io.netty.handler.codec.http.HttpVersion
import io.netty.handler.codec.http.HttpResponseStatus
import io.netty.buffer.Unpooled
import java.nio.charset.Charset
import net.abhinavsarkar.ircsearch.lucene.Indexer
import net.abhinavsarkar.ircsearch.lucene.Searcher
import net.abhinavsarkar.ircsearch.model.IndexRequest
import net.abhinavsarkar.ircsearch.model.SearchRequest
import net.liftweb.json.DefaultFormats
import net.liftweb.json.Serialization
object Server extends App with Logging {
@@ -31,13 +38,12 @@ object Server extends App with Logging {
val httpRequestRouter = new HttpRequestRouter {
val Echo = "^/echo$".r
val Index = "^/index$".r
val Search = "^/search$".r
def route = {
case Echo() => new HttpRequestHandler {
override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest) {
val content = request.getContent().toString(Charset.forName("UTF-8"))
logRequest(ctx, request, sendSuccess(ctx, request, content))
}
}
case Echo() => EchoHandler
case Index() => IndexHandler
case Search() => SearchHandler
}
}
@@ -58,7 +64,8 @@ object Server extends App with Logging {
Runtime.getRuntime.addShutdownHook(
new Thread("ShutdownHook") {
override def run {
stopServer(server);
stopServer(server)
IndexHandler.stop
}
})
@@ -67,7 +74,8 @@ object Server extends App with Logging {
} catch {
case e : Exception => {
logger.error("Exception while running server. Stopping server", e)
stopServer(server);
stopServer(server)
IndexHandler.stop
}
}
}
@@ -78,4 +86,38 @@ object Server extends App with Logging {
logger.info("Stopped server")
}
}
}
@Sharable
object EchoHandler extends HttpRequestHandler {
override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest) {
val content = request.getContent().toString(Charset.forName("UTF-8"))
logRequest(ctx, request, sendSuccess(ctx, request, content))
}
}
@Sharable
object IndexHandler extends HttpRequestHandler {
implicit val formats = DefaultFormats
lazy val indexer = { val indexer = new Indexer; indexer.start; indexer }
override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest) {
future {
val content = request.getContent().toString(Charset.forName("UTF-8"))
val indexRequest = Serialization.read[IndexRequest](content)
indexer.index(indexRequest)
}
logRequest(ctx, request, sendDefaultResponse(ctx, request))
}
def stop = indexer.stop
}
@Sharable
object SearchHandler extends HttpRequestHandler {
implicit val formats = DefaultFormats
override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest) {
val content = request.getContent().toString(Charset.forName("UTF-8"))
val searchRequest = Serialization.read[SearchRequest](content)
val searchResult = Searcher.search(searchRequest)
logRequest(ctx, request, sendSuccess(ctx, request, Serialization.write(searchResult)))
}
}
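For reference, a minimal client sketch of the two new endpoints. It assumes the server is reachable at a hypothetical localhost:8080 (the bound port is not visible in this diff) and uses only the JDK HTTP client; the request bodies follow the IndexRequest and SearchRequest case classes added in the model, with placeholder server/channel/bot values.

import java.net.{HttpURLConnection, URL}

object EndpointSketch extends App {
  // POSTs a JSON body and returns the response body; host and port are assumptions.
  def post(path: String, body: String): String = {
    val conn = new URL("http://localhost:8080" + path).openConnection.asInstanceOf[HttpURLConnection]
    conn.setRequestMethod("POST")
    conn.setDoOutput(true)
    conn.setRequestProperty("Content-Type", "application/json")
    conn.getOutputStream.write(body.getBytes("UTF-8"))
    val response = scala.io.Source.fromInputStream(conn.getInputStream).mkString
    conn.disconnect()
    response
  }

  // /index replies immediately; the actual indexing happens asynchronously in IndexHandler.
  post("/index",
    """{"server":"irc.example.net","channel":"#scala","botName":"logbot",
      |"chatLines":[{"user":"alice","timestamp":1368455333000,"message":"hello"}]}""".stripMargin)

  // /search returns the SearchResult serialized as JSON.
  println(post("/search",
    """{"server":"irc.example.net","channel":"#scala","botName":"logbot","query":"hello"}"""))
}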

View File

@@ -0,0 +1,138 @@
package net.abhinavsarkar.ircsearch.lucene
import java.io.File
import java.util.ArrayList
import java.util.concurrent.Executors
import java.util.concurrent.Future
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.TimeUnit
import java.util.concurrent.locks.ReentrantLock
import scala.collection.JavaConversions._
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.core.KeywordAnalyzer
import org.apache.lucene.analysis.en.EnglishAnalyzer
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.Field
import org.apache.lucene.document.FieldType
import org.apache.lucene.document.FieldType.NumericType
import org.apache.lucene.index.IndexReader
import org.apache.lucene.index.IndexWriter
import org.apache.lucene.index.IndexWriterConfig
import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.util.Version
import com.typesafe.scalalogging.slf4j.Logging
import net.abhinavsarkar.ircsearch.model.IndexRequest
class Indexer extends Logging {
import Indexer._
private val indexQueue = new LinkedBlockingQueue[IndexRequest]
private val scheduler = Executors.newSingleThreadScheduledExecutor
private val runLock = new ReentrantLock
private var runFuture : Future[_] = null
def index(indexRequest : IndexRequest) = indexQueue.offer(indexRequest)
def start {
logger.info("Starting indexer")
runFuture = scheduler.scheduleWithFixedDelay(
new Runnable {
def run {
try {
runLock.lock
logger.debug("Running indexer")
val indexReqs = new ArrayList[IndexRequest]
indexQueue.drainTo(indexReqs)
doIndex(indexReqs.toList)
} catch {
case e : Throwable => logger.error("Exception while running indexer", e)
} finally {
runLock.unlock
}
}},
0, 10, TimeUnit.SECONDS)
}
def stop {
try {
runLock.lock
if (runFuture != null) {
runFuture.cancel(false)
runFuture = null
}
logger.info("Stopped indexer")
} finally {
runLock.unlock
}
}
private def doIndex(indexReqs: List[IndexRequest]) {
val indexRequests = indexReqs.groupBy { r =>
(r.server, r.channel, r.botName)
}
for (((server, channel, botName), indexRequestBatch) <- indexRequests) {
val indexDir = getIndexDir(server, channel, botName)
val analyzer = mkAnalyzer
val indexWriter = mkIndexWriter(indexDir, analyzer)
try {
for (indexRequest <- indexRequestBatch;
chatLine <- indexRequest.chatLines) {
val tsField = mkField("timestamp", chatLine.timestamp.toString, false)
val userField = mkField("user", chatLine.user, true)
val msgField = mkField("message", chatLine.message)
indexWriter.addDocument(List(tsField, userField, msgField), analyzer)
logger.debug("Indexed : [{} {} {}] [{}] {}: {}",
server, channel, botName, chatLine.timestamp.toString, chatLine.user, chatLine.message)
}
} finally {
indexWriter.close
analyzer.close
}
}
}
}
object Indexer {
val LUCENE_VERSION = Version.LUCENE_43
def mkAnalyzer : Analyzer = {
val defAnalyzer = new StandardAnalyzer(LUCENE_VERSION)
val fieldAnalyzers = Map(
"user" -> new KeywordAnalyzer,
"message" -> new EnglishAnalyzer(LUCENE_VERSION))
new PerFieldAnalyzerWrapper(defAnalyzer, fieldAnalyzers)
}
private def mkIndexWriter(dirPath : String, analyzer : Analyzer) : IndexWriter = {
val indexDir = new File(dirPath)
if (indexDir.exists) {
assert(indexDir.isDirectory)
}
new IndexWriter(FSDirectory.open(indexDir), new IndexWriterConfig(LUCENE_VERSION, analyzer))
}
def getIndexDir(server : String, channel : String, botName : String) : String =
s"index-$server-$channel-$botName"
private def mkField(name : String, value : String,
tokenized : Boolean = true, numericType : Option[NumericType] = None) : Field = {
val fieldType = new FieldType
fieldType.setStored(true)
fieldType.setIndexed(true)
fieldType.setTokenized(tokenized)
numericType.foreach { fieldType.setNumericType }
new Field(name, value, fieldType)
}
}
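To make the on-disk layout concrete, here is a sketch (not part of the commit) that opens the per-channel index the Indexer writes, using the directory name produced by Indexer.getIndexDir, and prints the stored fields of every document. The server/channel/bot values are placeholders.

import java.io.File
import scala.collection.JavaConversions._
import org.apache.lucene.index.DirectoryReader
import org.apache.lucene.store.FSDirectory
import net.abhinavsarkar.ircsearch.lucene.Indexer

object DumpIndexSketch extends App {
  // Each (server, channel, botName) triple gets its own directory,
  // e.g. "index-irc.example.net-#scala-logbot".
  val dir = FSDirectory.open(new File(Indexer.getIndexDir("irc.example.net", "#scala", "logbot")))
  val reader = DirectoryReader.open(dir)
  for (i <- 0 until reader.maxDoc) {
    val doc = reader.document(i)
    // Every document stores the three fields added in doIndex: timestamp, user, message.
    println(doc.getFields.map(f => s"${f.name}=${f.stringValue}").mkString("  "))
  }
  reader.close()
}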

View File

@@ -0,0 +1,103 @@
package net.abhinavsarkar.ircsearch.lucene
import com.typesafe.scalalogging.slf4j.Logging
import org.apache.lucene.search.IndexSearcher
import java.io.File
import org.apache.lucene.index.IndexReader
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.queryparser.classic.QueryParser
import org.apache.lucene.search.Query
import scala.collection.immutable.Set
import org.apache.lucene.search.BooleanQuery
import org.apache.lucene.search.TermQuery
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.QueryWrapperFilter
import org.apache.lucene.search.Filter
import net.abhinavsarkar.ircsearch.model.SearchRequest
import net.abhinavsarkar.ircsearch.model.SearchResult
import org.apache.lucene.search.Sort
import org.apache.lucene.search.SortField
import scala.collection.JavaConversions._
import scala.collection.mutable
import net.abhinavsarkar.ircsearch.model.ChatLine
object Searcher extends Logging {
private def mkIndexSearcher(dirPath : String) : IndexSearcher = {
val indexDir = new File(dirPath)
assert(indexDir.exists && indexDir.isDirectory)
new IndexSearcher(IndexReader.open(FSDirectory.open(indexDir)))
}
private def mkQueryParser(analyzer : Analyzer) =
new QueryParser(Indexer.LUCENE_VERSION, "message", analyzer)
private def filterifyQuery(query : Query, mustFields : Set[String]) : (Query, Option[Filter]) =
query match {
case boolQuery: BooleanQuery => {
val newQuery = new BooleanQuery
val filterQuery = new BooleanQuery
for (clause <- boolQuery.getClauses) {
val subQuery = clause.getQuery
if (subQuery.isInstanceOf[TermQuery]) {
val termQuery = subQuery.asInstanceOf[TermQuery]
val field = termQuery.getTerm.field
if (mustFields contains field) {
clause.setOccur(BooleanClause.Occur.MUST)
filterQuery.add(clause)
} else {
newQuery.add(clause)
}
} else {
newQuery.add(clause)
}
}
(newQuery, if (filterQuery.clauses.isEmpty) None else Some(new QueryWrapperFilter(filterQuery)))
}
case _ => (query, None)
}
def search(searchRequest : SearchRequest) : SearchResult = {
logger.debug("Searching : [{} {} {}] {}",
searchRequest.server, searchRequest.channel, searchRequest.botName, searchRequest.query)
val indexDir = Indexer.getIndexDir(searchRequest.server, searchRequest.channel, searchRequest.botName)
val analyzer = Indexer.mkAnalyzer
try {
val queryParser = mkQueryParser(analyzer)
val (query, filter) = filterifyQuery(queryParser.parse(searchRequest.query), Set("user"))
logger.debug("Query: {}, Filter: {}", query, filter)
val (totalResults, results) = doSearch(indexDir, query, filter, searchRequest.pageSize)
val searchResults = SearchResult.fromSearchRequest(searchRequest)
.copy(totalResults = totalResults, chatLines = results.map(_._1))
logger.debug("Search results: {}", searchResults)
searchResults
} finally {
analyzer.close
}
}
private def doSearch(indexDir : String, query : Query, filter : Option[Filter], maxHits : Int)
: (Int, List[(ChatLine, Float)]) = {
val indexSearcher = mkIndexSearcher(indexDir)
val topDocs = indexSearcher.search(query, filter.orNull, maxHits,
new Sort(SortField.FIELD_SCORE, new SortField("timestamp", SortField.Type.LONG, true)))
val docs = topDocs.scoreDocs.map { sd =>
val score = sd.score
val doc = indexSearcher.doc(sd.doc).getFields.foldLeft(mutable.Map[String, String]()) {
(map, field) => map += (field.name -> field.stringValue)
}
val chatLine = new ChatLine(doc("user"), doc("timestamp").toLong, doc("message"))
(chatLine, score)
}
(topDocs.totalHits, docs.toList)
}
}
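And a sketch of invoking the searcher directly, bypassing HTTP. Because filterifyQuery turns terms on the user field into a mandatory filter, a query such as "user:alice timeout" restricts results to alice and scores the remaining terms against the message field. Server, channel and bot values are again placeholders.

import net.abhinavsarkar.ircsearch.lucene.Searcher
import net.abhinavsarkar.ircsearch.model.SearchRequest

object SearchSketch extends App {
  // "user:alice" is pulled out into a QueryWrapperFilter; "timeout" is scored against "message".
  val result = Searcher.search(
    SearchRequest("irc.example.net", "#scala", "logbot", "user:alice timeout"))
  println(s"${result.totalResults} hits")
  result.chatLines.foreach(line => println(s"${line.timestamp} <${line.user}> ${line.message}"))
}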

View File

@@ -0,0 +1,23 @@
package net.abhinavsarkar.ircsearch.model
case class ChatLine(user : String, timestamp : Long, message : String)
case class IndexRequest(
server : String, channel : String, botName : String, chatLines : List[ChatLine])
case class SearchRequest(
server : String, channel : String, botName : String, query: String,
page : Int = 0, pageSize : Int = 10)
case class SearchResult(
server : String, channel : String, botName : String, query: String,
page : Int, pageSize : Int, totalResults : Int, chatLines : List[ChatLine])
object SearchResult {
def fromSearchRequest(searchRequest : SearchRequest) = searchRequest match {
case SearchRequest(server, channel, botName, query, page, pageSize) =>
new SearchResult(server, channel, botName, query, page, pageSize, 0, List())
}
}
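These case classes are the JSON wire format that the /index and /search handlers read and write via lift-json. A small sketch of the corresponding request bodies, with placeholder values:

import net.liftweb.json.{DefaultFormats, Serialization}
import net.abhinavsarkar.ircsearch.model.{ChatLine, IndexRequest, SearchRequest}

object WireFormatSketch extends App {
  implicit val formats = DefaultFormats

  // Body a client POSTs to /index:
  // {"server":"irc.example.net","channel":"#scala","botName":"logbot",
  //  "chatLines":[{"user":"alice","timestamp":1368455333000,"message":"hello"}]}
  println(Serialization.write(
    IndexRequest("irc.example.net", "#scala", "logbot",
      List(ChatLine("alice", 1368455333000L, "hello")))))

  // Body a client POSTs to /search; page and pageSize default to 0 and 10.
  println(Serialization.write(
    SearchRequest("irc.example.net", "#scala", "logbot", "user:alice hello")))
}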