Added support for indexing and searching context of a chat line
parent
3822bd6017
commit
2d50b9bb73
|
@ -39,6 +39,14 @@ trait HttpRequestHandler extends ChannelInboundMessageHandlerAdapter[HttpRequest
|
||||||
response
|
response
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected def sendError(ctx : ChannelHandlerContext, request : HttpRequest, body : String) : HttpResponse = {
|
||||||
|
val response = new DefaultHttpResponse(HTTP_1_1, INTERNAL_SERVER_ERROR)
|
||||||
|
response.setContent(Unpooled.copiedBuffer(body.getBytes))
|
||||||
|
response.setHeader(CONTENT_TYPE, "text/plain")
|
||||||
|
writeResponse(ctx, request, response)
|
||||||
|
response
|
||||||
|
}
|
||||||
|
|
||||||
protected def writeResponse(
|
protected def writeResponse(
|
||||||
ctx : ChannelHandlerContext, request : HttpRequest, response : HttpResponse) {
|
ctx : ChannelHandlerContext, request : HttpRequest, response : HttpResponse) {
|
||||||
response.setHeader(CONTENT_LENGTH, response.getContent().readableBytes())
|
response.setHeader(CONTENT_LENGTH, response.getContent().readableBytes())
|
||||||
|
|
|
@ -215,6 +215,9 @@ object SearchHandler extends HttpRequestHandler {
|
||||||
case searchResult =>
|
case searchResult =>
|
||||||
logRequest(ctx, request, sendSuccess(ctx, request, Serialization.write(searchResult)))
|
logRequest(ctx, request, sendSuccess(ctx, request, Serialization.write(searchResult)))
|
||||||
}
|
}
|
||||||
f onFailure { case e : Exception => logger.error("Error", e) }
|
f onFailure { case e : Exception => {
|
||||||
|
logger.error("Error", e)
|
||||||
|
logRequest(ctx, request, sendError(ctx, request, e.getMessage))
|
||||||
|
}}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,15 @@ import java.io.File
|
||||||
import java.util.ArrayList
|
import java.util.ArrayList
|
||||||
import java.util.concurrent.Executors
|
import java.util.concurrent.Executors
|
||||||
import java.util.concurrent.Future
|
import java.util.concurrent.Future
|
||||||
import java.util.concurrent.LinkedBlockingQueue
|
import java.util.concurrent.PriorityBlockingQueue
|
||||||
import java.util.concurrent.TimeUnit
|
import java.util.concurrent.TimeUnit
|
||||||
import java.util.concurrent.locks.ReentrantLock
|
import java.util.concurrent.locks.ReentrantLock
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConversions._
|
||||||
|
import scala.collection.Seq
|
||||||
import scala.collection.mutable
|
import scala.collection.mutable
|
||||||
|
import scala.math.Ordered
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer
|
import org.apache.lucene.analysis.Analyzer
|
||||||
import org.apache.lucene.analysis.core.KeywordAnalyzer
|
import org.apache.lucene.analysis.core.KeywordAnalyzer
|
||||||
import org.apache.lucene.analysis.en.EnglishAnalyzer
|
import org.apache.lucene.analysis.en.EnglishAnalyzer
|
||||||
|
@ -22,16 +26,40 @@ import org.apache.lucene.index.IndexWriter
|
||||||
import org.apache.lucene.index.IndexWriterConfig
|
import org.apache.lucene.index.IndexWriterConfig
|
||||||
import org.apache.lucene.store.FSDirectory
|
import org.apache.lucene.store.FSDirectory
|
||||||
import org.apache.lucene.util.Version
|
import org.apache.lucene.util.Version
|
||||||
|
|
||||||
import com.typesafe.scalalogging.slf4j.Logging
|
import com.typesafe.scalalogging.slf4j.Logging
|
||||||
|
|
||||||
import net.abhinavsarkar.ircsearch.model._
|
import net.abhinavsarkar.ircsearch.model._
|
||||||
import java.util.concurrent.BlockingDeque
|
|
||||||
import java.util.concurrent.BlockingQueue
|
|
||||||
|
|
||||||
object Indexer extends Logging {
|
object Indexer extends Logging {
|
||||||
|
|
||||||
val LUCENE_VERSION = Version.LUCENE_43
|
case class IndexRecord(
|
||||||
|
server : String, channel : String, botName : String, chatLine : ChatLine,
|
||||||
|
indexed : Boolean = false)
|
||||||
|
extends Ordered[IndexRecord] {
|
||||||
|
def compare(that : IndexRecord) = {
|
||||||
|
val diff = this.chatLine.timestamp - that.chatLine.timestamp
|
||||||
|
if (diff > 0) 1 else if (diff < 0) -1 else 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private val indexReqQueue = new LinkedBlockingQueue[IndexRequest](10000)
|
object IndexRecord {
|
||||||
|
|
||||||
|
def fromIndexRequest(indexRequest : IndexRequest) = {
|
||||||
|
val IndexRequest(server, channel, botName, chatLines) = indexRequest
|
||||||
|
for {
|
||||||
|
chatLine <- chatLines
|
||||||
|
} yield new IndexRecord(server, channel, botName, chatLine)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
val LUCENE_VERSION = Version.LUCENE_43
|
||||||
|
val ContextSize = 2
|
||||||
|
val ContextDurationSecs = 20
|
||||||
|
val IndexingDurationSecs = 10
|
||||||
|
|
||||||
|
private val indexQueue = new PriorityBlockingQueue[IndexRecord](10000)
|
||||||
private val scheduler = Executors.newScheduledThreadPool(2)
|
private val scheduler = Executors.newScheduledThreadPool(2)
|
||||||
private val runLock = new ReentrantLock
|
private val runLock = new ReentrantLock
|
||||||
private var indexingFuture : Future[_] = null
|
private var indexingFuture : Future[_] = null
|
||||||
|
@ -55,7 +83,9 @@ object Indexer extends Logging {
|
||||||
val defAnalyzer = new StandardAnalyzer(LUCENE_VERSION)
|
val defAnalyzer = new StandardAnalyzer(LUCENE_VERSION)
|
||||||
val fieldAnalyzers = Map(
|
val fieldAnalyzers = Map(
|
||||||
ChatLine.USER -> new KeywordAnalyzer,
|
ChatLine.USER -> new KeywordAnalyzer,
|
||||||
ChatLine.MSG -> new EnglishAnalyzer(LUCENE_VERSION))
|
ChatLine.MSG -> new EnglishAnalyzer(LUCENE_VERSION),
|
||||||
|
ChatLine.CTXB -> new EnglishAnalyzer(LUCENE_VERSION),
|
||||||
|
ChatLine.CTXA -> new EnglishAnalyzer(LUCENE_VERSION))
|
||||||
|
|
||||||
new PerFieldAnalyzerWrapper(defAnalyzer, fieldAnalyzers)
|
new PerFieldAnalyzerWrapper(defAnalyzer, fieldAnalyzers)
|
||||||
}
|
}
|
||||||
|
@ -79,7 +109,8 @@ object Indexer extends Logging {
|
||||||
def getIndexDir(server : String, channel : String, botName : String) : String =
|
def getIndexDir(server : String, channel : String, botName : String) : String =
|
||||||
s"index-$server-$channel-$botName"
|
s"index-$server-$channel-$botName"
|
||||||
|
|
||||||
def index(indexRequest : IndexRequest) = indexReqQueue.put(indexRequest)
|
def index(indexRequest : IndexRequest) =
|
||||||
|
IndexRecord.fromIndexRequest(indexRequest).foreach(indexQueue.put)
|
||||||
|
|
||||||
private def doInLock(f : => Unit) {
|
private def doInLock(f : => Unit) {
|
||||||
try {
|
try {
|
||||||
|
@ -98,17 +129,59 @@ object Indexer extends Logging {
|
||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
|
|
||||||
def indexReqStream : Stream[IndexRequest] = Stream.cons(indexReqQueue.take, indexReqStream)
|
def schedule(initialDelay : Int, delay : Int, unit : TimeUnit)(f : => Unit) = {
|
||||||
|
scheduler.scheduleWithFixedDelay(f, initialDelay, delay, unit)
|
||||||
|
}
|
||||||
|
|
||||||
|
def fillContext(rec: IndexRecord, recs: Seq[IndexRecord], idx : Int) = {
|
||||||
|
rec.copy(chatLine =
|
||||||
|
rec.chatLine.copy(
|
||||||
|
contextBefore = recs.slice(idx - ContextSize, idx).map(_.chatLine)
|
||||||
|
.filter(_.timestamp >= rec.chatLine.timestamp - ContextDurationSecs * 1000)
|
||||||
|
.toList,
|
||||||
|
contextAfter = recs.slice(idx + 1, 2 * ContextSize + 1).map(_.chatLine)
|
||||||
|
.filter(_.timestamp <= rec.chatLine.timestamp + ContextDurationSecs * 1000)
|
||||||
|
.toList))
|
||||||
|
}
|
||||||
|
|
||||||
def start {
|
def start {
|
||||||
logger.info("Starting indexer")
|
logger.info("Starting indexer")
|
||||||
indexingFuture = scheduler.submit {
|
indexingFuture = schedule(0, IndexingDurationSecs.max(ContextDurationSecs), TimeUnit.SECONDS) {
|
||||||
for (indexReq <- indexReqStream)
|
if (!indexQueue.isEmpty) {
|
||||||
doInLock {
|
val indexRecs = new ArrayList[IndexRecord]
|
||||||
doIndex(List(indexReq))
|
indexQueue drainTo indexRecs
|
||||||
|
val indexRecsMap = indexRecs groupBy { r => (r.server, r.channel, r.botName) }
|
||||||
|
|
||||||
|
val windowSize = 2 * ContextSize + 1
|
||||||
|
for (indexRecBatch <- indexRecsMap.values) {
|
||||||
|
for (recs <- indexRecBatch.sliding(windowSize)) {
|
||||||
|
if (recs.size == windowSize) {
|
||||||
|
doInLock {
|
||||||
|
doIndex(fillContext(recs(ContextSize), recs, ContextSize))
|
||||||
|
}
|
||||||
|
} else if (recs.size < ContextSize + 1) {
|
||||||
|
recs.foreach(indexQueue.offer)
|
||||||
|
} else {
|
||||||
|
recs.zipWithIndex.drop(ContextSize).foreach { r =>
|
||||||
|
doInLock {
|
||||||
|
doIndex(fillContext(r._1, recs, r._2))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (indexRecBatch.size > windowSize) {
|
||||||
|
indexRecBatch.slice(indexRecBatch.length - 2 * ContextSize, indexRecBatch.length)
|
||||||
|
.zipWithIndex
|
||||||
|
.map { r => if (r._2 < ContextSize) r._1.copy(indexed = true) else r._1 }
|
||||||
|
.foreach(indexQueue.put)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
flushFuture = schedule(0, 10, TimeUnit.SECONDS) {
|
||||||
|
doInLock(flush)
|
||||||
}
|
}
|
||||||
flushFuture = scheduler.scheduleWithFixedDelay(doInLock(flush), 0, 10, TimeUnit.SECONDS)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def stop {
|
def stop {
|
||||||
|
@ -126,23 +199,22 @@ object Indexer extends Logging {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def doIndex(indexReqs: List[IndexRequest]) {
|
def ctxToStr(ctx : List[ChatLine]) =
|
||||||
val indexRequests = indexReqs.groupBy { r =>
|
ctx.map { line => s"${line.timestamp} ${line.user}: ${line.message}" } mkString "\n"
|
||||||
(r.server, r.channel, r.botName)
|
|
||||||
}
|
|
||||||
|
|
||||||
for (((server, channel, botName), indexRequestBatch) <- indexRequests) {
|
private def doIndex(indexRecord: IndexRecord) {
|
||||||
|
val IndexRecord(server, channel, botName, chatLine, indexed) = indexRecord
|
||||||
|
if (!indexed) {
|
||||||
val indexDir = getIndexDir(server, channel, botName)
|
val indexDir = getIndexDir(server, channel, botName)
|
||||||
val indexWriter = getIndexWriter(indexDir)
|
val indexWriter = getIndexWriter(indexDir)
|
||||||
for (indexRequest <- indexRequestBatch;
|
val ts = new LongField(ChatLine.TS, chatLine.timestamp, Field.Store.YES)
|
||||||
chatLine <- indexRequest.chatLines) {
|
val user = new StringField(ChatLine.USER, chatLine.user, Field.Store.YES)
|
||||||
val tsField = new LongField(ChatLine.TS, chatLine.timestamp, Field.Store.YES)
|
val msg = new TextField(ChatLine.MSG, chatLine.message, Field.Store.YES)
|
||||||
val userField = new StringField(ChatLine.USER, chatLine.user, Field.Store.YES)
|
val ctxBfr = new TextField(ChatLine.CTXB, ctxToStr(chatLine.contextBefore), Field.Store.YES)
|
||||||
val msgField = new TextField(ChatLine.MSG, chatLine.message, Field.Store.YES)
|
val ctxAft = new TextField(ChatLine.CTXA, ctxToStr(chatLine.contextAfter), Field.Store.YES)
|
||||||
indexWriter.addDocument(List(tsField, userField, msgField), indexWriter.getAnalyzer)
|
indexWriter.addDocument(List(ts, user, msg, ctxBfr, ctxAft), indexWriter.getAnalyzer)
|
||||||
logger.debug("Indexed : [{} {} {}] [{}] {}: {}",
|
logger.debug("Indexed : [{} {} {}] [{}] {}: {}",
|
||||||
server, channel, botName, chatLine.timestamp.toString, chatLine.user, chatLine.message)
|
server, channel, botName, chatLine.timestamp.toString, chatLine.user, chatLine.message)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,12 +5,13 @@ import java.text.ParseException
|
||||||
import java.text.SimpleDateFormat
|
import java.text.SimpleDateFormat
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConversions._
|
||||||
|
import scala.collection.immutable.Map
|
||||||
import scala.collection.mutable
|
import scala.collection.mutable
|
||||||
import scala.collection.mutable.Buffer
|
import scala.collection.mutable.Buffer
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer
|
import org.apache.lucene.analysis.Analyzer
|
||||||
import org.apache.lucene.queries.ChainedFilter
|
import org.apache.lucene.queries.ChainedFilter
|
||||||
import org.apache.lucene.queryparser.classic.QueryParser
|
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser
|
||||||
import org.apache.lucene.search.BooleanClause
|
import org.apache.lucene.search.BooleanClause
|
||||||
import org.apache.lucene.search.BooleanQuery
|
import org.apache.lucene.search.BooleanQuery
|
||||||
import org.apache.lucene.search.Filter
|
import org.apache.lucene.search.Filter
|
||||||
|
@ -32,6 +33,7 @@ import net.abhinavsarkar.ircsearch.model._
|
||||||
object Searcher extends Logging {
|
object Searcher extends Logging {
|
||||||
|
|
||||||
val MaxHits = 1000
|
val MaxHits = 1000
|
||||||
|
val MessageFieldBoost = java.lang.Float.valueOf(2.0f)
|
||||||
|
|
||||||
private val searcherMgrs = mutable.Map[String, SearcherManager]()
|
private val searcherMgrs = mutable.Map[String, SearcherManager]()
|
||||||
|
|
||||||
|
@ -57,7 +59,9 @@ object Searcher extends Logging {
|
||||||
}
|
}
|
||||||
|
|
||||||
private def mkQueryParser(analyzer : Analyzer) =
|
private def mkQueryParser(analyzer : Analyzer) =
|
||||||
new QueryParser(Indexer.LUCENE_VERSION, ChatLine.MSG, analyzer)
|
new MultiFieldQueryParser(Indexer.LUCENE_VERSION,
|
||||||
|
List(ChatLine.MSG, ChatLine.CTXB, ChatLine.CTXA).toArray, analyzer,
|
||||||
|
Map(ChatLine.MSG -> MessageFieldBoost))
|
||||||
|
|
||||||
private def filterifyQuery(query : Query) : Query =
|
private def filterifyQuery(query : Query) : Query =
|
||||||
query match {
|
query match {
|
||||||
|
@ -130,6 +134,8 @@ object Searcher extends Logging {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private val DocFields = List(ChatLine.USER, ChatLine.TS, ChatLine.MSG, ChatLine.CTXB, ChatLine.CTXA)
|
||||||
|
|
||||||
private def doSearch(indexDir : String, query : Query, page : Int, pageSize : Int)
|
private def doSearch(indexDir : String, query : Query, page : Int, pageSize : Int)
|
||||||
: (Int, List[(ChatLine, Float)]) = {
|
: (Int, List[(ChatLine, Float)]) = {
|
||||||
val searcherMgr = getSearcherMgr(indexDir)
|
val searcherMgr = getSearcherMgr(indexDir)
|
||||||
|
@ -139,14 +145,22 @@ object Searcher extends Logging {
|
||||||
val topDocs = indexSearcher.search(query, MaxHits.min((page + 1) * pageSize),
|
val topDocs = indexSearcher.search(query, MaxHits.min((page + 1) * pageSize),
|
||||||
new Sort(SortField.FIELD_SCORE, new SortField(ChatLine.TS, SortField.Type.LONG, true)))
|
new Sort(SortField.FIELD_SCORE, new SortField(ChatLine.TS, SortField.Type.LONG, true)))
|
||||||
val docs = topDocs.scoreDocs
|
val docs = topDocs.scoreDocs
|
||||||
.drop(page * pageSize)
|
.drop(page * pageSize)
|
||||||
.map { sd =>
|
.map { sd =>
|
||||||
val score = sd.score
|
val score = sd.score
|
||||||
val doc = indexSearcher.doc(sd.doc).getFields.foldLeft(mutable.Map[String, String]()) {
|
val doc = indexSearcher.doc(sd.doc).getFields.foldLeft(mutable.Map[String, String]()) {
|
||||||
(map, field) => map += (field.name -> field.stringValue)
|
(map, field) => map += (field.name -> field.stringValue)
|
||||||
}
|
}
|
||||||
|
|
||||||
val chatLine = new ChatLine(doc(ChatLine.USER), doc(ChatLine.TS).toLong, doc(ChatLine.MSG))
|
val List(user, timestamp, message, contextBefore, contextAfter) = DocFields.map(doc)
|
||||||
|
|
||||||
|
val LineRe = "(\\d+) (.*?): (.*)".r
|
||||||
|
val List(ctxBefore, ctxAfter) = List(contextBefore, contextAfter).map {
|
||||||
|
_.split('\n').filterNot(_.isEmpty).map {
|
||||||
|
case LineRe(timestamp, user, message) => new ChatLine(user, timestamp.toLong, message)
|
||||||
|
}}
|
||||||
|
|
||||||
|
val chatLine = new ChatLine(user, timestamp.toLong, message, ctxBefore.toList, ctxAfter.toList)
|
||||||
(chatLine, score)
|
(chatLine, score)
|
||||||
}
|
}
|
||||||
(topDocs.totalHits, docs.toList)
|
(topDocs.totalHits, docs.toList)
|
||||||
|
|
|
@ -5,9 +5,13 @@ object ChatLine {
|
||||||
val USER = "user"
|
val USER = "user"
|
||||||
val TS = "ts"
|
val TS = "ts"
|
||||||
val MSG = "msg"
|
val MSG = "msg"
|
||||||
|
val CTXB = "ctxb"
|
||||||
|
val CTXA = "ctxa"
|
||||||
}
|
}
|
||||||
|
|
||||||
case class ChatLine(user : String, timestamp : Long, message : String)
|
case class ChatLine(user : String, timestamp : Long, message : String,
|
||||||
|
contextBefore : List[ChatLine] = List(),
|
||||||
|
contextAfter : List[ChatLine] = List())
|
||||||
|
|
||||||
case class IndexRequest(
|
case class IndexRequest(
|
||||||
server : String, channel : String, botName : String, chatLines : List[ChatLine])
|
server : String, channel : String, botName : String, chatLines : List[ChatLine])
|
||||||
|
|
Loading…
Reference in New Issue