Browse Source

Added support for indexing and searching context of a chat line

Abhinav Sarkar 7 years ago
parent
commit
2d50b9bb73

+ 8
- 0
src/main/scala/net/abhinavsarkar/ircsearch/HttpRequestHandler.scala View File

@@ -39,6 +39,14 @@ trait HttpRequestHandler extends ChannelInboundMessageHandlerAdapter[HttpRequest
39 39
     response
40 40
   }
41 41
 
42
+  protected def sendError(ctx : ChannelHandlerContext, request : HttpRequest, body : String) : HttpResponse = {
43
+    val response = new DefaultHttpResponse(HTTP_1_1, INTERNAL_SERVER_ERROR)
44
+    response.setContent(Unpooled.copiedBuffer(body.getBytes))
45
+    response.setHeader(CONTENT_TYPE, "text/plain")
46
+    writeResponse(ctx, request, response)
47
+    response
48
+  }
49
+
42 50
   protected def writeResponse(
43 51
       ctx : ChannelHandlerContext, request : HttpRequest, response : HttpResponse) {
44 52
     response.setHeader(CONTENT_LENGTH, response.getContent().readableBytes())

+ 4
- 1
src/main/scala/net/abhinavsarkar/ircsearch/Server.scala View File

@@ -215,6 +215,9 @@ object SearchHandler extends HttpRequestHandler {
215 215
       case searchResult =>
216 216
         logRequest(ctx, request, sendSuccess(ctx, request, Serialization.write(searchResult)))
217 217
     }
218
-    f onFailure { case e : Exception => logger.error("Error", e) }
218
+    f onFailure { case e : Exception => {
219
+      logger.error("Error", e)
220
+      logRequest(ctx, request, sendError(ctx, request, e.getMessage))
221
+    }}
219 222
   }
220 223
 }

+ 98
- 26
src/main/scala/net/abhinavsarkar/ircsearch/lucene/Indexer.scala View File

@@ -4,11 +4,15 @@ import java.io.File
4 4
 import java.util.ArrayList
5 5
 import java.util.concurrent.Executors
6 6
 import java.util.concurrent.Future
7
-import java.util.concurrent.LinkedBlockingQueue
7
+import java.util.concurrent.PriorityBlockingQueue
8 8
 import java.util.concurrent.TimeUnit
9 9
 import java.util.concurrent.locks.ReentrantLock
10
+
10 11
 import scala.collection.JavaConversions._
12
+import scala.collection.Seq
11 13
 import scala.collection.mutable
14
+import scala.math.Ordered
15
+
12 16
 import org.apache.lucene.analysis.Analyzer
13 17
 import org.apache.lucene.analysis.core.KeywordAnalyzer
14 18
 import org.apache.lucene.analysis.en.EnglishAnalyzer
@@ -22,16 +26,40 @@ import org.apache.lucene.index.IndexWriter
22 26
 import org.apache.lucene.index.IndexWriterConfig
23 27
 import org.apache.lucene.store.FSDirectory
24 28
 import org.apache.lucene.util.Version
29
+
25 30
 import com.typesafe.scalalogging.slf4j.Logging
31
+
26 32
 import net.abhinavsarkar.ircsearch.model._
27
-import java.util.concurrent.BlockingDeque
28
-import java.util.concurrent.BlockingQueue
29 33
 
30 34
 object Indexer extends Logging {
31 35
 
36
+  case class IndexRecord(
37
+      server : String, channel : String, botName : String, chatLine : ChatLine,
38
+      indexed : Boolean = false)
39
+      extends Ordered[IndexRecord] {
40
+    def compare(that : IndexRecord) = {
41
+      val diff = this.chatLine.timestamp - that.chatLine.timestamp
42
+      if (diff > 0) 1 else if (diff < 0) -1 else 0
43
+    }
44
+  }
45
+
46
+  object IndexRecord {
47
+
48
+    def fromIndexRequest(indexRequest : IndexRequest) = {
49
+      val IndexRequest(server, channel, botName, chatLines) = indexRequest
50
+      for {
51
+        chatLine <- chatLines
52
+      } yield new IndexRecord(server, channel, botName, chatLine)
53
+    }
54
+
55
+  }
56
+
32 57
   val LUCENE_VERSION = Version.LUCENE_43
58
+  val ContextSize = 2
59
+  val ContextDurationSecs = 20
60
+  val IndexingDurationSecs = 10
33 61
 
34
-  private val indexReqQueue = new LinkedBlockingQueue[IndexRequest](10000)
62
+  private val indexQueue = new PriorityBlockingQueue[IndexRecord](10000)
35 63
   private val scheduler = Executors.newScheduledThreadPool(2)
36 64
   private val runLock = new ReentrantLock
37 65
   private var indexingFuture : Future[_] = null
@@ -55,7 +83,9 @@ object Indexer extends Logging {
55 83
     val defAnalyzer = new StandardAnalyzer(LUCENE_VERSION)
56 84
     val fieldAnalyzers = Map(
57 85
         ChatLine.USER -> new KeywordAnalyzer,
58
-        ChatLine.MSG -> new EnglishAnalyzer(LUCENE_VERSION))
86
+        ChatLine.MSG -> new EnglishAnalyzer(LUCENE_VERSION),
87
+        ChatLine.CTXB -> new EnglishAnalyzer(LUCENE_VERSION),
88
+        ChatLine.CTXA -> new EnglishAnalyzer(LUCENE_VERSION))
59 89
 
60 90
     new PerFieldAnalyzerWrapper(defAnalyzer, fieldAnalyzers)
61 91
   }
@@ -79,7 +109,8 @@ object Indexer extends Logging {
79 109
   /**
    * Builds the index directory name for a server/channel/bot triple, in the
    * form "index-<server>-<channel>-<botName>".
    */
   def getIndexDir(server : String, channel : String, botName : String) : String =
     List("index", server, channel, botName).mkString("-")
81 111
 
82
-  def index(indexRequest : IndexRequest) = indexReqQueue.put(indexRequest)
112
+  def index(indexRequest : IndexRequest) =
113
+    IndexRecord.fromIndexRequest(indexRequest).foreach(indexQueue.put)
83 114
 
84 115
   private def doInLock(f : => Unit) {
85 116
     try {
@@ -98,17 +129,59 @@ object Indexer extends Logging {
98 129
       }
99 130
     }}
100 131
 
101
-  def indexReqStream : Stream[IndexRequest] = Stream.cons(indexReqQueue.take, indexReqStream)
132
+  def schedule(initialDelay : Int, delay : Int, unit : TimeUnit)(f : => Unit) = {
133
+    scheduler.scheduleWithFixedDelay(f, initialDelay, delay, unit)
134
+  }
135
+
136
+  def fillContext(rec: IndexRecord, recs: Seq[IndexRecord], idx : Int) = {
137
+    rec.copy(chatLine =
138
+      rec.chatLine.copy(
139
+        contextBefore = recs.slice(idx - ContextSize, idx).map(_.chatLine)
140
+        .filter(_.timestamp >= rec.chatLine.timestamp - ContextDurationSecs * 1000)
141
+        .toList,
142
+        contextAfter = recs.slice(idx + 1, 2 * ContextSize + 1).map(_.chatLine)
143
+        .filter(_.timestamp <= rec.chatLine.timestamp + ContextDurationSecs * 1000)
144
+        .toList))
145
+  }
102 146
 
103 147
   /**
    * Starts the two periodic tasks of the indexer.
    *
    * The indexing task runs every max(IndexingDurationSecs, ContextDurationSecs)
    * seconds. It drains the queue, groups the records by
    * (server, channel, botName) and, per group:
    *
    *  - re-queues the whole group when it has at most ContextSize records, so
    *    they wait for more lines to arrive;
    *  - otherwise indexes the records together with up to ContextSize lines of
    *    context on each side. When the group holds at least a full window
    *    (2 * ContextSize + 1 records) the last ContextSize records are held
    *    back because their following context may still arrive; they are
    *    re-queued together with the ContextSize records before them, which are
    *    marked `indexed = true` so that they only serve as context next time.
    *
    * The flush task calls flush under the run lock every 10 seconds.
    */
   def start {
     logger.info("Starting indexer")
     indexingFuture = schedule(0, IndexingDurationSecs.max(ContextDurationSecs), TimeUnit.SECONDS) {
       if (!indexQueue.isEmpty) {
         val indexRecs = new ArrayList[IndexRecord]
         indexQueue drainTo indexRecs
         val indexRecsMap = indexRecs groupBy { r => (r.server, r.channel, r.botName) }

         val windowSize = 2 * ContextSize + 1
         for (indexRecBatch <- indexRecsMap.values) {
           val batchSize = indexRecBatch.size
           if (batchSize < ContextSize + 1) {
             // Too few lines to build any context yet; try again next run.
             // indexQueue is a PriorityBlockingQueue, whose constructor argument
             // is an initial capacity and not a bound, so offer always succeeds.
             indexRecBatch.foreach(indexQueue.offer)
           } else {
             val hasFullWindow = batchSize >= windowSize
             // Records at the head of the batch are indexed too: doIndex skips
             // the ones carried over with indexed = true, and fresh ones would
             // otherwise never be written.
             val indexableCount = if (hasFullWindow) batchSize - ContextSize else batchSize
             for (i <- 0 until indexableCount) {
               val from = (i - ContextSize).max(0)
               val window = indexRecBatch.slice(from, i + ContextSize + 1)
               doInLock {
                 doIndex(fillContext(indexRecBatch(i), window, i - from))
               }
             }

             // >= rather than >: a batch of exactly windowSize records also
             // leaves its last ContextSize records un-indexed, so they have to
             // be carried over as well.
             if (hasFullWindow) {
               indexRecBatch.slice(batchSize - 2 * ContextSize, batchSize)
                 .zipWithIndex
                 .map { r => if (r._2 < ContextSize) r._1.copy(indexed = true) else r._1 }
                 .foreach(indexQueue.put)
             }
           }
         }
       }
     }
     flushFuture = schedule(0, 10, TimeUnit.SECONDS) {
       doInLock(flush)
     }
   }
113 186
 
114 187
   def stop {
@@ -126,23 +199,22 @@ object Indexer extends Logging {
126 199
     }
127 200
   }
128 201
 
129
-  private def doIndex(indexReqs: List[IndexRequest]) {
130
-    val indexRequests = indexReqs.groupBy { r =>
131
-      (r.server, r.channel, r.botName)
132
-    }
202
+  def ctxToStr(ctx : List[ChatLine]) =
203
+    ctx.map { line => s"${line.timestamp} ${line.user}: ${line.message}" }  mkString "\n"
133 204
 
134
-    for (((server, channel, botName), indexRequestBatch) <- indexRequests) {
205
+  private def doIndex(indexRecord: IndexRecord) {
206
+    val IndexRecord(server, channel, botName, chatLine, indexed) = indexRecord
207
+    if (!indexed) {
135 208
       val indexDir = getIndexDir(server, channel, botName)
136 209
       val indexWriter = getIndexWriter(indexDir)
137
-      for (indexRequest <- indexRequestBatch;
138
-           chatLine     <- indexRequest.chatLines) {
139
-        val tsField = new LongField(ChatLine.TS, chatLine.timestamp, Field.Store.YES)
140
-        val userField = new StringField(ChatLine.USER, chatLine.user, Field.Store.YES)
141
-        val msgField = new TextField(ChatLine.MSG, chatLine.message, Field.Store.YES)
142
-        indexWriter.addDocument(List(tsField, userField, msgField), indexWriter.getAnalyzer)
143
-        logger.debug("Indexed : [{} {} {}] [{}] {}: {}",
144
-            server, channel, botName, chatLine.timestamp.toString, chatLine.user, chatLine.message)
145
-      }
210
+      val ts = new LongField(ChatLine.TS, chatLine.timestamp, Field.Store.YES)
211
+      val user = new StringField(ChatLine.USER, chatLine.user, Field.Store.YES)
212
+      val msg = new TextField(ChatLine.MSG, chatLine.message, Field.Store.YES)
213
+      val ctxBfr = new TextField(ChatLine.CTXB, ctxToStr(chatLine.contextBefore), Field.Store.YES)
214
+      val ctxAft = new TextField(ChatLine.CTXA, ctxToStr(chatLine.contextAfter), Field.Store.YES)
215
+      indexWriter.addDocument(List(ts, user, msg, ctxBfr, ctxAft), indexWriter.getAnalyzer)
216
+      logger.debug("Indexed : [{} {} {}] [{}] {}: {}",
217
+          server, channel, botName, chatLine.timestamp.toString, chatLine.user, chatLine.message)
146 218
     }
147 219
   }
148 220
 

+ 22
- 8
src/main/scala/net/abhinavsarkar/ircsearch/lucene/Searcher.scala View File

@@ -5,12 +5,13 @@ import java.text.ParseException
5 5
 import java.text.SimpleDateFormat
6 6
 
7 7
 import scala.collection.JavaConversions._
8
+import scala.collection.immutable.Map
8 9
 import scala.collection.mutable
9 10
 import scala.collection.mutable.Buffer
10 11
 
11 12
 import org.apache.lucene.analysis.Analyzer
12 13
 import org.apache.lucene.queries.ChainedFilter
13
-import org.apache.lucene.queryparser.classic.QueryParser
14
+import org.apache.lucene.queryparser.classic.MultiFieldQueryParser
14 15
 import org.apache.lucene.search.BooleanClause
15 16
 import org.apache.lucene.search.BooleanQuery
16 17
 import org.apache.lucene.search.Filter
@@ -32,6 +33,7 @@ import net.abhinavsarkar.ircsearch.model._
32 33
 object Searcher extends Logging {
33 34
 
34 35
   val MaxHits = 1000
36
+  val MessageFieldBoost = java.lang.Float.valueOf(2.0f)
35 37
 
36 38
   private val searcherMgrs = mutable.Map[String, SearcherManager]()
37 39
 
@@ -57,7 +59,9 @@ object Searcher extends Logging {
57 59
   }
58 60
 
59 61
   /**
    * Creates a query parser that searches the message field and both context
    * fields, with the message field boosted by MessageFieldBoost.
    */
   private def mkQueryParser(analyzer : Analyzer) = {
     val fields = Array(ChatLine.MSG, ChatLine.CTXB, ChatLine.CTXA)
     val boosts = Map(ChatLine.MSG -> MessageFieldBoost)
     new MultiFieldQueryParser(Indexer.LUCENE_VERSION, fields, analyzer, boosts)
   }
61 65
 
62 66
   private def filterifyQuery(query : Query) : Query =
63 67
     query match {
@@ -130,6 +134,8 @@ object Searcher extends Logging {
130 134
     }
131 135
   }
132 136
 
137
+  private val DocFields = List(ChatLine.USER, ChatLine.TS, ChatLine.MSG, ChatLine.CTXB, ChatLine.CTXA)
138
+
133 139
   private def doSearch(indexDir : String, query : Query, page : Int, pageSize : Int)
134 140
     : (Int, List[(ChatLine, Float)]) = {
135 141
     val searcherMgr = getSearcherMgr(indexDir)
@@ -139,14 +145,22 @@ object Searcher extends Logging {
139 145
       val topDocs = indexSearcher.search(query, MaxHits.min((page + 1) * pageSize),
140 146
           new Sort(SortField.FIELD_SCORE, new SortField(ChatLine.TS, SortField.Type.LONG, true)))
141 147
       val docs = topDocs.scoreDocs
142
-        .drop(page * pageSize)
143
-        .map { sd =>
144
-          val score = sd.score
145
-          val doc = indexSearcher.doc(sd.doc).getFields.foldLeft(mutable.Map[String, String]()) {
146
-            (map, field) => map += (field.name -> field.stringValue)
148
+      .drop(page * pageSize)
149
+      .map { sd =>
150
+        val score = sd.score
151
+        val doc = indexSearcher.doc(sd.doc).getFields.foldLeft(mutable.Map[String, String]()) {
152
+          (map, field) => map += (field.name -> field.stringValue)
147 153
         }
148 154
 
149
-        val chatLine = new ChatLine(doc(ChatLine.USER), doc(ChatLine.TS).toLong, doc(ChatLine.MSG))
155
+        val List(user, timestamp, message, contextBefore, contextAfter) = DocFields.map(doc)
156
+
157
+        val LineRe = "(\\d+) (.*?): (.*)".r
158
+        val List(ctxBefore, ctxAfter) = List(contextBefore, contextAfter).map {
159
+          _.split('\n').filterNot(_.isEmpty).map {
160
+            case LineRe(timestamp, user, message) => new ChatLine(user, timestamp.toLong, message)
161
+          }}
162
+
163
+        val chatLine = new ChatLine(user, timestamp.toLong, message, ctxBefore.toList, ctxAfter.toList)
150 164
         (chatLine, score)
151 165
       }
152 166
       (topDocs.totalHits, docs.toList)

+ 5
- 1
src/main/scala/net/abhinavsarkar/ircsearch/model.scala View File

@@ -5,9 +5,13 @@ object ChatLine {
5 5
   val USER = "user"
6 6
   val TS = "ts"
7 7
   val MSG = "msg"
8
+  val CTXB = "ctxb"
9
+  val CTXA = "ctxa"
8 10
 }
9 11
 
10
-case class ChatLine(user : String, timestamp : Long, message : String)
12
+case class ChatLine(user : String, timestamp : Long, message : String,
13
+    contextBefore : List[ChatLine] = List(),
14
+    contextAfter : List[ChatLine] = List())
11 15
 
12 16
 /**
  * A request to index a batch of chat lines for one server/channel/bot.
  *
  * @param server    server the lines were logged on
  * @param channel   channel the lines were logged in
  * @param botName   name of the bot that logged the lines
  * @param chatLines lines to index
  */
 case class IndexRequest(
     server : String,
     channel : String,
     botName : String,
     chatLines : List[ChatLine])

Loading…
Cancel
Save