Browse Source

Added Lucene indexer and searcher, plus HTTP endpoints (/index and /search) for them.

Abhinav Sarkar 7 years ago
parent
commit
c742979d6e

+ 32
- 3
pom.xml View File

@@ -10,7 +10,8 @@
10 10
 		<maven.compiler.source>1.6</maven.compiler.source>
11 11
 		<maven.compiler.target>1.6</maven.compiler.target>
12 12
 		<encoding>UTF-8</encoding>
13
-		<scala.version>2.10.1</scala.version>
13
+		<scala.version>2.10.0</scala.version>
14
+		<lucene.version>4.3.0</lucene.version>
14 15
 		<project.dependencyDir>${project.build.directory}/dependency</project.dependencyDir>
15 16
 	</properties>
16 17
 
@@ -25,6 +26,11 @@
25 26
 			<artifactId>scala-reflect</artifactId>
26 27
 			<version>${scala.version}</version>
27 28
 		</dependency>
29
+		<dependency>
30
+			<groupId>org.scala-lang</groupId>
31
+			<artifactId>scala-compiler</artifactId>
32
+			<version>${scala.version}</version>
33
+		</dependency>
28 34
 		<dependency>
29 35
 			<groupId>io.netty</groupId>
30 36
 			<artifactId>netty</artifactId>
@@ -42,8 +48,31 @@
42 48
 			<version>1.0.0</version>
43 49
 			<scope>runtime</scope>
44 50
 		</dependency>
45
-
46
-		<!-- Test -->
51
+		<dependency>
52
+			<groupId>net.liftweb</groupId>
53
+			<artifactId>lift-json_2.10</artifactId>
54
+			<version>2.5-RC5</version>
55
+		</dependency>
56
+		<dependency>
57
+			<groupId>org.scala-lang</groupId>
58
+			<artifactId>jline</artifactId>
59
+			<version>2.11.0-M2</version>
60
+		</dependency>
61
+		<dependency>
62
+			<groupId>org.apache.lucene</groupId>
63
+			<artifactId>lucene-core</artifactId>
64
+			<version>${lucene.version}</version>
65
+		</dependency>
66
+		<dependency>
67
+			<groupId>org.apache.lucene</groupId>
68
+			<artifactId>lucene-analyzers-common</artifactId>
69
+			<version>${lucene.version}</version>
70
+		</dependency>
71
+		<dependency>
72
+			<groupId>org.apache.lucene</groupId>
73
+			<artifactId>lucene-queryparser</artifactId>
74
+			<version>${lucene.version}</version>
75
+		</dependency>
47 76
 	</dependencies>
48 77
 
49 78
 	<build>

+ 1
- 0
src/main/scala/net/abhinavsarkar/ircsearch/HttpRequestHandler.scala View File

@@ -36,6 +36,7 @@ trait HttpRequestHandler extends ChannelInboundMessageHandlerAdapter[HttpRequest
36 36
   /**
    * Builds a 200 OK response whose content is `body`, labels it as JSON and hands it
    * to `writeResponse`.
    *
    * @param ctx     channel context the response is written through
    * @param request request being answered
    * @param body    response payload; it is encoded as UTF-8
    * @return the response object that was passed to `writeResponse`
    */
   protected def sendSuccess(ctx : ChannelHandlerContext, request : HttpRequest, body : String) : HttpResponse = {
     val response = new DefaultHttpResponse(HttpVersion.HTTP_1_1, HttpResponseStatus.OK)
     // Encode explicitly as UTF-8: a bare getBytes uses the platform default charset, which
     // corrupts non-ASCII text on hosts whose default is not UTF-8.
     response.setContent(Unpooled.copiedBuffer(body.getBytes("UTF-8")))
     response.setHeader(CONTENT_TYPE, "application/json")
     writeResponse(ctx, request, response)
     response
   }

+ 56
- 14
src/main/scala/net/abhinavsarkar/ircsearch/Server.scala View File

@@ -1,7 +1,13 @@
1 1
 package net.abhinavsarkar.ircsearch
2 2
 
3 3
 import java.net.InetSocketAddress
4
+import java.nio.charset.Charset
5
+
6
+import scala.concurrent.ExecutionContext.Implicits._
7
+import scala.concurrent.future
8
+
4 9
 import com.typesafe.scalalogging.slf4j.Logging
10
+
5 11
 import io.netty.bootstrap.ServerBootstrap
6 12
 import io.netty.channel.ChannelHandler.Sharable
7 13
 import io.netty.channel.ChannelHandlerContext
@@ -14,11 +20,12 @@ import io.netty.handler.codec.http.HttpContentCompressor
14 20
 import io.netty.handler.codec.http.HttpRequest
15 21
 import io.netty.handler.codec.http.HttpRequestDecoder
16 22
 import io.netty.handler.codec.http.HttpResponseEncoder
17
-import io.netty.handler.codec.http.DefaultHttpResponse
18
-import io.netty.handler.codec.http.HttpVersion
19
-import io.netty.handler.codec.http.HttpResponseStatus
20
-import io.netty.buffer.Unpooled
21
-import java.nio.charset.Charset
23
+import net.abhinavsarkar.ircsearch.lucene.Indexer
24
+import net.abhinavsarkar.ircsearch.lucene.Searcher
25
+import net.abhinavsarkar.ircsearch.model.IndexRequest
26
+import net.abhinavsarkar.ircsearch.model.SearchRequest
27
+import net.liftweb.json.DefaultFormats
28
+import net.liftweb.json.Serialization
22 29
 
23 30
 object Server extends App with Logging {
24 31
 
@@ -31,13 +38,12 @@ object Server extends App with Logging {
31 38
 
32 39
     val httpRequestRouter = new HttpRequestRouter {
33 40
       val Echo = "^/echo$".r
41
+      val Index = "^/index$".r
42
+      val Search = "^/search$".r
34 43
       def route = {
35
-        case Echo() => new HttpRequestHandler {
36
-          override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest) {
37
-            val content = request.getContent().toString(Charset.forName("UTF-8"))
38
-            logRequest(ctx, request, sendSuccess(ctx, request, content))
39
-          }
40
-        }
44
+        case Echo() => EchoHandler
45
+        case Index() => IndexHandler
46
+        case Search() => SearchHandler
41 47
       }
42 48
     }
43 49
 
@@ -58,7 +64,8 @@ object Server extends App with Logging {
58 64
     Runtime.getRuntime.addShutdownHook(
59 65
       new Thread("ShutdownHook") {
60 66
         override def run {
61
-          stopServer(server);
67
+          stopServer(server)
68
+          IndexHandler.stop
62 69
         }
63 70
       })
64 71
 
@@ -67,7 +74,8 @@ object Server extends App with Logging {
67 74
     } catch {
68 75
       case e : Exception => {
69 76
         logger.error("Exception while running server. Stopping server", e)
70
-        stopServer(server);
77
+        stopServer(server)
78
+        IndexHandler.stop
71 79
       }
72 80
     }
73 81
   }
@@ -78,4 +86,38 @@ object Server extends App with Logging {
78 86
     logger.info("Stopped server")
79 87
   }
80 88
 
81
-}
89
+}
90
+
91
/** Request handler that answers with the body of the request it received. */
@Sharable
object EchoHandler extends HttpRequestHandler {

  override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest): Unit = {
    // The request payload is decoded as UTF-8 text and sent straight back.
    val utf8 = Charset.forName("UTF-8")
    val body = request.getContent().toString(utf8)
    logRequest(ctx, request, sendSuccess(ctx, request, body))
  }

}
98
+
99
/**
 * Request handler for index requests.
 *
 * The request body is parsed as a JSON [[IndexRequest]] and handed to the indexer
 * asynchronously; the default response is sent without waiting for that work.
 */
@Sharable
object IndexHandler extends HttpRequestHandler with Logging {
  implicit val formats = DefaultFormats

  // Created and started on first use.
  lazy val indexer = { val indexer = new Indexer; indexer.start; indexer }

  override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest) {
    // Read the body on the calling thread so the asynchronous work below never touches
    // the request's buffer after the response has been sent.
    val content = request.getContent().toString(Charset.forName("UTF-8"))
    val indexing = future {
      val indexRequest = Serialization.read[IndexRequest](content)
      indexer.index(indexRequest)
    }
    // The client has already been answered by the time this can fail, so a failure
    // (e.g. a malformed body) must at least be logged instead of vanishing.
    indexing.onFailure {
      case e => logger.error("Exception while handling index request", e)
    }
    logRequest(ctx, request, sendDefaultResponse(ctx, request))
  }

  // NOTE(review): `indexer` is lazy, so calling stop before any index request creates
  // and starts an indexer just to stop it.
  def stop = indexer.stop
}
113
+
114
/**
 * Request handler for search requests: the request body is read as a JSON
 * [[SearchRequest]], the search is run, and the result is sent back serialized as JSON.
 */
@Sharable
object SearchHandler extends HttpRequestHandler {
  implicit val formats = DefaultFormats

  override def messageReceived(ctx: ChannelHandlerContext, request: HttpRequest): Unit = {
    val utf8 = Charset.forName("UTF-8")
    val requestJson = request.getContent().toString(utf8)
    val result = Searcher.search(Serialization.read[SearchRequest](requestJson))
    logRequest(ctx, request, sendSuccess(ctx, request, Serialization.write(result)))
  }

}

+ 138
- 0
src/main/scala/net/abhinavsarkar/ircsearch/lucene/Indexer.scala View File

@@ -0,0 +1,138 @@
1
+package net.abhinavsarkar.ircsearch.lucene
2
+
3
+import java.io.File
4
+import java.util.ArrayList
5
+import java.util.concurrent.Executors
6
+import java.util.concurrent.Future
7
+import java.util.concurrent.LinkedBlockingQueue
8
+import java.util.concurrent.TimeUnit
9
+import java.util.concurrent.locks.ReentrantLock
10
+
11
+import scala.collection.JavaConversions._
12
+
13
+import org.apache.lucene.analysis.Analyzer
14
+import org.apache.lucene.analysis.core.KeywordAnalyzer
15
+import org.apache.lucene.analysis.en.EnglishAnalyzer
16
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
17
+import org.apache.lucene.analysis.standard.StandardAnalyzer
18
+import org.apache.lucene.document.Field
19
+import org.apache.lucene.document.FieldType
20
+import org.apache.lucene.document.FieldType.NumericType
21
+import org.apache.lucene.index.IndexReader
22
+import org.apache.lucene.index.IndexWriter
23
+import org.apache.lucene.index.IndexWriterConfig
24
+import org.apache.lucene.search.IndexSearcher
25
+import org.apache.lucene.store.FSDirectory
26
+import org.apache.lucene.util.Version
27
+
28
+import com.typesafe.scalalogging.slf4j.Logging
29
+
30
+import net.abhinavsarkar.ircsearch.model.IndexRequest
31
+
32
/**
 * Buffers [[IndexRequest]]s in memory and writes them to Lucene indices from a
 * background task.
 *
 * `index` only enqueues. The task scheduled by `start` (first run immediately, then
 * 10 seconds after each run finishes) drains the queue and writes one batch per
 * (server, channel, botName) group.
 */
class Indexer extends Logging {

  import Indexer._

  private val indexQueue = new LinkedBlockingQueue[IndexRequest]
  private val scheduler = Executors.newSingleThreadScheduledExecutor
  // Held for the duration of an indexing run and while stopping.
  private val runLock = new ReentrantLock
  private var runFuture : Option[Future[_]] = None

  /** Enqueues a request for the next indexing run; returns the result of the queue's `offer`. */
  def index(indexRequest : IndexRequest) = indexQueue.offer(indexRequest)

  /** Schedules the periodic indexing task. */
  def start {
    logger.info("Starting indexer")
    runFuture = Some(scheduler.scheduleWithFixedDelay(
      new Runnable {
        def run {
          // Acquire before entering try: if the acquire itself failed, the finally
          // block would otherwise call unlock on a lock this thread does not hold.
          runLock.lock
          try {
            logger.debug("Running indexer")
            val indexReqs = new ArrayList[IndexRequest]
            indexQueue.drainTo(indexReqs)
            doIndex(indexReqs.toList)
          } catch {
            // Deliberately broad: an exception escaping a scheduled task suppresses
            // all of its later executions.
            case e : Throwable => logger.error("Exception while running indexer", e)
          } finally {
            runLock.unlock
          }
        }},
      0, 10, TimeUnit.SECONDS))
  }

  /**
   * Cancels the periodic task without interrupting a run in progress; taking the run
   * lock makes this wait for such a run to finish first.
   *
   * NOTE(review): requests still in the queue are not flushed and the scheduler
   * itself is not shut down here.
   */
  def stop {
    runLock.lock
    try {
      runFuture.foreach(_.cancel(false))
      runFuture = None
      logger.info("Stopped indexer")
    } finally {
      runLock.unlock
    }
  }

  /** Groups the requests by target index and writes each group independently. */
  private def doIndex(indexReqs: List[IndexRequest]) {
    val indexRequests = indexReqs.groupBy { r =>
      (r.server, r.channel, r.botName)
    }

    for (((server, channel, botName), indexRequestBatch) <- indexRequests) {
      try {
        indexBatch(server, channel, botName, indexRequestBatch)
      } catch {
        // A failure for one index must not discard the batches of the other indices
        // that were drained in the same run.
        case scala.util.control.NonFatal(e) =>
          logger.error(s"Exception while indexing [$server $channel $botName]", e)
      }
    }
  }

  /** Writes every chat line of the batch into the index of the given server/channel/bot. */
  private def indexBatch(server : String, channel : String, botName : String,
      indexRequestBatch : List[IndexRequest]) {
    val indexDir = getIndexDir(server, channel, botName)
    val analyzer = mkAnalyzer
    try {
      val indexWriter = mkIndexWriter(indexDir, analyzer)
      try {
        for (indexRequest <- indexRequestBatch;
             chatLine     <- indexRequest.chatLines) {
          val tsField = mkField("timestamp", chatLine.timestamp.toString, false)
          val userField = mkField("user", chatLine.user, true)
          val msgField = mkField("message", chatLine.message)
          indexWriter.addDocument(List(tsField, userField, msgField), analyzer)
          logger.debug("Indexed : [{} {} {}] [{}] {}: {}",
              server, channel, botName, chatLine.timestamp.toString, chatLine.user, chatLine.message)
        }
      } finally {
        indexWriter.close
      }
    } finally {
      // Runs even when opening or closing the writer throws, so the analyzer never leaks.
      analyzer.close
    }
  }

}
103
+
104
/** Lucene configuration and factory helpers shared by the indexer and the searcher. */
object Indexer {

  val LUCENE_VERSION = Version.LUCENE_43

  /**
   * Creates the analyzer used for chat lines: `user` is analyzed with a
   * [[KeywordAnalyzer]], `message` with an [[EnglishAnalyzer]], and any other field
   * with a [[StandardAnalyzer]]. The caller is responsible for closing it.
   */
  def mkAnalyzer : Analyzer = {
    val defAnalyzer = new StandardAnalyzer(LUCENE_VERSION)
    val fieldAnalyzers = Map(
        "user" -> new KeywordAnalyzer,
        "message" -> new EnglishAnalyzer(LUCENE_VERSION))

    new PerFieldAnalyzerWrapper(defAnalyzer, fieldAnalyzers)
  }

  /**
   * Opens an index writer on `dirPath`.
   *
   * @throws IllegalArgumentException if `dirPath` exists but is not a directory
   */
  private def mkIndexWriter(dirPath : String, analyzer : Analyzer) : IndexWriter = {
    val indexDir = new File(dirPath)
    // require rather than assert: assertions can be disabled at compile time, and this
    // is validation of runtime state.
    require(!indexDir.exists || indexDir.isDirectory, s"Index path is not a directory: $dirPath")
    new IndexWriter(FSDirectory.open(indexDir), new IndexWriterConfig(LUCENE_VERSION, analyzer))
  }

  /**
   * Returns the relative directory name of the index for the given server, channel
   * and bot.
   *
   * The parts come from request payloads, so path separators in them are replaced with
   * `_`; the result is therefore always a single path component starting with
   * `index-` and cannot point outside the working directory. Names that differ only
   * in a separator versus `_` map to the same directory.
   */
  def getIndexDir(server : String, channel : String, botName : String) : String = {
    def safe(part : String) : String =
      part.map { c => if (c == '/' || c == '\\') '_' else c }

    s"index-${safe(server)}-${safe(channel)}-${safe(botName)}"
  }

  /**
   * Creates a stored, indexed field.
   *
   * @param tokenized   whether the value is tokenized
   * @param numericType numeric type to set on the field type, if any
   */
  private def mkField(name : String, value : String,
      tokenized : Boolean = true, numericType : Option[NumericType] = None) : Field = {
    val fieldType = new FieldType
    fieldType.setStored(true)
    fieldType.setIndexed(true)
    fieldType.setTokenized(tokenized)
    numericType.foreach { fieldType.setNumericType }
    new Field(name, value, fieldType)
  }

}

+ 103
- 0
src/main/scala/net/abhinavsarkar/ircsearch/lucene/Searcher.scala View File

@@ -0,0 +1,103 @@
1
+package net.abhinavsarkar.ircsearch.lucene
2
+
3
+import com.typesafe.scalalogging.slf4j.Logging
4
+import org.apache.lucene.search.IndexSearcher
5
+import java.io.File
6
+import org.apache.lucene.index.IndexReader
7
+import org.apache.lucene.store.FSDirectory
8
+import org.apache.lucene.analysis.Analyzer
9
+import org.apache.lucene.queryparser.classic.QueryParser
10
+import org.apache.lucene.search.Query
11
+import scala.collection.immutable.Set
12
+import org.apache.lucene.search.BooleanQuery
13
+import org.apache.lucene.search.TermQuery
14
+import org.apache.lucene.search.BooleanClause
15
+import org.apache.lucene.search.QueryWrapperFilter
16
+import org.apache.lucene.search.Filter
17
+import net.abhinavsarkar.ircsearch.model.SearchRequest
18
+import net.abhinavsarkar.ircsearch.model.SearchResult
19
+import org.apache.lucene.search.Sort
20
+import org.apache.lucene.search.SortField
21
+import scala.collection.JavaConversions._
22
+import scala.collection.mutable
23
+import net.abhinavsarkar.ircsearch.model.ChatLine
24
+import net.abhinavsarkar.ircsearch.model.ChatLine
25
+import net.abhinavsarkar.ircsearch.model.SearchResult
26
+import net.abhinavsarkar.ircsearch.model.SearchResult
27
+
28
/** Runs search requests against the on-disk Lucene indices written by the indexer. */
object Searcher extends Logging {

  /**
   * Opens the index at `dirPath`, runs `body` with a searcher on it, and closes the
   * reader and the directory afterwards, so repeated searches do not leak file handles.
   * `body` must not let anything that depends on the open index escape.
   *
   * @throws IllegalArgumentException if `dirPath` is not an existing directory
   */
  private def withIndexSearcher[T](dirPath : String)(body : IndexSearcher => T) : T = {
    val indexDir = new File(dirPath)
    require(indexDir.exists && indexDir.isDirectory, s"No index directory at: $dirPath")

    val directory = FSDirectory.open(indexDir)
    try {
      val indexReader = IndexReader.open(directory)
      try {
        body(new IndexSearcher(indexReader))
      } finally {
        indexReader.close()
      }
    } finally {
      directory.close()
    }
  }

  private def mkQueryParser(analyzer : Analyzer) =
    new QueryParser(Indexer.LUCENE_VERSION, "message", analyzer)

  /**
   * Splits a parsed query into a scoring query and an optional filter.
   *
   * For a boolean query, top-level term clauses on one of `mustFields` are turned into
   * required clauses of a filter; every other clause stays in the scoring query. Any
   * other kind of query is returned unchanged with no filter.
   */
  private def filterifyQuery(query : Query, mustFields : Set[String]) : (Query, Option[Filter]) =
    query match {
      case boolQuery: BooleanQuery => {
        val newQuery = new BooleanQuery
        val filterQuery = new BooleanQuery
        for (clause <- boolQuery.getClauses) {
          clause.getQuery match {
            // Prohibited clauses (e.g. -user:x) stay in the scoring query: forcing them
            // to MUST would invert their meaning.
            case termQuery: TermQuery
                if (mustFields contains termQuery.getTerm.field) && !clause.isProhibited =>
              clause.setOccur(BooleanClause.Occur.MUST)
              filterQuery.add(clause)
            case _ =>
              newQuery.add(clause)
          }
        }

        if (filterQuery.clauses.isEmpty) {
          (newQuery, None)
        } else {
          // An empty boolean query matches nothing, so when every clause moved into the
          // filter the filter alone has to select the documents.
          val scoringQuery : Query =
            if (newQuery.clauses.isEmpty) new org.apache.lucene.search.MatchAllDocsQuery
            else newQuery
          (scoringQuery, Some(new QueryWrapperFilter(filterQuery)))
        }
      }
      case _ => (query, None)
    }

  /**
   * Executes the request against the index of its server/channel/bot and returns the
   * requested page of matching chat lines together with the total hit count.
   */
  def search(searchRequest : SearchRequest) : SearchResult = {
    logger.debug("Searching : [{} {} {}] {}",
      searchRequest.server, searchRequest.channel, searchRequest.botName, searchRequest.query)

    val indexDir = Indexer.getIndexDir(searchRequest.server, searchRequest.channel, searchRequest.botName)
    val analyzer = Indexer.mkAnalyzer
    try {
      val queryParser = mkQueryParser(analyzer)
      val (query, filter) = filterifyQuery(queryParser.parse(searchRequest.query), Set("user"))
      logger.debug("Query: {}, Filter: {}", query, filter)
      // Pages are treated as zero-based (the request default is 0); negative pages are
      // clamped to the first page.
      val offset = math.max(searchRequest.page, 0) * searchRequest.pageSize
      val (totalResults, results) = doSearch(indexDir, query, filter, offset, searchRequest.pageSize)
      val searchResults = SearchResult.fromSearchRequest(searchRequest)
        .copy(totalResults = totalResults, chatLines = results.map(_._1))
      logger.debug("Search results: {}", searchResults)
      searchResults
    } finally {
      analyzer.close
    }
  }

  /**
   * Runs the query sorted by score, then by timestamp descending, and returns the total
   * hit count plus up to `maxHits` hits after skipping the first `offset`.
   */
  private def doSearch(indexDir : String, query : Query, filter : Option[Filter],
      offset : Int, maxHits : Int) : (Int, List[(ChatLine, Float)]) =
    withIndexSearcher(indexDir) { indexSearcher =>
      // Lucene returns the top N hits, so fetch through the end of the requested page
      // and drop the preceding pages.
      val topDocs = indexSearcher.search(query, filter.orNull, offset + maxHits,
          new Sort(SortField.FIELD_SCORE, new SortField("timestamp", SortField.Type.LONG, true)))
      val docs = topDocs.scoreDocs.drop(offset).map { sd =>
        // NOTE(review): with an explicit Sort, Lucene may not populate sd.score — confirm
        // before relying on the score value.
        val score = sd.score
        val fields = indexSearcher.doc(sd.doc).getFields
          .map(field => field.name -> field.stringValue).toMap

        val chatLine = new ChatLine(fields("user"), fields("timestamp").toLong, fields("message"))
        (chatLine, score)
      }
      (topDocs.totalHits, docs.toList)
    }

}

+ 23
- 0
src/main/scala/net/abhinavsarkar/ircsearch/model.scala View File

@@ -0,0 +1,23 @@
1
+package net.abhinavsarkar.ircsearch.model
2
+
3
+
4
/** One chat message: who said it, its timestamp and its text. */
case class ChatLine(user : String, timestamp : Long, message : String)

/** Chat lines to be indexed for one server/channel/bot combination. */
case class IndexRequest(
    server : String,
    channel : String,
    botName : String,
    chatLines : List[ChatLine])

/** A query against the index of one server/channel/bot combination. */
case class SearchRequest(
    server : String,
    channel : String,
    botName : String,
    query: String,
    page : Int = 0,
    pageSize : Int = 10)

/** The outcome of a [[SearchRequest]]: the request's own fields plus the hits. */
case class SearchResult(
    server : String,
    channel : String,
    botName : String,
    query: String,
    page : Int,
    pageSize : Int,
    totalResults : Int,
    chatLines : List[ChatLine])

object SearchResult {

  /** Builds an empty result (zero total results, no chat lines) that echoes the request's fields. */
  def fromSearchRequest(searchRequest : SearchRequest) : SearchResult =
    SearchResult(
      server = searchRequest.server,
      channel = searchRequest.channel,
      botName = searchRequest.botName,
      query = searchRequest.query,
      page = searchRequest.page,
      pageSize = searchRequest.pageSize,
      totalResults = 0,
      chatLines = Nil)

}
23
+

Loading…
Cancel
Save