From 0d5a1bf2080a40e6b2ddf2160935f4645f8ce8b3 Mon Sep 17 00:00:00 2001
From: Andrew Valencik
Date: Wed, 17 Aug 2022 07:26:33 -0400
Subject: [PATCH 1/2] Add initial benchmark setup

---
 .github/workflows/ci.yml                      |  4 +-
 .../LuceneStreamingBenchmark.scala            | 77 +++++++++++++++++++
 build.sbt                                     | 11 ++-
 project/plugins.sbt                           |  1 +
 4 files changed, 90 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1398721..199388e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -81,11 +81,11 @@ jobs:
       - name: Make target directories
         if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
-        run: mkdir -p lucene/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target
+        run: mkdir -p lucene/target benchmarks/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target

       - name: Compress target directories
         if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
-        run: tar cf targets.tar lucene/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target
+        run: tar cf targets.tar lucene/target benchmarks/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target

       - name: Upload target directories
         if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
diff --git a/benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala b/benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala
new file mode 100644
index 0000000..2a1dd30
--- /dev/null
+++ b/benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2022 Pig.io
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package textmogrify
+package benchmarks
+
+import cats.effect.IO
+import cats.effect.unsafe.implicits.global
+import fs2.Stream
+import fs2.io.file.{Files, Path}
+
+import java.util.concurrent.TimeUnit
+import org.openjdk.jmh.annotations._
+import textmogrify.lucene.AnalyzerPipe
+import textmogrify.lucene.AnalyzerBuilder
+
+/** To run the benchmark from within sbt:
+ *
+ *   jmh:run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneStreamingBenchmark
+ *
+ * Which means "10 iterations", "10 warm-up iterations", "2 forks", "1 thread". Note that, as a
+ * rule of thumb, benchmarks should usually be run with at least 10 iterations; more is
+ * better.
+ */
+@State(Scope.Thread)
+@BenchmarkMode(Array(Mode.Throughput))
+@OutputTimeUnit(TimeUnit.SECONDS)
+class LuceneStreamingBenchmark {
+
+  var asciiBytes: Array[Byte] = _
+  @Setup
+  def setup(): Unit =
+    asciiBytes = Files[IO]
+      .readAll(Path("../LICENSE"))
+      .compile
+      .to(Array)
+      .unsafeRunSync()
+
+  @Benchmark
+  def tokenizeBytesTokenN1(): String = {
+    val analyzer = AnalyzerBuilder.default.withLowerCasing.build[IO]
+    val pipe = AnalyzerPipe.fromResource(analyzer)
+    val bytes: Stream[IO, Byte] = Stream.emits(asciiBytes)
+    pipe
+      .tokenizeBytes(bytes, 1)
+      .compile
+      .last
+      .unsafeRunSync()
+      .get
+  }
+
+  @Benchmark
+  def tokenizeBytesTokenN128(): String = {
+    val analyzer = AnalyzerBuilder.default.withLowerCasing.build[IO]
+    val pipe = AnalyzerPipe.fromResource(analyzer)
+    val bytes: Stream[IO, Byte] = Stream.emits(asciiBytes)
+    pipe
+      .tokenizeBytes(bytes, 128)
+      .compile
+      .last
+      .unsafeRunSync()
+      .get
+  }
+}
diff --git a/build.sbt b/build.sbt
index 9be822e..44cff19 100644
--- a/build.sbt
+++ b/build.sbt
@@ -33,7 +33,7 @@ val luceneV = "9.3.0"
 val munitV = "1.0.0-M6"
 val munitCatsEffectV = "2.0.0-M1"

-lazy val root = tlCrossRootProject.aggregate(lucene, example, unidocs)
+lazy val root = tlCrossRootProject.aggregate(lucene, example, unidocs, benchmarks)

 lazy val lucene = project
   .in(file("lucene"))
@@ -75,3 +75,12 @@ lazy val unidocs = project
     name := "textmogrify-docs",
     ScalaUnidoc / unidoc / unidocProjectFilter := inProjects(lucene),
   )
+
+lazy val benchmarks = project
+  .in(file("benchmarks"))
+  .dependsOn(lucene)
+  .settings(
+    name := "textmogrify-benchmarks",
+    libraryDependencies += "org.typelevel" %% "cats-effect" % catsEffectV,
+  )
+  .enablePlugins(NoPublishPlugin, JmhPlugin)
diff --git a/project/plugins.sbt b/project/plugins.sbt
index a79aec3..109da51 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,2 +1,3 @@
 addSbtPlugin("org.typelevel" % "sbt-typelevel" % "0.5.0-M5")
 addSbtPlugin("org.typelevel" % "sbt-typelevel-site" % "0.5.0-M5")
+addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.3")

From f2addfd80fd2735c8fdd0f6144bf455b1f0f04df Mon Sep 17 00:00:00 2001
From: Andrew Valencik
Date: Mon, 22 Aug 2022 19:36:54 -0400
Subject: [PATCH 2/2] Add initial AnalyzerBuilder tokenizer benchmark

---
 .../LuceneTokenizationBenchmark.scala | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala

diff --git a/benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala b/benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala
new file mode 100644
index 0000000..eb913ff
--- /dev/null
+++ b/benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2022 Pig.io
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package textmogrify
+package benchmarks
+
+import cats.syntax.all._
+import cats.effect.IO
+import cats.effect.unsafe.implicits.global
+import fs2.text
+import fs2.io.file.{Files, Path}
+
+import java.util.concurrent.TimeUnit
+import org.openjdk.jmh.annotations._
+import textmogrify.lucene.AnalyzerBuilder
+
+/** To run the benchmark from within sbt:
+ *
+ *   jmh:run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneTokenizationBenchmark
+ *
+ * Which means "10 iterations", "10 warm-up iterations", "2 forks", "1 thread". Note that, as a
+ * rule of thumb, benchmarks should usually be run with at least 10 iterations; more is
+ * better.
+ */
+@State(Scope.Thread)
+@BenchmarkMode(Array(Mode.Throughput))
+@OutputTimeUnit(TimeUnit.SECONDS)
+class LuceneTokenizationBenchmark {
+
+  var lines: Vector[String] = _
+  @Setup
+  def setup(): Unit =
+    lines = Files[IO]
+      .readAll(Path("../LICENSE"))
+      .through(text.utf8.decode)
+      .through(text.lines)
+      .compile
+      .toVector
+      .unsafeRunSync()
+
+  @Benchmark
+  def doNothing(): Vector[String] = {
+    val tokenizer = AnalyzerBuilder.default.withLowerCasing.tokenizer[IO]
+    tokenizer
+      .use(_ => lines.traverse(x => IO.pure(Vector(x))))
+      .unsafeRunSync()
+      .last
+  }
+
+  @Benchmark
+  def manualToLowerCaseAndSplit(): Vector[String] = {
+    val tokenizer = AnalyzerBuilder.default.withLowerCasing.tokenizer[IO]
+    tokenizer
+      .use(_ => lines.traverse(x => IO.pure(x.toLowerCase.split(" ").toVector)))
+      .unsafeRunSync()
+      .last
+  }
+
+  @Benchmark
+  def tokenizeAndLowerCase(): Vector[String] = {
+    val tokenizer = AnalyzerBuilder.default.withLowerCasing.tokenizer[IO]
+    tokenizer
+      .use(f => lines.traverse(f))
+      .unsafeRunSync()
+      .last
+  }
+}
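
A usage note (a sketch, not part of either patch): the `jmh:run` command quoted in both
scaladocs assumes `benchmarks` is the active sbt project. Because the root project only
aggregates `benchmarks`, running from the root sbt shell presumably needs the project
prefix, e.g. in slash syntax:

  benchmarks / Jmh / run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneStreamingBenchmark

JMH treats that last argument as a regular expression matched against benchmark names, so a
shorter fragment such as LuceneStreaming would select the same class.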