Skip to content

Commit

Permalink
Merge pull request #23 from valencik/benchmarks
Browse files Browse the repository at this point in the history
Add Benchmarks
  • Loading branch information
valencik authored Aug 23, 2022
2 parents d69595c + f2addfd commit 87ef5fc
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ jobs:

- name: Make target directories
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
run: mkdir -p lucene/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target
run: mkdir -p lucene/target benchmarks/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target

- name: Compress target directories
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
run: tar cf targets.tar lucene/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target
run: tar cf targets.tar lucene/target benchmarks/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target

- name: Upload target directories
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright 2022 Pig.io
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package textmogrify
package benchmarks

import cats.effect.IO
import cats.effect.unsafe.implicits.global
import fs2.Stream
import fs2.io.file.{Files, Path}

import java.util.concurrent.TimeUnit
import org.openjdk.jmh.annotations._
import textmogrify.lucene.AnalyzerPipe
import textmogrify.lucene.AnalyzerBuilder

/** JMH throughput benchmarks for streaming byte tokenization through an `AnalyzerPipe`.
  *
  * To run the benchmark from within sbt:
  *
  * jmh:run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneStreamingBenchmark
  *
  * The flags request 10 measurement iterations (`-i`), 10 warm-up iterations (`-wi`), 2 forks
  * (`-f`) and 1 thread (`-t`). As a rule of thumb, benchmarks should run for at least 10
  * iterations; more is better.
  */
@State(Scope.Thread)
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
class LuceneStreamingBenchmark {

  /** Raw bytes of the input file, populated by [[setup]] before measurement.
    *
    * NOTE(review): the name suggests the content is ASCII; nothing here enforces that.
    */
  var asciiBytes: Array[Byte] = _

  /** Reads the whole input file into memory so that file I/O is not part of the measured work.
    *
    * The path is relative, so it is resolved against the JVM's working directory.
    * NOTE(review): presumably that is the `benchmarks` module directory when run via sbt -- confirm.
    */
  @Setup
  def setup(): Unit =
    asciiBytes = Files[IO]
      .readAll(Path("../LICENSE"))
      .compile
      .to(Array)
      .unsafeRunSync()

  /** Tokenizes [[asciiBytes]] with `tokenN = 1` and returns the last emitted element. */
  @Benchmark
  def tokenizeBytesTokenN1(): String = lastTokenized(1)

  /** Tokenizes [[asciiBytes]] with `tokenN = 128` and returns the last emitted element. */
  @Benchmark
  def tokenizeBytesTokenN128(): String = lastTokenized(128)

  /** Shared body of the benchmarks: builds a lower-casing analyzer and a pipe from it, streams
    * [[asciiBytes]] through `tokenizeBytes`, and returns the last element of the output.
    *
    * The analyzer and pipe are constructed on every call, so their construction is part of the
    * measured time, exactly as when each benchmark method built them inline.
    *
    * @param tokenN
    *   value passed as the second argument of `tokenizeBytes`; its precise meaning is defined by
    *   `AnalyzerPipe`
    * @return
    *   the last element emitted by the tokenized stream
    * @throws java.util.NoSuchElementException
    *   if the tokenized stream emits nothing
    */
  private def lastTokenized(tokenN: Int): String = {
    val analyzer = AnalyzerBuilder.default.withLowerCasing.build[IO]
    val pipe = AnalyzerPipe.fromResource(analyzer)
    val bytes: Stream[IO, Byte] = Stream.emits(asciiBytes)
    pipe
      .tokenizeBytes(bytes, tokenN)
      .compile
      // lastOrError fails with NoSuchElementException on an empty stream, which keeps the
      // failure mode of the former `.last ... .get` without the partial Option.get call.
      .lastOrError
      .unsafeRunSync()
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright 2022 Pig.io
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package textmogrify
package benchmarks

import cats.syntax.all._
import cats.effect.IO
import cats.effect.unsafe.implicits.global
import fs2.text
import fs2.io.file.{Files, Path}

import java.util.concurrent.TimeUnit
import org.openjdk.jmh.annotations._
import textmogrify.lucene.AnalyzerBuilder

/** JMH throughput benchmarks comparing ways of turning each input line into a `Vector[String]`.
  *
  * To run the benchmark from within sbt:
  *
  * jmh:run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneTokenizationBenchmark
  *
  * The flags request 10 measurement iterations (`-i`), 10 warm-up iterations (`-wi`), 2 forks
  * (`-f`) and 1 thread (`-t`). As a rule of thumb, benchmarks should run for at least 10
  * iterations; more is better.
  */
@State(Scope.Thread)
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
class LuceneTokenizationBenchmark {

  /** Lines of the input file, populated by [[setup]] before measurement. */
  var lines: Vector[String] = _

  /** Reads the input file, decodes it as UTF-8 and splits it into lines, all up front so that
    * file I/O and decoding are not part of the measured work.
    *
    * The path is relative, so it is resolved against the JVM's working directory.
    */
  @Setup
  def setup(): Unit = {
    val decoded = Files[IO].readAll(Path("../LICENSE")).through(text.utf8.decode)
    lines = decoded.through(text.lines).compile.toVector.unsafeRunSync()
  }

  /** Baseline: acquires the tokenizer resource but ignores it, wrapping each line in a
    * single-element vector. Returns the result for the last line.
    */
  @Benchmark
  def doNothing(): Vector[String] = {
    val program = AnalyzerBuilder.default.withLowerCasing
      .tokenizer[IO]
      .use(_ => lines.traverse(line => IO.pure(Vector(line))))
    program.unsafeRunSync().last
  }

  /** Plain-`String` comparison point: acquires the tokenizer resource but ignores it,
    * lower-casing each line and splitting it on single spaces. Returns the result for the last
    * line.
    */
  @Benchmark
  def manualToLowerCaseAndSplit(): Vector[String] = {
    val program = AnalyzerBuilder.default.withLowerCasing
      .tokenizer[IO]
      .use(_ => lines.traverse(line => IO.pure(line.toLowerCase.split(" ").toVector)))
    program.unsafeRunSync().last
  }

  /** Applies the function provided by the tokenizer resource to every line and returns the
    * result for the last line.
    */
  @Benchmark
  def tokenizeAndLowerCase(): Vector[String] = {
    val program = AnalyzerBuilder.default.withLowerCasing
      .tokenizer[IO]
      .use(tokenize => lines.traverse(tokenize))
    program.unsafeRunSync().last
  }
}
11 changes: 10 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ val luceneV = "9.3.0"
val munitV = "1.0.0-M6"
val munitCatsEffectV = "2.0.0-M1"

lazy val root = tlCrossRootProject.aggregate(lucene, example, unidocs)
lazy val root = tlCrossRootProject.aggregate(lucene, example, unidocs, benchmarks)

lazy val lucene = project
.in(file("lucene"))
Expand Down Expand Up @@ -75,3 +75,12 @@ lazy val unidocs = project
name := "textmogrify-docs",
ScalaUnidoc / unidoc / unidocProjectFilter := inProjects(lucene),
)

// JMH benchmarks for the lucene module; NoPublishPlugin keeps this project out of published artifacts.
lazy val benchmarks = project
  .in(file("benchmarks"))
  .enablePlugins(NoPublishPlugin, JmhPlugin)
  .dependsOn(lucene)
  .settings(
    name := "textmogrify-benchmarks",
    libraryDependencies ++= Seq(
      "org.typelevel" %% "cats-effect" % catsEffectV
    ),
  )
1 change: 1 addition & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
addSbtPlugin("org.typelevel" % "sbt-typelevel" % "0.5.0-M5")
addSbtPlugin("org.typelevel" % "sbt-typelevel-site" % "0.5.0-M5")
// sbt-jmh supplies the JmhPlugin that build.sbt enables on the `benchmarks` project.
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.3")

0 comments on commit 87ef5fc

Please sign in to comment.