From aeed5458f9eb34bd9f204586749fc4cf06d53425 Mon Sep 17 00:00:00 2001
From: Claire McGinty
Date: Tue, 17 Sep 2024 12:52:43 -0400
Subject: [PATCH] add JMH benchmark for Parquet

---
 build.sbt                                     |  8 ++-
 .../scala/magnolify/jmh/MagnolifyBench.scala  | 56 ++++++++++++++++++-
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/build.sbt b/build.sbt
index f4ed68c96..3d9ec239c 100644
--- a/build.sbt
+++ b/build.sbt
@@ -708,6 +708,7 @@ lazy val jmh: Project = project
     cats % Test,
     datastore % Test,
     guava % Test,
+    parquet % "test->test",
     protobuf % "test->test",
     scalacheck % Test,
     tensorflow % Test,
@@ -727,7 +728,12 @@ lazy val jmh: Project = project
      "com.google.apis" % "google-api-services-bigquery" % bigqueryVersion % Test,
      "com.google.cloud.datastore" % "datastore-v1-proto-client" % datastoreVersion % Test,
      "org.apache.avro" % "avro" % avroVersion % Test,
-     "org.tensorflow" % "tensorflow-core-api" % tensorflowVersion % Test
+     "org.tensorflow" % "tensorflow-core-api" % tensorflowVersion % Test,
+     "org.apache.parquet" % "parquet-avro" % parquetVersion % Test,
+     "org.apache.parquet" % "parquet-column" % parquetVersion % Test,
+     "org.apache.parquet" % "parquet-hadoop" % parquetVersion % Test,
+     "org.apache.hadoop" % "hadoop-common" % hadoopVersion % Test,
+     "org.apache.hadoop" % "hadoop-mapreduce-client-core" % hadoopVersion % Test
     )
   )
 

diff --git a/jmh/src/test/scala/magnolify/jmh/MagnolifyBench.scala b/jmh/src/test/scala/magnolify/jmh/MagnolifyBench.scala
index ccf745160..3c72bf149 100644
--- a/jmh/src/test/scala/magnolify/jmh/MagnolifyBench.scala
+++ b/jmh/src/test/scala/magnolify/jmh/MagnolifyBench.scala
@@ -16,10 +16,13 @@
 
 package magnolify.jmh
 
-import java.util.concurrent.TimeUnit
+import magnolify.jmh.MagnolifyBench.nested
+import magnolify.parquet.{ParquetType, TestInputFile, TestOutputFile}
 
+import java.util.concurrent.TimeUnit
 import magnolify.scalacheck.auto._
 import magnolify.test.Simple._
+import org.apache.parquet.hadoop.{ParquetReader, ParquetWriter}
 import org.scalacheck._
 import org.openjdk.jmh.annotations._
 
@@ -87,6 +90,57 @@ class AvroBench {
   @Benchmark def avroSchema: Schema = AvroType[Nested].schema
 }
 
+@State(Scope.Benchmark)
+class ParquetReadState {
+  private val parquetType = ParquetType[Nested]
+  var out: TestOutputFile = null
+  var reader: ParquetReader[Nested] = null
+
+  @Setup(Level.Invocation)
+  def setup(): Unit = {
+    out = new TestOutputFile
+    val writer = parquetType.writeBuilder(out).build()
+    writer.write(nested)
+    writer.close()
+
+    val in = new TestInputFile(out.getBytes)
+    reader = parquetType.readBuilder(in).build()
+  }
+
+  @TearDown(Level.Invocation)
+  def tearDown(): Unit = {
+    reader.close()
+  }
+}
+
+@State(Scope.Benchmark)
+class ParquetWriteState {
+  private val parquetType = ParquetType[Nested]
+  var writer: ParquetWriter[Nested] = null
+
+  @Setup(Level.Invocation)
+  def setup(): Unit = {
+    val out = new TestOutputFile
+    writer = parquetType.writeBuilder(out).build()
+  }
+
+  @TearDown(Level.Invocation)
+  def tearDown(): Unit = {
+    writer.close()
+  }
+}
+
+
+@BenchmarkMode(Array(Mode.AverageTime))
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+class ParquetBench {
+  import MagnolifyBench._
+
+  @Benchmark def parquetWrite(state: ParquetWriteState): Unit = state.writer.write(nested)
+  @Benchmark def parquetRead(state: ParquetReadState): Nested = state.reader.read()
+}
+
 @BenchmarkMode(Array(Mode.AverageTime))
 @OutputTimeUnit(TimeUnit.NANOSECONDS)
 @State(Scope.Thread)
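
Reviewer note (not part of the patch): the benchmark imports TestInputFile and TestOutputFile from magnolify.parquet's test sources, which this diff does not show. For context, below is a minimal sketch of what such in-memory Parquet InputFile/OutputFile helpers typically look like, built on parquet-common's DelegatingSeekableInputStream and DelegatingPositionOutputStream; the class names, package, and details are assumptions for illustration, not the repository's actual code.

package magnolify.parquet

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import org.apache.parquet.io.{
  DelegatingPositionOutputStream,
  DelegatingSeekableInputStream,
  InputFile,
  OutputFile,
  PositionOutputStream,
  SeekableInputStream
}

// Hypothetical helper: buffers everything Parquet writes into an in-memory byte array.
class TestOutputFile extends OutputFile {
  private val baos = new ByteArrayOutputStream()

  def getBytes: Array[Byte] = baos.toByteArray

  override def create(blockSizeHint: Long): PositionOutputStream =
    new DelegatingPositionOutputStream(baos) {
      // Everything goes into baos, so the write position is simply its size.
      override def getPos(): Long = baos.size().toLong
    }

  override def createOrOverwrite(blockSizeHint: Long): PositionOutputStream =
    create(blockSizeHint)

  override def supportsBlockSize(): Boolean = false
  override def defaultBlockSize(): Long = 0L
}

// Hypothetical helper: serves previously written bytes back to Parquet with random access.
class TestInputFile(bytes: Array[Byte]) extends InputFile {
  override def getLength(): Long = bytes.length.toLong

  override def newStream(): SeekableInputStream = {
    val bais = new ByteArrayInputStream(bytes)
    new DelegatingSeekableInputStream(bais) {
      // available() is the number of unread bytes, so position = total - remaining.
      override def getPos(): Long = (bytes.length - bais.available()).toLong
      override def seek(newPos: Long): Unit = {
        bais.reset() // mark defaults to position 0
        bais.skip(newPos)
      }
    }
  }
}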
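
On the design: both state classes use @Setup/@TearDown at Level.Invocation, so every parquetWrite call measures a write against a fresh ParquetWriter over a new in-memory output, and every parquetRead call measures a read from a fresh ParquetReader over freshly written bytes, with the fixture cost excluded from the measured methods (subject to the usual JMH caveats about invocation-level fixtures for very short operations). Assuming the jmh module keeps its existing sbt-jmh wiring, an invocation along the lines of sbt "jmh/Jmh/run .*ParquetBench.*" should run just the new benchmarks; the exact task name and filter are an assumption here, not something this patch configures.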