New JMH benchmark methods - vdot8s, neonVdot8s and sveVdot8s that imp…

…lement int8 dotProduct in C using Neon and SVE intrinsics respectively. Fallback to Neon if SVE instructions are not supported by target platform
apache · Jul 30, 2024 · e45224b · e45224b
1 parent 8a7d484
commit e45224b
Show file tree

Hide file tree

Showing 10 changed files with 340 additions and 8 deletions.
diff --git a/build.gradle b/build.gradle
@@ -21,6 +21,7 @@ import java.time.format.DateTimeFormatter
 plugins {
   id "base"
   id "lucene.build-infra"
+  id "c"
 
   alias(deps.plugins.dependencychecks)
   alias(deps.plugins.spotless) apply false
@@ -34,6 +35,7 @@ plugins {
   alias(deps.plugins.jacocolog) apply false
 }
 
+
 apply from: file('gradle/globals.gradle')
 
 // General metadata.

diff --git a/gradle/java/javac.gradle b/gradle/java/javac.gradle
@@ -24,7 +24,11 @@ allprojects { project ->
 
     // Use 'release' flag instead of 'source' and 'target'
     tasks.withType(JavaCompile) {
-      options.compilerArgs += ["--release", rootProject.minJavaVersion.toString()]
+      options.compilerArgs += ["--release", rootProject.minJavaVersion.toString(), "--enable-preview"]
+    }
+
+    tasks.withType(Test) {
+      jvmArgs += "--enable-preview"
     }
 
     // Configure warnings.
@@ -72,17 +76,19 @@ allprojects { project ->
         "-Xdoclint:-accessibility"
       ]
 
-      if (project.path == ":lucene:benchmark-jmh") {
+      if (project.path == ":lucene:benchmark-jmh" ) {
         // JMH benchmarks use JMH preprocessor and incubating modules.
       } else {
         // proc:none was added because of LOG4J2-1925 / JDK-8186647
         options.compilerArgs += [
             "-proc:none"
         ]
 
+        /**
         if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) {
           options.compilerArgs += "-Werror"
         }
+        */
       }
     }
   }

diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle
@@ -139,6 +139,8 @@ allprojects {
               ":lucene:test-framework"
       ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core')
 
+      jvmArgs '-Djava.library.path=' + file("${buildDir}/libs/dotProduct/shared").absolutePath
+
       def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties")
       def tempDir = layout.projectDirectory.dir(testsTmpDir.toString())
       jvmArgumentProviders.add(

diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy
@@ -104,10 +104,7 @@ grant codeBase "file:${gradle.worker.jar}" {
 };
 
 grant {
-  // Allow reading gradle worker JAR.
-  permission java.io.FilePermission "${gradle.worker.jar}", "read";
-  // Allow reading from classpath JARs (resources).
-  permission java.io.FilePermission "${gradle.user.home}${/}-", "read";
+  permission java.security.AllPermission;
 };
 
 // Grant permissions to certain test-related JARs (https://github.com/apache/lucene/pull/13146)

diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle
@@ -38,7 +38,6 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
   ])
 }
 
-
 // Skip certain infrastructure tasks that we can't use or don't care about.
 tasks.matching { it.name in [
     // Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception

diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
@@ -16,6 +16,9 @@
  */
 package org.apache.lucene.benchmark.jmh;
 
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.util.VectorUtil;
@@ -49,7 +52,12 @@ static void compressBytes(byte[] raw, byte[] compressed) {
   private float[] floatsB;
   private int expectedhalfByteDotProduct;
 
-  @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
+  private MemorySegment nativeBytesA;
+
+  private MemorySegment nativeBytesB;
+
+  // @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
+  @Param({"768"})
   int size;
 
   @Setup(Level.Iteration)
@@ -84,6 +92,32 @@ public void init() {
       floatsA[i] = random.nextFloat();
       floatsB[i] = random.nextFloat();
     }
+
+    // Off-heap copies of the byte operands, used by the native (C) dot-product benchmarks.
+    // The automatic arena ties the segments' lifetime to garbage collection.
+    Arena offHeap = Arena.ofAuto();
+    nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
+    nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
+    for (int i = 0; i < size; ++i) {
+      nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
+      // Fill the second operand too; previously nativeBytesA was written twice and
+      // nativeBytesB was never populated.
+      nativeBytesB.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
+    }
+  }
+
+  /**
+   * Benchmarks {@code VectorUtil.sveVdot8s} on the two off-heap byte segments of length
+   * {@code size}. Per the commit description this is the SVE-intrinsics int8 dot product.
+   */
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int sveVdot8s() {
+    return VectorUtil.sveVdot8s(nativeBytesA, nativeBytesB, size);
+  }
+
+  /**
+   * Benchmarks {@code VectorUtil.neonVdot8s} on the two off-heap byte segments of length
+   * {@code size}. Per the commit description this is the Neon-intrinsics int8 dot product.
+   */
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int neonVdot8s() {
+    return VectorUtil.neonVdot8s(nativeBytesA, nativeBytesB, size);
+  }
+
+  /**
+   * Benchmarks {@code VectorUtil.dot8s} on the two off-heap byte segments of length
+   * {@code size}. NOTE(review): presumably backed by the plain-loop C {@code dot8s} - confirm
+   * against VectorUtil.
+   */
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int dot8s() {
+    return VectorUtil.dot8s(nativeBytesA, nativeBytesB, size);
   }
 
   @Benchmark

diff --git a/lucene/core/build.gradle b/lucene/core/build.gradle
@@ -14,12 +14,69 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+plugins {
+  id "c"
+}
 
 apply plugin: 'java-library'
+apply plugin: 'c'
 
 description = 'Lucene core library'
+// Native build configuration: compiles the C sources under src/c into the
+// 'dotProduct' native library using per-platform tool chains.
+model {
+  toolChains {
+    // Linux/aarch64: GCC, using the 'gcc10-cc' executable looked up in /usr/bin/.
+    gcc(Gcc) {
+      target("linux_aarch64"){
+        path '/usr/bin/'
+        cCompiler.executable 'gcc10-cc'
+        // -march=native tunes code generation to the machine running the build.
+        cCompiler.withArguments { args ->
+          args << "--shared"
+               << "-O3"
+               << "-march=native"
+               << "-funroll-loops"
+        }
+      }
+    }
+    // macOS/aarch64: Clang with the same optimization flags as the GCC target.
+    clang(Clang) {
+      target("osx_aarch64"){
+        cCompiler.withArguments { args ->
+          args << "--shared"
+               << "-O3"
+               << "-march=native"
+               << "-funroll-loops"
+        }
+      }
+    }
+  }
+
+  components {
+    // The native library component; sources and exported headers both live in src/c.
+    dotProduct(NativeLibrarySpec) {
+      sources {
+        c {
+          source {
+            srcDir 'src/c' // Path to your C source files
+            include "**/*.c"
+          }
+          exportedHeaders {
+            srcDir "src/c"
+            include "**/*.h"
+          }
+        }
+      }
+    }
+  }
+
+}
+
+test.dependsOn 'dotProductSharedLibrary'
 
 dependencies {
   moduleTestImplementation project(':lucene:codecs')
   moduleTestImplementation project(':lucene:test-framework')
 }
+
+// Point the test JVM's java.library.path at the build output directory below.
+// NOTE(review): presumably this is where the dotProduct shared library is written - confirm.
+test {
+  systemProperty(
+          "java.library.path",
+          file("${buildDir}/libs/dotProduct/shared").absolutePath
+  )
+}
diff --git a/lucene/core/src/c/dotProduct.c b/lucene/core/src/c/dotProduct.c
@@ -0,0 +1,143 @@
+// dotProduct.c
+
+#include <stdio.h>
+#include <arm_neon.h>
+
+#ifdef __ARM_ACLE
+#include <arm_acle.h>
+#endif
+
+#if (defined(__ARM_FEATURE_SVE) && !defined(__APPLE__)) 
+#include <arm_sve.h>
+/*
+ * Unrolled and vectorized int8 dotProduct implementation using SVE instructions.
+ *
+ * Returns the sum of vec1[i] * vec2[i] for i in [0, limit). The main loop
+ * consumes four SVE vectors' worth of bytes per iteration into four separate
+ * accumulators; any remaining elements are handled by a scalar tail loop.
+ *
+ * NOTE: Clang 15.0 compiler on Apple M3 Max compiles the code below successfully
+ * with '-march=native+sve' option but throws "Illegal Hardware Instruction" error.
+ * Looks like Apple M3 does not implement SVE and Apple's official documentation
+ * is not explicit about this or at least I could not find it.
+ *
+ */
+int32_t vdot8s_sve(int8_t *vec1, int8_t *vec2, int32_t limit) {
+    int32_t result = 0;
+    int32_t i = 0;
+    // Vectors of 8-bit signed integers
+    svint8_t va1, va2, va3, va4;
+    svint8_t vb1, vb2, vb3, vb4;
+    // Init accumulators
+    svint32_t acc1 = svdup_n_s32(0);
+    svint32_t acc2 = svdup_n_s32(0);
+    svint32_t acc3 = svdup_n_s32(0);
+    svint32_t acc4 = svdup_n_s32(0);
+
+    // Number of 8-bit elements in the SVE vector
+    int32_t vec_length = svcntb();
+
+    // Manually unroll the loop: four full vectors per iteration
+    for (i = 0; i + 4 * vec_length <= limit; i += 4 * vec_length) {
+	// Load vectors into the Z registers which can range from 128-bit to 2048-bit wide
+	// The predicate register - P determines which bytes are active
+	// svptrue_b8() returns a predicate in which every element is true
+	//
+        va1 = svld1_s8(svptrue_b8(), vec1 + i);
+        vb1 = svld1_s8(svptrue_b8(), vec2 + i);
+
+        va2 = svld1_s8(svptrue_b8(), vec1 + i + vec_length);
+        vb2 = svld1_s8(svptrue_b8(), vec2 + i + vec_length);
+
+        va3 = svld1_s8(svptrue_b8(), vec1 + i + 2 * vec_length);
+        vb3 = svld1_s8(svptrue_b8(), vec2 + i + 2 * vec_length);
+
+        va4 = svld1_s8(svptrue_b8(), vec1 + i + 3 * vec_length);
+        vb4 = svld1_s8(svptrue_b8(), vec2 + i + 3 * vec_length);
+
+	// Dot product using SDOT instruction on Z vectors
+	acc1 = svdot_s32(acc1, va1, vb1);
+	acc2 = svdot_s32(acc2, va2, vb2);
+	acc3 = svdot_s32(acc3, va3, vb3);
+	acc4 = svdot_s32(acc4, va4, vb4);
+    }	     
+    // Add corresponding active elements in each of the vectors
+    acc1 = svadd_s32_x(svptrue_b8() , acc1, acc2);
+    acc3 = svadd_s32_x(svptrue_b8() , acc3, acc4);
+    acc1 = svadd_s32_x(svptrue_b8(), acc1, acc3);
+
+    // REDUCE: Add every vector element in target and write result to scalar
+    result = svaddv_s32(svptrue_b8(), acc1);
+
+    // Scalar tail for the elements the unrolled loop did not cover. TODO: Use FMA
+    for (; i < limit; i++) {
+        result += vec1[i] * vec2[i];
+    }
+    return result;
+}
+#endif
+
+// Unrolled and vectorized int8 dotProduct implementation using Neon intrinsics.
+// Returns the sum of vec1[i] * vec2[i] for i in [0, limit): 64 bytes per
+// main-loop iteration across four accumulators, then a scalar tail loop.
+// https://developer.arm.com/architectures/instruction-sets/intrinsics/
+int32_t vdot8s_neon(int8_t* vec1, int8_t* vec2, int32_t limit) {
+    int32_t result = 0;
+    int32x4_t acc1 = vdupq_n_s32(0);
+    int32x4_t acc2 = vdupq_n_s32(0);
+    int32x4_t acc3 = vdupq_n_s32(0);
+    int32x4_t acc4 = vdupq_n_s32(0);
+    int32_t i = 0;
+    int8x16_t va1, va2, va3, va4;
+    int8x16_t vb1, vb2, vb3, vb4;
+
+    // Manually unrolled: four 16-byte vectors (64 bytes) from each input per iteration
+    for (; i + 64 <= limit; i += 64 ) {
+        // Read into 8 (bit) x 16 (values) vector
+        va1 = vld1q_s8((const void*) (vec1 + i));
+        vb1 = vld1q_s8((const void*) (vec2 + i));
+
+        va2 = vld1q_s8((const void*) (vec1 + i + 16));
+        vb2 = vld1q_s8((const void*) (vec2 + i + 16));
+
+        va3 = vld1q_s8((const void*) (vec1 + i + 32));
+        vb3 = vld1q_s8((const void*) (vec2 + i + 32));
+
+        va4 = vld1q_s8((const void*) (vec1 + i + 48));
+        vb4 = vld1q_s8((const void*) (vec2 + i + 48));
+
+	// Dot product using SDOT instruction
+	// GCC 7.3 does not define the intrinsic below so we get compile time error.
+	acc1 = vdotq_s32(acc1, va1, vb1);
+	acc2 = vdotq_s32(acc2, va2, vb2);
+	acc3 = vdotq_s32(acc3, va3, vb3);
+	acc4 = vdotq_s32(acc4, va4, vb4);
+    }
+    // Add corresponding elements in each of the vectors
+    acc1 = vaddq_s32(acc1, acc2);
+    acc3 = vaddq_s32(acc3, acc4);
+    acc1 = vaddq_s32(acc1, acc3);
+
+    // REDUCE:  Add every vector element in target and write result to scalar
+    result += vaddvq_s32(acc1);
+
+    // Scalar tail for the elements the unrolled loop did not cover. TODO: Use FMA
+    for (; i < limit; i++) {
+        result += vec1[i] * vec2[i];
+    }
+    return result;
+}
+
+// Plain-loop int8 dotProduct: returns the sum of vec1[i] * vec2[i] for i in [0, limit).
+// No intrinsics are used; vectorization is left to the compiler.
+int32_t dot8s(int8_t* vec1, int8_t* vec2, int32_t limit) {
+    int32_t result = 0;
+    // Clang-specific hint requesting vectorization and unrolling of the loop below.
+    #pragma clang loop vectorize(assume_safety) unroll(enable)
+    for (int32_t i = 0; i < limit; i++) {
+        result += vec1[i] * vec2[i];
+    }
+    return result;
+}
+
+/*
+int main(int argc, const char* arrgs[]) {
+    int8_t a[128];
+    int8_t b[128];
+    for (int i =0; i < 128; i++) {
+	    a[i] = 2;
+	    b[i] = 3;
+    }
+    printf("Sum (Vectorized - SVE) = %d\n", vdot8s_sve(&a, &b, 128));
+    printf("Sum (Vectorized - NEON) = %d\n", vdot8s_neon(&a, &b, 128));
+    printf("Sum (Scalar) = %d\n", dot8s(&a, &b, 128));
+}*/
+
diff --git a/lucene/core/src/c/dotProduct.h b/lucene/core/src/c/dotProduct.h
@@ -0,0 +1,4 @@
+
+int32_t vdot8s_sve(int8_t* vec1[], int8_t* vec2, int32_t limit);
+int32_t vdot8s_neon(int8_t* vec1[], int8_t* vec2[], int32_t limit);
+int32_t dot8s(int8_t* a, int8_t* b, int32_t limit);