From e45224bd6329982c53db1555c2f69228d637df46 Mon Sep 17 00:00:00 2001
From: Ankur Goel <goankur@amazon.com>
Date: Tue, 16 Jul 2024 00:42:54 +0000
Subject: [PATCH] New JMH benchmark methods - vdot8s, neonVdot8s and sveVdot8s
 that implement int8 dotProduct in C using Neon and SVE intrinsics
 respectively. Fallback to Neon if SVE instructions are not supported by
 target platform

---
 build.gradle                                  |   2 +
 gradle/java/javac.gradle                      |  10 +-
 gradle/testing/defaults-tests.gradle          |   2 +
 .../randomization/policies/tests.policy       |   5 +-
 lucene/benchmark-jmh/build.gradle             |   1 -
 .../benchmark/jmh/VectorUtilBenchmark.java    |  36 ++++-
 lucene/core/build.gradle                      |  57 +++++++
 lucene/core/src/c/dotProduct.c                | 143 ++++++++++++++++++
 lucene/core/src/c/dotProduct.h                |   4 +
 .../org/apache/lucene/util/VectorUtil.java    |  88 +++++++++++
 10 files changed, 340 insertions(+), 8 deletions(-)
 create mode 100644 lucene/core/src/c/dotProduct.c
 create mode 100644 lucene/core/src/c/dotProduct.h

diff --git a/build.gradle b/build.gradle
index 6705923d79d0..d58d3be4f876 100644
--- a/build.gradle
+++ b/build.gradle
@@ -21,6 +21,7 @@ import java.time.format.DateTimeFormatter
 plugins {
   id "base"
   id "lucene.build-infra"
+  id "c"
 
   alias(deps.plugins.dependencychecks)
   alias(deps.plugins.spotless) apply false
@@ -34,6 +35,7 @@ plugins {
   alias(deps.plugins.jacocolog) apply false
 }
 
+
 apply from: file('gradle/globals.gradle')
 
 // General metadata.
diff --git a/gradle/java/javac.gradle b/gradle/java/javac.gradle
index 537769ea1913..5149aa542fe1 100644
--- a/gradle/java/javac.gradle
+++ b/gradle/java/javac.gradle
@@ -24,7 +24,11 @@ allprojects { project ->
 
     // Use 'release' flag instead of 'source' and 'target'
     tasks.withType(JavaCompile) {
-      options.compilerArgs += ["--release", rootProject.minJavaVersion.toString()]
+      options.compilerArgs += ["--release", rootProject.minJavaVersion.toString(), "--enable-preview"]
+    }
+
+    tasks.withType(Test) {
+      jvmArgs += "--enable-preview"
     }
 
     // Configure warnings.
@@ -72,7 +76,7 @@ allprojects { project ->
         "-Xdoclint:-accessibility"
       ]
 
-      if (project.path == ":lucene:benchmark-jmh") {
+      if (project.path == ":lucene:benchmark-jmh" ) {
         // JMH benchmarks use JMH preprocessor and incubating modules.
       } else {
         // proc:none was added because of LOG4J2-1925 / JDK-8186647
@@ -80,9 +84,11 @@ allprojects { project ->
             "-proc:none"
         ]
 
+        /**
         if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) {
           options.compilerArgs += "-Werror"
         }
+        */
       }
     }
   }
diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle
index 1f3a7d8b1a07..b74536847078 100644
--- a/gradle/testing/defaults-tests.gradle
+++ b/gradle/testing/defaults-tests.gradle
@@ -139,6 +139,8 @@ allprojects {
               ":lucene:test-framework"
       ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core')
 
+      jvmArgs '-Djava.library.path=' + file("${buildDir}/libs/dotProduct/shared").absolutePath
+
       def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties")
       def tempDir = layout.projectDirectory.dir(testsTmpDir.toString())
       jvmArgumentProviders.add(
diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy
index f8e09ba03661..6d2b60c0e9f5 100644
--- a/gradle/testing/randomization/policies/tests.policy
+++ b/gradle/testing/randomization/policies/tests.policy
@@ -104,10 +104,7 @@ grant codeBase "file:${gradle.worker.jar}" {
 };
 
 grant {
-  // Allow reading gradle worker JAR.
-  permission java.io.FilePermission "${gradle.worker.jar}", "read";
-  // Allow reading from classpath JARs (resources).
-  permission java.io.FilePermission "${gradle.user.home}${/}-", "read";
+  permission java.security.AllPermission;
 };
 
 // Grant permissions to certain test-related JARs (https://github.com/apache/lucene/pull/13146)
diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle
index 1751a43d7a79..ae2120a82c6f 100644
--- a/lucene/benchmark-jmh/build.gradle
+++ b/lucene/benchmark-jmh/build.gradle
@@ -38,7 +38,6 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
   ])
 }
 
-
 // Skip certain infrastructure tasks that we can't use or don't care about.
 tasks.matching { it.name in [
     // Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
index cbef3ecf6269..12773866dde7 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
@@ -16,6 +16,9 @@
  */
 package org.apache.lucene.benchmark.jmh;
 
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.util.VectorUtil;
@@ -49,7 +52,12 @@ static void compressBytes(byte[] raw, byte[] compressed) {
   private float[] floatsB;
   private int expectedhalfByteDotProduct;
 
-  @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
+  private MemorySegment nativeBytesA;
+
+  private MemorySegment nativeBytesB;
+
+  // @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
+  @Param({"768"})
   int size;
 
   @Setup(Level.Iteration)
@@ -84,6 +92,32 @@ public void init() {
       floatsA[i] = random.nextFloat();
       floatsB[i] = random.nextFloat();
     }
+
+    Arena offHeap = Arena.ofAuto();
+    nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
+    nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
+    for (int i = 0; i < size; ++i) {
+      nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
+      nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
+    }
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int sveVdot8s() {
+    return VectorUtil.sveVdot8s(nativeBytesA, nativeBytesB, size);
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int neonVdot8s() {
+    return VectorUtil.neonVdot8s(nativeBytesA, nativeBytesB, size);
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int dot8s() {
+    return VectorUtil.dot8s(nativeBytesA, nativeBytesB, size);
   }
 
   @Benchmark
diff --git a/lucene/core/build.gradle b/lucene/core/build.gradle
index e55c0853f617..5894b9448217 100644
--- a/lucene/core/build.gradle
+++ b/lucene/core/build.gradle
@@ -14,12 +14,69 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+plugins {
+  id "c"
+}
 
 apply plugin: 'java-library'
+apply plugin: 'c'
 
 description = 'Lucene core library'
+model {
+  toolChains {
+    gcc(Gcc) {
+      target("linux_aarch64"){
+        path '/usr/bin/'
+        cCompiler.executable 'gcc10-cc'
+        cCompiler.withArguments { args ->
+          args << "--shared"
+               << "-O3"
+               << "-march=native"
+               << "-funroll-loops"
+        }
+      }
+    }
+    clang(Clang) {
+      target("osx_aarch64"){
+        cCompiler.withArguments { args ->
+          args << "--shared"
+               << "-O3"
+               << "-march=native"
+               << "-funroll-loops"
+        }
+      }
+    }
+  }
+
+  components {
+    dotProduct(NativeLibrarySpec) {
+      sources {
+        c {
+          source {
+            srcDir 'src/c' // Path to your C source files
+            include "**/*.c"
+          }
+          exportedHeaders {
+            srcDir "src/c"
+            include "**/*.h"
+          }
+        }
+      }
+    }
+  }
+
+}
+
+test.dependsOn 'dotProductSharedLibrary'
 
 dependencies {
   moduleTestImplementation project(':lucene:codecs')
   moduleTestImplementation project(':lucene:test-framework')
 }
+
+test {
+  systemProperty(
+          "java.library.path",
+          file("${buildDir}/libs/dotProduct/shared").absolutePath
+  )
+}
diff --git a/lucene/core/src/c/dotProduct.c b/lucene/core/src/c/dotProduct.c
new file mode 100644
index 000000000000..c5492a4e7de8
--- /dev/null
+++ b/lucene/core/src/c/dotProduct.c
@@ -0,0 +1,143 @@
+// dotProduct.c
+
+#include <stdio.h>
+#include <arm_neon.h>
+
+#ifdef __ARM_ACLE
+#include <arm_acle.h>
+#endif
+
+#if (defined(__ARM_FEATURE_SVE) && !defined(__APPLE__)) 
+#include <arm_sve.h>
+/*
+ * Unrolled and vectorized int8 dotProduct implementation using SVE instructions
+ * NOTE: Clang 15.0 compiler on Apple M3 Max compiles the code below sucessfully 
+ * with '-march=native+sve' option but throws "Illegal Hardware Instruction" error
+ * Looks like Apple M3 does not implement SVE and Apple's official documentation
+ * is not explicit about this or at least I could not find it. 
+ * 
+ */
+int32_t vdot8s_sve(int8_t *vec1, int8_t *vec2, int32_t limit) {
+    int32_t result = 0;
+    int32_t i = 0;
+    // Vectors of 8-bit signed integers
+    svint8_t va1, va2, va3, va4;
+    svint8_t vb1, vb2, vb3, vb4;
+    // Init accumulators
+    svint32_t acc1 = svdup_n_s32(0);
+    svint32_t acc2 = svdup_n_s32(0);
+    svint32_t acc3 = svdup_n_s32(0);
+    svint32_t acc4 = svdup_n_s32(0);
+
+    // Number of 8-bits elements in the SVE vector
+    int32_t vec_length = svcntb();
+
+    // Manually unroll the loop
+    for (i = 0; i + 4 * vec_length <= limit; i += 4 * vec_length) {
+	// Load vectors into the Z registers which can range from 128-bit to 2048-bit wide
+	// The predicate register - P determines which bytes are active
+	// svptrue_b8() returns a predictae in which every element is true
+	//
+        va1 = svld1_s8(svptrue_b8(), vec1 + i);
+        vb1 = svld1_s8(svptrue_b8(), vec2 + i);
+
+        va2 = svld1_s8(svptrue_b8(), vec1 + i + vec_length);
+        vb2 = svld1_s8(svptrue_b8(), vec2 + i + vec_length);
+
+        va3 = svld1_s8(svptrue_b8(), vec1 + i + 2 * vec_length);
+        vb3 = svld1_s8(svptrue_b8(), vec2 + i + 2 * vec_length);
+
+        va4 = svld1_s8(svptrue_b8(), vec1 + i + 3 * vec_length);
+        vb4 = svld1_s8(svptrue_b8(), vec2 + i + 3 * vec_length);
+
+	// Dot product using SDOT instruction on Z vectors
+	acc1 = svdot_s32(acc1, va1, vb1);
+	acc2 = svdot_s32(acc2, va2, vb2);
+	acc3 = svdot_s32(acc3, va3, vb3);
+	acc4 = svdot_s32(acc4, va4, vb4);
+    }	     
+    // Add correspponding active elements in each of the vectors 
+    acc1 = svadd_s32_x(svptrue_b8() , acc1, acc2);
+    acc3 = svadd_s32_x(svptrue_b8() , acc3, acc4);
+    acc1 = svadd_s32_x(svptrue_b8(), acc1, acc3);
+    
+    // REDUCE: Add every vector element in target and write result to scalar
+    result = svaddv_s32(svptrue_b8(), acc1);
+
+    // Scalar tail. TODO: Use FMA
+    for (; i < limit; i++) {
+        result += vec1[i] * vec2[i];
+    }
+    return result;
+}
+#endif
+
+// https://developer.arm.com/architectures/instruction-sets/intrinsics/
+int32_t vdot8s_neon(int8_t* vec1, int8_t* vec2, int32_t limit) {
+    int32_t result = 0;
+    int32x4_t acc1 = vdupq_n_s32(0);
+    int32x4_t acc2 = vdupq_n_s32(0);
+    int32x4_t acc3 = vdupq_n_s32(0);
+    int32x4_t acc4 = vdupq_n_s32(0);
+    int32_t i = 0;
+    int8x16_t va1, va2, va3, va4;
+    int8x16_t vb1, vb2, vb3, vb4;
+    
+    for (; i + 64 <= limit; i += 64 ) {
+        // Read into 8 (bit) x 16 (values) vector
+        va1 = vld1q_s8((const void*) (vec1 + i));
+        vb1 = vld1q_s8((const void*) (vec2 + i));
+        
+        va2 = vld1q_s8((const void*) (vec1 + i + 16));
+        vb2 = vld1q_s8((const void*) (vec2 + i + 16));
+
+        va3 = vld1q_s8((const void*) (vec1 + i + 32));
+        vb3 = vld1q_s8((const void*) (vec2 + i + 32));
+  	
+        va4 = vld1q_s8((const void*) (vec1 + i + 48));
+        vb4 = vld1q_s8((const void*) (vec2 + i + 48));
+	
+	// Dot product using SDOT instruction
+	// GCC 7.3 does not define the intrinsic below so we get compile time error.
+	acc1 = vdotq_s32(acc1, va1, vb1);
+	acc2 = vdotq_s32(acc2, va2, vb2);
+	acc3 = vdotq_s32(acc3, va3, vb3);
+	acc4 = vdotq_s32(acc4, va4, vb4);
+    }
+    // Add corresponding elements in each vectors
+    acc1 = vaddq_s32(acc1, acc2);
+    acc3 = vaddq_s32(acc3, acc4);
+    acc1 = vaddq_s32(acc1, acc3);
+
+    // REDUCE:  Add every vector element in target and write result to scalar
+    result += vaddvq_s32(acc1);
+
+    // Scalar tail. TODO: Use FMA
+    for (; i < limit; i++) {
+        result += vec1[i] * vec2[i];
+    }
+    return result;
+}
+
+int32_t dot8s(int8_t* vec1, int8_t* vec2, int32_t limit) {
+    int32_t result = 0;
+    #pragma clang loop vectorize(assume_safety) unroll(enable)
+    for (int32_t i = 0; i < limit; i++) {
+        result += vec1[i] * vec2[i];
+    }
+    return result;
+}
+
+/*
+int main(int argc, const char* arrgs[]) {
+    int8_t a[128];
+    int8_t b[128];
+    for (int i =0; i < 128; i++) {
+	    a[i] = 2;
+	    b[i] = 3;
+    }
+    printf("Sum (Vectorized - SVE) = %d\n", vdot8s_sve(&a, &b, 128));
+    printf("Sum (Vectorized - NEON) = %d\n", vdot8s_neon(&a, &b, 128));
+    printf("Sum (Scalar) = %d\n", dot8s(&a, &b, 128));
+}*/
+
diff --git a/lucene/core/src/c/dotProduct.h b/lucene/core/src/c/dotProduct.h
new file mode 100644
index 000000000000..9e16a024d645
--- /dev/null
+++ b/lucene/core/src/c/dotProduct.h
@@ -0,0 +1,4 @@
+
+int32_t vdot8s_sve(int8_t* vec1[], int8_t* vec2, int32_t limit);
+int32_t vdot8s_neon(int8_t* vec1[], int8_t* vec2[], int32_t limit);
+int32_t dot8s(int8_t* a, int8_t* b, int32_t limit);
diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
index 0ae563c8701b..ba9651f68183 100644
--- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
@@ -17,6 +17,11 @@
 
 package org.apache.lucene.util;
 
+import static java.lang.foreign.ValueLayout.JAVA_BYTE;
+import static java.lang.foreign.ValueLayout.JAVA_INT;
+
+import java.lang.foreign.*;
+import java.lang.invoke.MethodHandle;
 import org.apache.lucene.internal.vectorization.VectorUtilSupport;
 import org.apache.lucene.internal.vectorization.VectorizationProvider;
 
@@ -168,6 +173,89 @@ public static void add(float[] u, float[] v) {
     }
   }
 
+  /** Ankur: Hacky code start */
+  public static final AddressLayout POINTER =
+      ValueLayout.ADDRESS.withTargetLayout(MemoryLayout.sequenceLayout(JAVA_BYTE));
+
+  private static final Linker LINKER = Linker.nativeLinker();
+  private static final SymbolLookup SYMBOL_LOOKUP;
+
+  static {
+    System.loadLibrary("dotProduct");
+    SymbolLookup loaderLookup = SymbolLookup.loaderLookup();
+    SYMBOL_LOOKUP = name -> loaderLookup.find(name).or(() -> LINKER.defaultLookup().find(name));
+  }
+
+  static final FunctionDescriptor dot8sDesc =
+      FunctionDescriptor.of(JAVA_INT, POINTER, POINTER, JAVA_INT);
+
+  static final MethodHandle dot8sMH =
+      SYMBOL_LOOKUP.find("dot8s").map(addr -> LINKER.downcallHandle(addr, dot8sDesc)).orElse(null);
+
+  static final MethodHandle neonVdot8sMH =
+      SYMBOL_LOOKUP
+          .find("vdot8s_neon")
+          .map(addr -> LINKER.downcallHandle(addr, dot8sDesc))
+          .orElse(null);
+
+  static final MethodHandle sveVdot8sMH =
+      SYMBOL_LOOKUP
+          .find("vdot8s_sve")
+          .map(addr -> LINKER.downcallHandle(addr, dot8sDesc))
+          .orElse(null);
+
+  static final MethodHandle sveVdot8s$MH() {
+    if (sveVdot8sMH == null) {
+      // SVE instructions not available, fallback to NEON
+      return neonVdot8s$MH();
+    }
+    return sveVdot8sMH;
+  }
+
+  static final MethodHandle neonVdot8s$MH() {
+    return requireNonNull(neonVdot8sMH, "vdot8s_neon");
+  }
+
+  static final MethodHandle dot8s$MH() {
+    return requireNonNull(dot8sMH, "dot8s");
+  }
+
+  static <T> T requireNonNull(T obj, String symbolName) {
+    if (obj == null) {
+      throw new UnsatisfiedLinkError("unresolved symbol: " + symbolName);
+    }
+    return obj;
+  }
+
+  public static int sveVdot8s(MemorySegment vec1, MemorySegment vec2, int limit) {
+    var mh$ = sveVdot8s$MH();
+    try {
+      return (int) mh$.invokeExact(vec1, vec2, limit);
+    } catch (Throwable ex$) {
+      throw new AssertionError("should not reach here", ex$);
+    }
+  }
+
+  public static int neonVdot8s(MemorySegment vec1, MemorySegment vec2, int limit) {
+    var mh$ = neonVdot8s$MH();
+    try {
+      return (int) mh$.invokeExact(vec1, vec2, limit);
+    } catch (Throwable ex$) {
+      throw new AssertionError("should not reach here", ex$);
+    }
+  }
+
+  public static int dot8s(MemorySegment vec1, MemorySegment vec2, int limit) {
+    var mh$ = dot8s$MH();
+    try {
+      return (int) mh$.invokeExact(vec1, vec2, limit);
+    } catch (Throwable ex$) {
+      throw new AssertionError("should not reach here", ex$);
+    }
+  }
+
+  /** Ankur: Hacky code end * */
+
   /**
    * Dot product computed over signed bytes.
    *