From e45224bd6329982c53db1555c2f69228d637df46 Mon Sep 17 00:00:00 2001 From: Ankur Goel Date: Tue, 16 Jul 2024 00:42:54 +0000 Subject: [PATCH] New JMH benchmark methods - vdot8s, neonVdot8s and sveVdot8s that implement int8 dotProduct in C using Neon and SVE intrinsics respectively. Fallback to Neon if SVE instructions are not supported by target platform --- build.gradle | 2 + gradle/java/javac.gradle | 10 +- gradle/testing/defaults-tests.gradle | 2 + .../randomization/policies/tests.policy | 5 +- lucene/benchmark-jmh/build.gradle | 1 - .../benchmark/jmh/VectorUtilBenchmark.java | 36 ++++- lucene/core/build.gradle | 57 +++++++ lucene/core/src/c/dotProduct.c | 143 ++++++++++++++++++ lucene/core/src/c/dotProduct.h | 4 + .../org/apache/lucene/util/VectorUtil.java | 88 +++++++++++ 10 files changed, 340 insertions(+), 8 deletions(-) create mode 100644 lucene/core/src/c/dotProduct.c create mode 100644 lucene/core/src/c/dotProduct.h diff --git a/build.gradle b/build.gradle index 6705923d79d0..d58d3be4f876 100644 --- a/build.gradle +++ b/build.gradle @@ -21,6 +21,7 @@ import java.time.format.DateTimeFormatter plugins { id "base" id "lucene.build-infra" + id "c" alias(deps.plugins.dependencychecks) alias(deps.plugins.spotless) apply false @@ -34,6 +35,7 @@ plugins { alias(deps.plugins.jacocolog) apply false } + apply from: file('gradle/globals.gradle') // General metadata. diff --git a/gradle/java/javac.gradle b/gradle/java/javac.gradle index 537769ea1913..5149aa542fe1 100644 --- a/gradle/java/javac.gradle +++ b/gradle/java/javac.gradle @@ -24,7 +24,11 @@ allprojects { project -> // Use 'release' flag instead of 'source' and 'target' tasks.withType(JavaCompile) { - options.compilerArgs += ["--release", rootProject.minJavaVersion.toString()] + options.compilerArgs += ["--release", rootProject.minJavaVersion.toString(), "--enable-preview"] + } + + tasks.withType(Test) { + jvmArgs += "--enable-preview" } // Configure warnings. @@ -72,7 +76,7 @@ allprojects { project -> "-Xdoclint:-accessibility" ] - if (project.path == ":lucene:benchmark-jmh") { + if (project.path == ":lucene:benchmark-jmh" ) { // JMH benchmarks use JMH preprocessor and incubating modules. } else { // proc:none was added because of LOG4J2-1925 / JDK-8186647 @@ -80,9 +84,11 @@ allprojects { project -> "-proc:none" ] + /** if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) { options.compilerArgs += "-Werror" } + */ } } } diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 1f3a7d8b1a07..b74536847078 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -139,6 +139,8 @@ allprojects { ":lucene:test-framework" ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core') + jvmArgs '-Djava.library.path=' + file("${buildDir}/libs/dotProduct/shared").absolutePath + def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) jvmArgumentProviders.add( diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index f8e09ba03661..6d2b60c0e9f5 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -104,10 +104,7 @@ grant codeBase "file:${gradle.worker.jar}" { }; grant { - // Allow reading gradle worker JAR. - permission java.io.FilePermission "${gradle.worker.jar}", "read"; - // Allow reading from classpath JARs (resources). - permission java.io.FilePermission "${gradle.user.home}${/}-", "read"; + permission java.security.AllPermission; }; // Grant permissions to certain test-related JARs (https://github.com/apache/lucene/pull/13146) diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle index 1751a43d7a79..ae2120a82c6f 100644 --- a/lucene/benchmark-jmh/build.gradle +++ b/lucene/benchmark-jmh/build.gradle @@ -38,7 +38,6 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach { ]) } - // Skip certain infrastructure tasks that we can't use or don't care about. tasks.matching { it.name in [ // Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java index cbef3ecf6269..12773866dde7 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java @@ -16,6 +16,9 @@ */ package org.apache.lucene.benchmark.jmh; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import org.apache.lucene.util.VectorUtil; @@ -49,7 +52,12 @@ static void compressBytes(byte[] raw, byte[] compressed) { private float[] floatsB; private int expectedhalfByteDotProduct; - @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) + private MemorySegment nativeBytesA; + + private MemorySegment nativeBytesB; + + // @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) + @Param({"768"}) int size; @Setup(Level.Iteration) @@ -84,6 +92,32 @@ public void init() { floatsA[i] = random.nextFloat(); floatsB[i] = random.nextFloat(); } + + Arena offHeap = Arena.ofAuto(); + nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); + nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); + for (int i = 0; i < size; ++i) { + nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); + nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); + } + } + + @Benchmark + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + public int sveVdot8s() { + return VectorUtil.sveVdot8s(nativeBytesA, nativeBytesB, size); + } + + @Benchmark + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + public int neonVdot8s() { + return VectorUtil.neonVdot8s(nativeBytesA, nativeBytesB, size); + } + + @Benchmark + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + public int dot8s() { + return VectorUtil.dot8s(nativeBytesA, nativeBytesB, size); } @Benchmark diff --git a/lucene/core/build.gradle b/lucene/core/build.gradle index e55c0853f617..5894b9448217 100644 --- a/lucene/core/build.gradle +++ b/lucene/core/build.gradle @@ -14,12 +14,69 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +plugins { + id "c" +} apply plugin: 'java-library' +apply plugin: 'c' description = 'Lucene core library' +model { + toolChains { + gcc(Gcc) { + target("linux_aarch64"){ + path '/usr/bin/' + cCompiler.executable 'gcc10-cc' + cCompiler.withArguments { args -> + args << "--shared" + << "-O3" + << "-march=native" + << "-funroll-loops" + } + } + } + clang(Clang) { + target("osx_aarch64"){ + cCompiler.withArguments { args -> + args << "--shared" + << "-O3" + << "-march=native" + << "-funroll-loops" + } + } + } + } + + components { + dotProduct(NativeLibrarySpec) { + sources { + c { + source { + srcDir 'src/c' // Path to your C source files + include "**/*.c" + } + exportedHeaders { + srcDir "src/c" + include "**/*.h" + } + } + } + } + } + +} + +test.dependsOn 'dotProductSharedLibrary' dependencies { moduleTestImplementation project(':lucene:codecs') moduleTestImplementation project(':lucene:test-framework') } + +test { + systemProperty( + "java.library.path", + file("${buildDir}/libs/dotProduct/shared").absolutePath + ) +} diff --git a/lucene/core/src/c/dotProduct.c b/lucene/core/src/c/dotProduct.c new file mode 100644 index 000000000000..c5492a4e7de8 --- /dev/null +++ b/lucene/core/src/c/dotProduct.c @@ -0,0 +1,143 @@ +// dotProduct.c + +#include +#include + +#ifdef __ARM_ACLE +#include +#endif + +#if (defined(__ARM_FEATURE_SVE) && !defined(__APPLE__)) +#include +/* + * Unrolled and vectorized int8 dotProduct implementation using SVE instructions + * NOTE: Clang 15.0 compiler on Apple M3 Max compiles the code below sucessfully + * with '-march=native+sve' option but throws "Illegal Hardware Instruction" error + * Looks like Apple M3 does not implement SVE and Apple's official documentation + * is not explicit about this or at least I could not find it. + * + */ +int32_t vdot8s_sve(int8_t *vec1, int8_t *vec2, int32_t limit) { + int32_t result = 0; + int32_t i = 0; + // Vectors of 8-bit signed integers + svint8_t va1, va2, va3, va4; + svint8_t vb1, vb2, vb3, vb4; + // Init accumulators + svint32_t acc1 = svdup_n_s32(0); + svint32_t acc2 = svdup_n_s32(0); + svint32_t acc3 = svdup_n_s32(0); + svint32_t acc4 = svdup_n_s32(0); + + // Number of 8-bits elements in the SVE vector + int32_t vec_length = svcntb(); + + // Manually unroll the loop + for (i = 0; i + 4 * vec_length <= limit; i += 4 * vec_length) { + // Load vectors into the Z registers which can range from 128-bit to 2048-bit wide + // The predicate register - P determines which bytes are active + // svptrue_b8() returns a predictae in which every element is true + // + va1 = svld1_s8(svptrue_b8(), vec1 + i); + vb1 = svld1_s8(svptrue_b8(), vec2 + i); + + va2 = svld1_s8(svptrue_b8(), vec1 + i + vec_length); + vb2 = svld1_s8(svptrue_b8(), vec2 + i + vec_length); + + va3 = svld1_s8(svptrue_b8(), vec1 + i + 2 * vec_length); + vb3 = svld1_s8(svptrue_b8(), vec2 + i + 2 * vec_length); + + va4 = svld1_s8(svptrue_b8(), vec1 + i + 3 * vec_length); + vb4 = svld1_s8(svptrue_b8(), vec2 + i + 3 * vec_length); + + // Dot product using SDOT instruction on Z vectors + acc1 = svdot_s32(acc1, va1, vb1); + acc2 = svdot_s32(acc2, va2, vb2); + acc3 = svdot_s32(acc3, va3, vb3); + acc4 = svdot_s32(acc4, va4, vb4); + } + // Add correspponding active elements in each of the vectors + acc1 = svadd_s32_x(svptrue_b8() , acc1, acc2); + acc3 = svadd_s32_x(svptrue_b8() , acc3, acc4); + acc1 = svadd_s32_x(svptrue_b8(), acc1, acc3); + + // REDUCE: Add every vector element in target and write result to scalar + result = svaddv_s32(svptrue_b8(), acc1); + + // Scalar tail. TODO: Use FMA + for (; i < limit; i++) { + result += vec1[i] * vec2[i]; + } + return result; +} +#endif + +// https://developer.arm.com/architectures/instruction-sets/intrinsics/ +int32_t vdot8s_neon(int8_t* vec1, int8_t* vec2, int32_t limit) { + int32_t result = 0; + int32x4_t acc1 = vdupq_n_s32(0); + int32x4_t acc2 = vdupq_n_s32(0); + int32x4_t acc3 = vdupq_n_s32(0); + int32x4_t acc4 = vdupq_n_s32(0); + int32_t i = 0; + int8x16_t va1, va2, va3, va4; + int8x16_t vb1, vb2, vb3, vb4; + + for (; i + 64 <= limit; i += 64 ) { + // Read into 8 (bit) x 16 (values) vector + va1 = vld1q_s8((const void*) (vec1 + i)); + vb1 = vld1q_s8((const void*) (vec2 + i)); + + va2 = vld1q_s8((const void*) (vec1 + i + 16)); + vb2 = vld1q_s8((const void*) (vec2 + i + 16)); + + va3 = vld1q_s8((const void*) (vec1 + i + 32)); + vb3 = vld1q_s8((const void*) (vec2 + i + 32)); + + va4 = vld1q_s8((const void*) (vec1 + i + 48)); + vb4 = vld1q_s8((const void*) (vec2 + i + 48)); + + // Dot product using SDOT instruction + // GCC 7.3 does not define the intrinsic below so we get compile time error. + acc1 = vdotq_s32(acc1, va1, vb1); + acc2 = vdotq_s32(acc2, va2, vb2); + acc3 = vdotq_s32(acc3, va3, vb3); + acc4 = vdotq_s32(acc4, va4, vb4); + } + // Add corresponding elements in each vectors + acc1 = vaddq_s32(acc1, acc2); + acc3 = vaddq_s32(acc3, acc4); + acc1 = vaddq_s32(acc1, acc3); + + // REDUCE: Add every vector element in target and write result to scalar + result += vaddvq_s32(acc1); + + // Scalar tail. TODO: Use FMA + for (; i < limit; i++) { + result += vec1[i] * vec2[i]; + } + return result; +} + +int32_t dot8s(int8_t* vec1, int8_t* vec2, int32_t limit) { + int32_t result = 0; + #pragma clang loop vectorize(assume_safety) unroll(enable) + for (int32_t i = 0; i < limit; i++) { + result += vec1[i] * vec2[i]; + } + return result; +} + +/* +int main(int argc, const char* arrgs[]) { + int8_t a[128]; + int8_t b[128]; + for (int i =0; i < 128; i++) { + a[i] = 2; + b[i] = 3; + } + printf("Sum (Vectorized - SVE) = %d\n", vdot8s_sve(&a, &b, 128)); + printf("Sum (Vectorized - NEON) = %d\n", vdot8s_neon(&a, &b, 128)); + printf("Sum (Scalar) = %d\n", dot8s(&a, &b, 128)); +}*/ + diff --git a/lucene/core/src/c/dotProduct.h b/lucene/core/src/c/dotProduct.h new file mode 100644 index 000000000000..9e16a024d645 --- /dev/null +++ b/lucene/core/src/c/dotProduct.h @@ -0,0 +1,4 @@ + +int32_t vdot8s_sve(int8_t* vec1[], int8_t* vec2, int32_t limit); +int32_t vdot8s_neon(int8_t* vec1[], int8_t* vec2[], int32_t limit); +int32_t dot8s(int8_t* a, int8_t* b, int32_t limit); diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java index 0ae563c8701b..ba9651f68183 100644 --- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java @@ -17,6 +17,11 @@ package org.apache.lucene.util; +import static java.lang.foreign.ValueLayout.JAVA_BYTE; +import static java.lang.foreign.ValueLayout.JAVA_INT; + +import java.lang.foreign.*; +import java.lang.invoke.MethodHandle; import org.apache.lucene.internal.vectorization.VectorUtilSupport; import org.apache.lucene.internal.vectorization.VectorizationProvider; @@ -168,6 +173,89 @@ public static void add(float[] u, float[] v) { } } + /** Ankur: Hacky code start */ + public static final AddressLayout POINTER = + ValueLayout.ADDRESS.withTargetLayout(MemoryLayout.sequenceLayout(JAVA_BYTE)); + + private static final Linker LINKER = Linker.nativeLinker(); + private static final SymbolLookup SYMBOL_LOOKUP; + + static { + System.loadLibrary("dotProduct"); + SymbolLookup loaderLookup = SymbolLookup.loaderLookup(); + SYMBOL_LOOKUP = name -> loaderLookup.find(name).or(() -> LINKER.defaultLookup().find(name)); + } + + static final FunctionDescriptor dot8sDesc = + FunctionDescriptor.of(JAVA_INT, POINTER, POINTER, JAVA_INT); + + static final MethodHandle dot8sMH = + SYMBOL_LOOKUP.find("dot8s").map(addr -> LINKER.downcallHandle(addr, dot8sDesc)).orElse(null); + + static final MethodHandle neonVdot8sMH = + SYMBOL_LOOKUP + .find("vdot8s_neon") + .map(addr -> LINKER.downcallHandle(addr, dot8sDesc)) + .orElse(null); + + static final MethodHandle sveVdot8sMH = + SYMBOL_LOOKUP + .find("vdot8s_sve") + .map(addr -> LINKER.downcallHandle(addr, dot8sDesc)) + .orElse(null); + + static final MethodHandle sveVdot8s$MH() { + if (sveVdot8sMH == null) { + // SVE instructions not available, fallback to NEON + return neonVdot8s$MH(); + } + return sveVdot8sMH; + } + + static final MethodHandle neonVdot8s$MH() { + return requireNonNull(neonVdot8sMH, "vdot8s_neon"); + } + + static final MethodHandle dot8s$MH() { + return requireNonNull(dot8sMH, "dot8s"); + } + + static T requireNonNull(T obj, String symbolName) { + if (obj == null) { + throw new UnsatisfiedLinkError("unresolved symbol: " + symbolName); + } + return obj; + } + + public static int sveVdot8s(MemorySegment vec1, MemorySegment vec2, int limit) { + var mh$ = sveVdot8s$MH(); + try { + return (int) mh$.invokeExact(vec1, vec2, limit); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + public static int neonVdot8s(MemorySegment vec1, MemorySegment vec2, int limit) { + var mh$ = neonVdot8s$MH(); + try { + return (int) mh$.invokeExact(vec1, vec2, limit); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + public static int dot8s(MemorySegment vec1, MemorySegment vec2, int limit) { + var mh$ = dot8s$MH(); + try { + return (int) mh$.invokeExact(vec1, vec2, limit); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + /** Ankur: Hacky code end * */ + /** * Dot product computed over signed bytes. *