From 2dce12f83652403a851b258658a7486b1561cb56 Mon Sep 17 00:00:00 2001 From: Ankur Goel Date: Sat, 14 Sep 2024 03:07:30 +0000 Subject: [PATCH] Move C code to native module and integrate Java code under java21 --- gradle/java/javac.gradle | 10 +- gradle/testing/defaults-tests.gradle | 2 - .../randomization/policies/tests.policy | 8 +- lucene/benchmark-jmh/build.gradle | 1 + .../benchmark/jmh/VectorUtilBenchmark.java | 89 +++++++++++++---- lucene/core/build.gradle | 49 +--------- .../org/apache/lucene/util/Constants.java | 4 + .../org/apache/lucene/util/VectorUtil.java | 98 +++++++------------ ...Lucene99MemorySegmentByteVectorScorer.java | 46 ++++++++- ...MemorySegmentByteVectorScorerSupplier.java | 60 +++++++++++- .../vectorization/NativeMethodHandles.java | 78 +++++++++++++++ .../PanamaVectorUtilSupport.java | 27 ++++- .../store/MemorySegmentAccessInput.java | 3 + .../lucene/store/MemorySegmentIndexInput.java | 40 ++++++++ .../vectorization/TestVectorScorer.java | 1 - lucene/misc/build.gradle | 1 - lucene/native/build.gradle | 60 ++++++++++++ lucene/{core => native}/src/c/dotProduct.c | 88 ++++++++++++----- lucene/{core => native}/src/c/dotProduct.h | 0 settings.gradle | 1 + 20 files changed, 501 insertions(+), 165 deletions(-) create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/NativeMethodHandles.java create mode 100644 lucene/native/build.gradle rename lucene/{core => native}/src/c/dotProduct.c (66%) rename lucene/{core => native}/src/c/dotProduct.h (100%) diff --git a/gradle/java/javac.gradle b/gradle/java/javac.gradle index 5149aa542fe1..537769ea1913 100644 --- a/gradle/java/javac.gradle +++ b/gradle/java/javac.gradle @@ -24,11 +24,7 @@ allprojects { project -> // Use 'release' flag instead of 'source' and 'target' tasks.withType(JavaCompile) { - options.compilerArgs += ["--release", rootProject.minJavaVersion.toString(), "--enable-preview"] - } - - tasks.withType(Test) { - jvmArgs += "--enable-preview" + options.compilerArgs += ["--release", rootProject.minJavaVersion.toString()] } // Configure warnings. @@ -76,7 +72,7 @@ allprojects { project -> "-Xdoclint:-accessibility" ] - if (project.path == ":lucene:benchmark-jmh" ) { + if (project.path == ":lucene:benchmark-jmh") { // JMH benchmarks use JMH preprocessor and incubating modules. } else { // proc:none was added because of LOG4J2-1925 / JDK-8186647 @@ -84,11 +80,9 @@ allprojects { project -> "-proc:none" ] - /** if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) { options.compilerArgs += "-Werror" } - */ } } } diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index b74536847078..1f3a7d8b1a07 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -139,8 +139,6 @@ allprojects { ":lucene:test-framework" ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core') - jvmArgs '-Djava.library.path=' + file("${buildDir}/libs/dotProduct/shared").absolutePath - def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) jvmArgumentProviders.add( diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index 6d2b60c0e9f5..380360626112 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -52,6 +52,9 @@ grant { // Needed for DirectIODirectory to retrieve block size permission java.lang.RuntimePermission "getFileStoreAttributes"; + // Needed to load native library containing optimized dot product implementation + permission java.lang.RuntimePermission "loadLibrary.dotProduct"; + // TestLockFactoriesMultiJVM opens a random port on 127.0.0.1 (port 0 = ephemeral port range): permission java.net.SocketPermission "127.0.0.1:0", "accept,listen,resolve"; // Replicator tests connect to ephemeral ports @@ -104,7 +107,10 @@ grant codeBase "file:${gradle.worker.jar}" { }; grant { - permission java.security.AllPermission; + // Allow reading gradle worker JAR. + permission java.io.FilePermission "${gradle.worker.jar}", "read"; + // Allow reading from classpath JARs (resources). + permission java.io.FilePermission "${gradle.user.home}${/}-", "read"; }; // Grant permissions to certain test-related JARs (https://github.com/apache/lucene/pull/13146) diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle index ae2120a82c6f..1751a43d7a79 100644 --- a/lucene/benchmark-jmh/build.gradle +++ b/lucene/benchmark-jmh/build.gradle @@ -38,6 +38,7 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach { ]) } + // Skip certain infrastructure tasks that we can't use or don't care about. tasks.matching { it.name in [ // Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java index 5dbff945ea06..f9e6a9005274 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java @@ -16,9 +16,9 @@ */ package org.apache.lucene.benchmark.jmh; -import java.lang.foreign.Arena; -import java.lang.foreign.MemorySegment; -import java.lang.foreign.ValueLayout; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import org.apache.lucene.util.VectorUtil; @@ -52,12 +52,11 @@ static void compressBytes(byte[] raw, byte[] compressed) { private float[] floatsB; private int expectedhalfByteDotProduct; - private MemorySegment nativeBytesA; + private Object nativeBytesA; + private Object nativeBytesB; - private MemorySegment nativeBytesB; - - // @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) - @Param({"768"}) + /** private Object nativeBytesA; private Object nativeBytesB; */ + @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) int size; @Setup(Level.Iteration) @@ -92,20 +91,76 @@ public void init() { floatsA[i] = random.nextFloat(); floatsB[i] = random.nextFloat(); } - - Arena offHeap = Arena.ofAuto(); - nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); - nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); - for (int i = 0; i < size; ++i) { - nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); - nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); + // Java 21+ specific initialization + final int runtimeVersion = Runtime.version().feature(); + if (runtimeVersion >= 21) { + // Reflection based code to eliminate the use of Preview classes in JMH benchmarks + try { + final Class vectorUtilSupportClass = VectorUtil.getVectorUtilSupportClass(); + final var className = "org.apache.lucene.internal.vectorization.PanamaVectorUtilSupport"; + if (vectorUtilSupportClass.getName().equals(className) == false) { + nativeBytesA = null; + nativeBytesB = null; + } else { + MethodHandles.Lookup lookup = MethodHandles.lookup(); + final var MemorySegment = "java.lang.foreign.MemorySegment"; + final var methodType = + MethodType.methodType(lookup.findClass(MemorySegment), byte[].class); + MethodHandle nativeMemorySegment = + lookup.findStatic(vectorUtilSupportClass, "nativeMemorySegment", methodType); + byte[] a = new byte[size]; + byte[] b = new byte[size]; + for (int i = 0; i < size; ++i) { + a[i] = (byte) random.nextInt(128); + b[i] = (byte) random.nextInt(128); + } + nativeBytesA = nativeMemorySegment.invoke(a); + nativeBytesB = nativeMemorySegment.invoke(b); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + /* + Arena offHeap = Arena.ofAuto(); + nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); + nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); + for (int i = 0; i < size; ++i) { + nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); + nativeBytesB.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); + }*/ } } + /** + * High overhead (lower score) from using NATIVE_DOT_PRODUCT.invoke(nativeBytesA, nativeBytesB). + * Both nativeBytesA and nativeBytesB are offHeap MemorySegments created by invoking the method + * PanamaVectorUtilSupport.nativeMemorySegment(byte[]) which allocated these segments and copies + * bytes from the supplied byte[] to offHeap memory. The benchmark output below shows + * significantly more overhead. NOTE: Return type of dots8s() was set to void for the + * benchmark run to avoid boxing/unboxing overhead. + * + *
+   * Benchmark                  (size)   Mode  Cnt   Score   Error   Units
+   * VectorUtilBenchmark.dot8s     768  thrpt   15  36.406 ± 0.496  ops/us
+   * 
+ * + * Much lower overhead was observed when preview APIs were used directly in JMH benchmarking code + * and exact method invocation was made as shown below return (int) + * VectorUtil.NATIVE_DOT_PRODUCT.invokeExact(nativeBytesA, nativeBytesB); + * + *
+   * Benchmark                  (size)   Mode  Cnt   Score   Error   Units
+   * VectorUtilBenchmark.dot8s     768   thrpt   15   43.662 ± 0.818  ops/us
+   * 
+ */ @Benchmark @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) - public int dot8s() { - return VectorUtil.dot8s(nativeBytesA, nativeBytesB, size); + public void dot8s() { + try { + VectorUtil.NATIVE_DOT_PRODUCT.invoke(nativeBytesA, nativeBytesB); + } catch (Throwable e) { + throw new RuntimeException(e); + } } @Benchmark diff --git a/lucene/core/build.gradle b/lucene/core/build.gradle index e579582391b5..4b33293a45f8 100644 --- a/lucene/core/build.gradle +++ b/lucene/core/build.gradle @@ -14,54 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -plugins { - id "c" -} apply plugin: 'java-library' -apply plugin: 'c' description = 'Lucene core library' -model { - binaries { - all { - cCompiler.args "--shared", "-O3", "-march=native", "-funroll-loops" - } - } - - toolChains { - gcc(Gcc) { - target("linux_aarch64") { - cCompiler.executable = System.getenv("CC") - } - } - clang(Clang) { - target("osx_aarch64"){ - cCompiler.executable = System.getenv("CC") - } - } - } - - components { - dotProduct(NativeLibrarySpec) { - sources { - c { - source { - srcDir 'src/c' // Path to your C source files - include "**/*.c" - } - exportedHeaders { - srcDir "src/c" - include "**/*.h" - } - } - } - } - } - -} - -test.dependsOn 'dotProductSharedLibrary' dependencies { moduleTestImplementation project(':lucene:codecs') @@ -69,8 +25,11 @@ dependencies { } test { + build { + dependsOn ':lucene:native:build' + } systemProperty( "java.library.path", - file("${buildDir}/libs/dotProduct/shared").absolutePath + project(":lucene:native").layout.buildDirectory.get().asFile.absolutePath + "/libs/dotProduct/shared" ) } diff --git a/lucene/core/src/java/org/apache/lucene/util/Constants.java b/lucene/core/src/java/org/apache/lucene/util/Constants.java index ac6604e8ea33..722eed9bb80b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Constants.java +++ b/lucene/core/src/java/org/apache/lucene/util/Constants.java @@ -100,6 +100,10 @@ private static boolean is64Bit() { /** true iff we know VFMA has faster throughput than separate vmul/vadd. */ public static final boolean HAS_FAST_VECTOR_FMA = hasFastVectorFMA(); + // TODO: && Boolean.parseBoolean(getSysProp("lucene.useNativeDotProduct", + // "False") + public static final boolean NATIVE_DOT_PRODUCT_ENABLED = OS_ARCH.equalsIgnoreCase("aarch64"); + /** true iff we know FMA has faster throughput than separate mul/add. */ public static final boolean HAS_FAST_SCALAR_FMA = hasFastScalarFMA(); diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java index 9f43747fb5f5..553214442cc0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java @@ -17,11 +17,9 @@ package org.apache.lucene.util; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT; - -import java.lang.foreign.*; import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; import org.apache.lucene.internal.vectorization.VectorUtilSupport; import org.apache.lucene.internal.vectorization.VectorizationProvider; @@ -54,11 +52,41 @@ public final class VectorUtil { private static final float EPSILON = 1e-4f; - private static final VectorUtilSupport IMPL = + public static final VectorUtilSupport IMPL = VectorizationProvider.getInstance().getVectorUtilSupport(); + // TODO: Harden this implementation and may be find a new home for this + private static MethodHandle nativeDotProduct() { + try { + final var PanamaVectorUtilSupport = + "org.apache.lucene.internal.vectorization.PanamaVectorUtilSupport"; + if (!IMPL.getClass().getName().equals(PanamaVectorUtilSupport)) { + return null; + } + MethodHandles.Lookup lookup = MethodHandles.lookup(); + final var MemorySegment = "java.lang.foreign.MemorySegment"; + final var methodType = + MethodType.methodType( + int.class, lookup.findClass(MemorySegment), lookup.findClass(MemorySegment)); + return lookup.findStatic(IMPL.getClass(), "nativeDotProduct", methodType); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public static final MethodHandle NATIVE_DOT_PRODUCT = nativeDotProduct(); + private VectorUtil() {} + /* + Used in o.a.l.benchmark.jmh.VectorUtilBenchmark to create test vectors + in off-heap MemorySegments IF VectorUtilSupport instance supports + Panama APIs. + */ + public static Class getVectorUtilSupportClass() { + return IMPL.getClass(); + } + /** * Returns the vector dot product of the two vectors. * @@ -173,62 +201,6 @@ public static void add(float[] u, float[] v) { } } - /** Ankur: Hacky code start */ - public static final AddressLayout POINTER = - ValueLayout.ADDRESS.withTargetLayout(MemoryLayout.sequenceLayout(JAVA_BYTE)); - - private static final Linker LINKER = Linker.nativeLinker(); - private static final SymbolLookup SYMBOL_LOOKUP; - - static { - System.loadLibrary("dotProduct"); - SymbolLookup loaderLookup = SymbolLookup.loaderLookup(); - SYMBOL_LOOKUP = name -> loaderLookup.find(name).or(() -> LINKER.defaultLookup().find(name)); - } - - static final FunctionDescriptor dot8sDesc = - FunctionDescriptor.of(JAVA_INT, POINTER, POINTER, JAVA_INT); - - static final MethodHandle dot8sMH = - SYMBOL_LOOKUP.find("dot8s").map(addr -> LINKER.downcallHandle(addr, dot8sDesc)).orElse(null); - - static final MethodHandle neonVdot8sMH = - SYMBOL_LOOKUP - .find("vdot8s_neon") - .map(addr -> LINKER.downcallHandle(addr, dot8sDesc)) - .orElse(null); - - static final MethodHandle sveVdot8sMH = - SYMBOL_LOOKUP - .find("vdot8s_sve") - .map(addr -> LINKER.downcallHandle(addr, dot8sDesc)) - .orElse(null); - - /* chosen C implementation */ - static final MethodHandle dot8sImpl; - - static { - if (sveVdot8sMH != null) { - dot8sImpl = sveVdot8sMH; - } else if (neonVdot8sMH != null) { - dot8sImpl = neonVdot8sMH; - } else if (dot8sMH != null) { - dot8sImpl = dot8sMH; - } else { - throw new RuntimeException("c code was not linked!"); - } - } - - public static int dot8s(MemorySegment vec1, MemorySegment vec2, int limit) { - try { - return (int) dot8sImpl.invokeExact(vec1, vec2, limit); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - /** Ankur: Hacky code end * */ - /** * Dot product computed over signed bytes. * @@ -339,7 +311,9 @@ static int xorBitCountLong(byte[] a, byte[] b) { public static float dotProductScore(byte[] a, byte[] b) { // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len float denom = (float) (a.length * (1 << 15)); - return 0.5f + dotProduct(a, b) / denom; + + int raw = dotProduct(a, b); + return 0.5f + raw / denom; } /** diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java index b65f1e570921..bd7a0a7cb75a 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java @@ -17,7 +17,9 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; import java.util.Optional; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.KnnVectorValues; @@ -25,6 +27,7 @@ import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.hnsw.RandomVectorScorer; abstract sealed class Lucene99MemorySegmentByteVectorScorer @@ -34,6 +37,10 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer final MemorySegmentAccessInput input; final MemorySegment query; byte[] scratch; + MemorySegment offHeapScratch; + MemorySegment offHeapQuery; + + private static Arena offHeap; /** * Return an optional whose value, if present, is the scorer. Otherwise, an empty optional is @@ -47,9 +54,13 @@ public static Optional create( return Optional.empty(); } checkInvariants(values.size(), values.getVectorByteLength(), input); + offHeap = Arena.ofAuto(); return switch (type) { case COSINE -> Optional.of(new CosineScorer(msInput, values, queryVector)); - case DOT_PRODUCT -> Optional.of(new DotProductScorer(msInput, values, queryVector)); + case DOT_PRODUCT -> + Constants.NATIVE_DOT_PRODUCT_ENABLED == false + ? Optional.of(new DotProductScorer(msInput, values, queryVector)) + : Optional.of(new NativeDotProductScorer(msInput, values, queryVector)); case EUCLIDEAN -> Optional.of(new EuclideanScorer(msInput, values, queryVector)); case MAXIMUM_INNER_PRODUCT -> Optional.of(new MaxInnerProductScorer(msInput, values, queryVector)); @@ -64,6 +75,19 @@ public static Optional create( this.query = MemorySegment.ofArray(queryVector); } + final MemorySegment getNativeSegment(int ord) throws IOException { + long byteOffset = (long) ord * vectorByteSize; + MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); + if (seg == null) { + if (offHeapScratch == null) { + offHeapScratch = offHeap.allocate(vectorByteSize, ValueLayout.JAVA_BYTE.byteAlignment()); + } + input.readBytes(byteOffset, offHeapScratch, 0, vectorByteSize); + seg = offHeapScratch; + } + return seg; + } + final MemorySegment getSegment(int ord) throws IOException { checkOrdinal(ord); long byteOffset = (long) ord * vectorByteSize; @@ -103,6 +127,26 @@ public float score(int node) throws IOException { } } + static final class NativeDotProductScorer extends Lucene99MemorySegmentByteVectorScorer { + + NativeDotProductScorer( + MemorySegmentAccessInput input, KnnVectorValues values, byte[] queryVector) { + super(input, values, queryVector); + if (offHeapQuery == null) { + offHeapQuery = offHeap.allocate(vectorByteSize, ValueLayout.JAVA_BYTE.byteAlignment()); + } + offHeapQuery.copyFrom(query); + } + + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len + int raw = PanamaVectorUtilSupport.nativeDotProduct(offHeapQuery, getNativeSegment(node)); + return 0.5f + raw / (float) (query.byteSize() * (1 << 15)); + } + } + static final class DotProductScorer extends Lucene99MemorySegmentByteVectorScorer { DotProductScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index 02c71561122d..f1193577d369 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -17,7 +17,9 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; import java.util.Optional; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.KnnVectorValues; @@ -25,6 +27,7 @@ import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -36,6 +39,11 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier final MemorySegmentAccessInput input; final KnnVectorValues values; // to support ordToDoc/getAcceptOrds byte[] scratch1, scratch2; + MemorySegment[] offHeapScratch; + private static Arena offHeap; + + private static final int FIRST_OFFHEAP_SCRATCH = 0; + private static final int SECOND_OFFHEAP_SCRATCH = 1; /** * Return an optional whose value, if present, is the scorer supplier. Otherwise, an empty @@ -51,7 +59,10 @@ static Optional create( checkInvariants(values.size(), values.getVectorByteLength(), input); return switch (type) { case COSINE -> Optional.of(new CosineSupplier(msInput, values)); - case DOT_PRODUCT -> Optional.of(new DotProductSupplier(msInput, values)); + case DOT_PRODUCT -> + Constants.NATIVE_DOT_PRODUCT_ENABLED == false + ? Optional.of(new DotProductSupplier(msInput, values)) + : Optional.of(new NativeDotProductSupplier(msInput, values)); case EUCLIDEAN -> Optional.of(new EuclideanSupplier(msInput, values)); case MAXIMUM_INNER_PRODUCT -> Optional.of(new MaxInnerProductSupplier(msInput, values)); }; @@ -77,6 +88,21 @@ final void checkOrdinal(int ord) { } } + final MemorySegment getNativeSegment(int ord, int sid) throws IOException { + long byteOffset = (long) ord * vectorByteSize; + MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); + if (seg == null) { + if (offHeapScratch[sid] + == null) { // Should be rare, this means current vector was split across memory segments + offHeapScratch[sid] = + offHeap.allocate(vectorByteSize, ValueLayout.JAVA_BYTE.byteAlignment()); + } + input.readBytes(byteOffset, offHeapScratch[sid], 0, vectorByteSize); + seg = offHeapScratch[sid]; + } + return seg; + } + final MemorySegment getFirstSegment(int ord) throws IOException { long byteOffset = (long) ord * vectorByteSize; MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); @@ -128,6 +154,38 @@ public CosineSupplier copy() throws IOException { } } + static final class NativeDotProductSupplier + extends Lucene99MemorySegmentByteVectorScorerSupplier { + + NativeDotProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { + super(input, values); + offHeap = Arena.ofAuto(); + offHeapScratch = new MemorySegment[2]; + } + + @Override + public RandomVectorScorer scorer(int ord) { + checkOrdinal(ord); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len + int raw = + PanamaVectorUtilSupport.nativeDotProduct( + getNativeSegment(ord, FIRST_OFFHEAP_SCRATCH), + getNativeSegment(node, SECOND_OFFHEAP_SCRATCH)); + return 0.5f + raw / (float) (values.dimension() * (1 << 15)); + } + }; + } + + @Override + public NativeDotProductSupplier copy() throws IOException { + return new NativeDotProductSupplier(input.clone(), values); + } + } + static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { DotProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/NativeMethodHandles.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/NativeMethodHandles.java new file mode 100644 index 000000000000..6e3b613ec6bf --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/NativeMethodHandles.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import static java.lang.foreign.ValueLayout.JAVA_BYTE; +import static java.lang.foreign.ValueLayout.JAVA_INT; + +import java.lang.foreign.AddressLayout; +import java.lang.foreign.FunctionDescriptor; +import java.lang.foreign.Linker; +import java.lang.foreign.MemoryLayout; +import java.lang.foreign.SymbolLookup; +import java.lang.foreign.ValueLayout; +import java.lang.invoke.MethodHandle; + +public final class NativeMethodHandles { + + private NativeMethodHandles() {} + + public static final AddressLayout POINTER = + ValueLayout.ADDRESS.withTargetLayout(MemoryLayout.sequenceLayout(JAVA_BYTE)); + + private static final Linker LINKER = Linker.nativeLinker(); + private static final SymbolLookup SYMBOL_LOOKUP; + + static { + System.loadLibrary("dotProduct"); + SymbolLookup loaderLookup = SymbolLookup.loaderLookup(); + SYMBOL_LOOKUP = name -> loaderLookup.find(name).or(() -> LINKER.defaultLookup().find(name)); + } + + private static final FunctionDescriptor dot8sDesc = + FunctionDescriptor.of(JAVA_INT, POINTER, POINTER, JAVA_INT); + + private static final MethodHandle dot8sMH = + SYMBOL_LOOKUP.find("dot8s").map(addr -> LINKER.downcallHandle(addr, dot8sDesc)).orElse(null); + + private static final MethodHandle neonVdot8sMH = + SYMBOL_LOOKUP + .find("vdot8s_neon") + .map(addr -> LINKER.downcallHandle(addr, dot8sDesc)) + .orElse(null); + + private static final MethodHandle sveVdot8sMH = + SYMBOL_LOOKUP + .find("vdot8s_sve") + .map(addr -> LINKER.downcallHandle(addr, dot8sDesc)) + .orElse(null); + + /* chosen C implementation */ + static final MethodHandle DOT_PRODUCT_IMPL; + + static { + if (sveVdot8sMH != null) { + DOT_PRODUCT_IMPL = sveVdot8sMH; + } else if (neonVdot8sMH != null) { + DOT_PRODUCT_IMPL = neonVdot8sMH; + } else if (dot8sMH != null) { + DOT_PRODUCT_IMPL = dot8sMH; + } else { + throw new RuntimeException("c code was not linked!"); + } + } +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index ad2dff11cea1..bd4e800271cc 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -25,6 +25,7 @@ import static jdk.incubator.vector.VectorOperators.S2I; import static jdk.incubator.vector.VectorOperators.ZERO_EXTEND_B2S; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import jdk.incubator.vector.ByteVector; import jdk.incubator.vector.FloatVector; @@ -48,7 +49,7 @@ * * Setting these properties will make this code run EXTREMELY slow! */ -final class PanamaVectorUtilSupport implements VectorUtilSupport { +public final class PanamaVectorUtilSupport implements VectorUtilSupport { // preferred vector sizes, which can be altered for testing private static final VectorSpecies FLOAT_SPECIES; @@ -58,6 +59,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { private static final VectorSpecies SHORT_SPECIES; static final int VECTOR_BITSIZE; + private static final Arena offHeap = Arena.ofConfined(); static { VECTOR_BITSIZE = PanamaVectorConstants.PREFERRED_VECTOR_BITSIZE; @@ -303,6 +305,29 @@ public int dotProduct(byte[] a, byte[] b) { return dotProduct(MemorySegment.ofArray(a), MemorySegment.ofArray(b)); } + /** + * For use in JMH benchmarking. + * + * @param b byte[] whose contents will be copied to offHeap memory + * @return offHeap memory segment + */ + public static MemorySegment nativeMemorySegment(byte[] b) { + MemorySegment seg = offHeap.allocate(b.length, JAVA_BYTE.byteAlignment()); + seg.copyFrom(MemorySegment.ofArray(b)); + return seg; + } + + public static int nativeDotProduct(MemorySegment a, MemorySegment b) { + assert a.byteSize() == b.byteSize(); + try { + int limit = (int) a.byteSize(); + int score = (int) NativeMethodHandles.DOT_PRODUCT_IMPL.invokeExact(a, b, limit); + return score; + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + public static int dotProduct(MemorySegment a, MemorySegment b) { assert a.byteSize() == b.byteSize(); int i = 0; diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java index 8b6452a748ba..6c3063f49dd7 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java @@ -29,5 +29,8 @@ public interface MemorySegmentAccessInput extends RandomAccessInput, Cloneable { /** Returns the memory segment for a given position and length, or null. */ MemorySegment segmentSliceOrNull(long pos, long len) throws IOException; + /** Copy bytes from underlying MemorySegment to another memory segment */ + void readBytes(long pos, MemorySegment dst, int offset, int len) throws IOException; + MemorySegmentAccessInput clone(); } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 8bb70ba009dd..39002cdd3027 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -435,6 +435,32 @@ public void readGroupVInt(long[] dst, int offset) throws IOException { } } + @Override + public void readBytes(long pos, MemorySegment dst, int offset, int len) throws IOException { + try { + int si = (int) (pos >> chunkSizePower); + pos = pos & chunkSizeMask; + long curAvail = segments[si].byteSize() - pos; + while (len > curAvail) { + MemorySegment.copy( + segments[si], LAYOUT_BYTE, pos, dst, LAYOUT_BYTE, offset, (int) curAvail); + len -= curAvail; + offset += curAvail; + si++; + if (si >= segments.length) { + throw new EOFException("read past EOF: " + this); + } + pos = 0L; + curAvail = segments[si].byteSize(); + } + MemorySegment.copy(segments[si], LAYOUT_BYTE, pos, dst, LAYOUT_BYTE, offset, len); + } catch (IndexOutOfBoundsException ioobe) { + throw handlePositionalIOOBE(ioobe, "read", pos); + } catch (NullPointerException | IllegalStateException e) { + throw alreadyClosed(e); + } + } + @Override public void readBytes(long pos, byte[] b, int offset, int len) throws IOException { try { @@ -801,6 +827,20 @@ public void readBytes(long pos, byte[] bytes, int offset, int length) throws IOE super.readBytes(pos + this.offset, bytes, offset, length); } + /** + * Override superclass method to add segment specific offset to position. + * + * @param pos + * @param dst + * @param offset + * @param len + * @throws IOException + */ + @Override + public void readBytes(long pos, MemorySegment dst, int offset, int len) throws IOException { + super.readBytes(pos + this.offset, dst, offset, len); + } + @Override public short readShort(long pos) throws IOException { return super.readShort(pos + offset); diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index bc3b6813a5be..635a64ded092 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -110,7 +110,6 @@ void testSimpleScorer(long maxChunkSize) throws IOException { float expected = scorer1.scorer(idx0).score(idx1); var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); assertEquals(scorer2.scorer(idx0).score(idx1), expected, DELTA); - // getRandomVectorScorer var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); assertEquals(scorer3.score(idx1), expected, DELTA); diff --git a/lucene/misc/build.gradle b/lucene/misc/build.gradle index a21dfeaafa7e..a0175b449e67 100644 --- a/lucene/misc/build.gradle +++ b/lucene/misc/build.gradle @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - apply plugin: 'java-library' description = 'Index tools and other miscellaneous code' diff --git a/lucene/native/build.gradle b/lucene/native/build.gradle new file mode 100644 index 000000000000..35470e46bec5 --- /dev/null +++ b/lucene/native/build.gradle @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +plugins { + id 'c' +} +apply plugin: 'c' +description = 'Native C code used by KNN scoring components' + +model { + binaries { + all { + cCompiler.args "--shared", "-O3", "-march=native", "-funroll-loops" + } + } + + toolChains { + gcc(Gcc) { + target("linux_aarch64") { + cCompiler.executable = System.getenv("CC") + } + } + clang(Clang) { + target("osx_aarch64"){ + cCompiler.executable = System.getenv("CC") + } + } + } + + components { + dotProduct(NativeLibrarySpec) { + sources { + c { + source { + srcDir 'src/c' // Path to your C source files + include "**/*.c" + } + exportedHeaders { + srcDir "src/c" + include "**/*.h" + } + } + } + } + } + +} diff --git a/lucene/core/src/c/dotProduct.c b/lucene/native/src/c/dotProduct.c similarity index 66% rename from lucene/core/src/c/dotProduct.c rename to lucene/native/src/c/dotProduct.c index 3fe0a176a54a..02ea8c3a30dd 100644 --- a/lucene/core/src/c/dotProduct.c +++ b/lucene/native/src/c/dotProduct.c @@ -16,6 +16,7 @@ */ #include #include +#include #include "dotProduct.h" #ifdef __ARM_ACLE @@ -24,15 +25,29 @@ #if (defined(__ARM_FEATURE_SVE)) #include + +/** + - ARM intrinsics guide - https://developer.arm.com/architectures/instruction-sets/intrinsics/#q=svptrue + - SVE Programming examples - https://developer.arm.com/documentation/dai0548/latest/ +*/ +void dump(int8_t vec[], int N) { + printf("["); + for (int i = 0; i < N ; i++) { + printf("%d,",vec[i]); + } + printf("]\n"); +} /* * Unrolled and vectorized int8 dotProduct implementation using SVE instructions * NOTE: Clang 15.0 compiler on Apple M3 Max compiles the code below successfully * with '-march=native+sve' option but throws "Illegal Hardware Instruction" error * Looks like Apple M3 does not implement SVE and Apple's official documentation - * is not explicit about this or at least I could not find it. + * is not explicit about this or at least I could not find it. * */ int32_t vdot8s_sve(int8_t vec1[], int8_t vec2[], int32_t limit) { + // printf("Vector1: "); dump(vec1, limit); + // printf("Vector2: "); dump(vec2, limit); int32_t result = 0; int32_t i = 0; // Vectors of 8-bit signed integers @@ -51,9 +66,8 @@ int32_t vdot8s_sve(int8_t vec1[], int8_t vec2[], int32_t limit) { for (i = 0; i + 4 * vec_length <= limit; i += 4 * vec_length) { // Load vectors into the Z registers which can range from 128-bit to 2048-bit wide // The predicate register - P determines which bytes are active - // svptrue_b8() returns a predictae in which every element is true - // - va1 = svld1_s8(svptrue_b8(), &vec1[i]); + // svptrue_b8() returns a predicate in which every element is true + va1 = svld1_s8(svptrue_b8(), &vec1[i]); vb1 = svld1_s8(svptrue_b8(), &vec2[i]); va2 = svld1_s8(svptrue_b8(), &vec1[i + vec_length]); @@ -73,19 +87,23 @@ int32_t vdot8s_sve(int8_t vec1[], int8_t vec2[], int32_t limit) { } // Vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i+= vec_length) { - va1 = svld1_s8(svptrue_b8(), &vec1[i]); - vb1 = svld1_s8(svptrue_b8(), &vec2[i]); + svbool_t pred = svwhilelt_b8_s32(i, limit); + while (svptest_first(svptrue_b8(), pred)) { + va1 = svld1_s8(pred, &vec1[i]); + vb1 = svld1_s8(pred, &vec2[i]); acc1 = svdot_s32(acc1, va1, vb1); + i += vec_length; + pred = svwhilelt_b8_s32(i, limit); } - // Add correspponding active elements in each of the vectors - acc1 = svadd_s32_x(svptrue_b8() , acc1, acc2); - acc3 = svadd_s32_x(svptrue_b8() , acc3, acc4); + // Add corresponding active elements in each of the vectors + acc1 = svadd_s32_x(svptrue_b8(), acc1, acc2); + acc3 = svadd_s32_x(svptrue_b8(), acc3, acc4); acc1 = svadd_s32_x(svptrue_b8(), acc1, acc3); // REDUCE: Add every vector element in target and write result to scalar result = svaddv_s32(svptrue_b8(), acc1); + return result; } #endif /* SVE code */ @@ -102,7 +120,7 @@ int32_t vdot8s_neon(int8_t vec1[], int8_t vec2[], int32_t limit) { int32_t i = 0; int8x16_t va1, va2, va3, va4; int8x16_t vb1, vb2, vb3, vb4; - + for (; i + 64 <= limit; i += 64) { // Read into 8 (bit) x 16 (values) vector va1 = vld1q_s8(&vec1[i]); @@ -124,14 +142,12 @@ int32_t vdot8s_neon(int8_t vec1[], int8_t vec2[], int32_t limit) { acc3 = vdotq_s32(acc3, va3, vb3); acc4 = vdotq_s32(acc4, va4, vb4); } - // Vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += 16) { + for (; i < limit - 16; i += 16) { va1 = vld1q_s8(&vec1[i]); vb1 = vld1q_s8(&vec2[i]); acc1 = vdotq_s32(acc1, va1, vb1); } - // Add corresponding elements in each vectors acc1 = vaddq_s32(acc1, acc2); acc3 = vaddq_s32(acc3, acc4); @@ -139,7 +155,10 @@ int32_t vdot8s_neon(int8_t vec1[], int8_t vec2[], int32_t limit) { // REDUCE: Add every vector element in target and write result to scalar result += vaddvq_s32(acc1); - + // Scalar tail + for (; i < limit; i++) { + result += vec1[i] * vec2[i]; + } return result; } #endif /* __ARM_NEON */ @@ -156,16 +175,35 @@ int32_t dot8s(int8_t vec1[], int8_t vec2[], int32_t limit) { } -int main(int argc, const char* arrgs[]) { - int s = 157; - int8_t a[s]; - int8_t b[s]; - for (int i = 0; i < s; i++) { - a[i] = 2; - b[i] = 3; +int main(int argc, const char* args[]) { + int DIMENSIONS = 1024; + for (int s = 1; s < DIMENSIONS; s++) { + int8_t *a = malloc(sizeof(int8_t) * s); + int8_t *b = malloc(sizeof(int8_t) * s); + for (int i = 0; i < s; i++) { + a[i] = i % 128; + b[i] = (s - i) % 128; + } + + //printf("\nVector length: %d bits", svcntb() * 8); + int sveDot = vdot8s_sve(a, b, s); + int vdotNeon = vdot8s_neon(a, b, s); + int dot = dot8s(a, b, s); + if(vdotNeon != dot || sveDot != dot) { + printf("\n-----------------------------------------"); + printf("\nDimension: [%d]; Sum (Vectorized - SVE) = %d", s, sveDot); + printf("\nDimension: [%d]; Sum (Vectorized - NEON) = %d", s, vdot8s_neon(a, b, s)); + printf("\nDimension: [%d]; Sum (Scalar) = %d", s, dot); + } + + free(a); + free(b); } - // printf("Sum (Vectorized - SVE) = %d\n", vdot8s_sve(a, b, s)); - printf("Dimension: [%d]; Sum (Vectorized - NEON) = %d\n", s, vdot8s_neon(a, b, s)); - printf("Dimension: [%d]; Sum (Scalar) = %d\n", s, dot8s(a, b, s)); + + int8_t a[30] = { 24, -48, 5, 62, -30, -41, 33, -14, -76, -127, -59, -73, -108, -82, 52, -103, -48, -6, 16, -24, -6, 28, 58, 117, -34, -15, 15, -113, 98, 104 }; + int8_t b[30] = { -54, 47, -81, 1, -72, -12, -5, -92, 61, -64, -40, 5, -43, -62, 14, -9, -37, -82, -7, 123, 59, 46, -83, -116, -70, 42, -120, 63, 48, 8 }; + printf("\nTest Complete for [1-%d] dimensional vectors\n", DIMENSIONS); + printf("\n Scalar Dot Product: %d\n", dot8s(a,b, 30)); + printf("\n SVE Dot Product: %d\n", vdot8s_neon(a,b, 30)); } diff --git a/lucene/core/src/c/dotProduct.h b/lucene/native/src/c/dotProduct.h similarity index 100% rename from lucene/core/src/c/dotProduct.h rename to lucene/native/src/c/dotProduct.h diff --git a/settings.gradle b/settings.gradle index f4ee13243ca6..a5573642695c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -86,3 +86,4 @@ include "lucene:spatial-extras" include "lucene:spatial-test-fixtures" include "lucene:suggest" include "lucene:test-framework" +include "lucene:native" \ No newline at end of file