Skip to content

Commit

Permalink
New JMH benchmark methods - vdot8s, neonVdot8s and sveVdot8s that imp…
Browse files Browse the repository at this point in the history
…lement int8 dotProduct in C using Neon and SVE intrinsics respectively. Fallback to Neon if SVE instructions are not supported by target platform
  • Loading branch information
Ankur Goel authored and Focus committed Jul 30, 2024
1 parent 8a7d484 commit e45224b
Show file tree
Hide file tree
Showing 10 changed files with 340 additions and 8 deletions.
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import java.time.format.DateTimeFormatter
plugins {
id "base"
id "lucene.build-infra"
id "c"

alias(deps.plugins.dependencychecks)
alias(deps.plugins.spotless) apply false
Expand All @@ -34,6 +35,7 @@ plugins {
alias(deps.plugins.jacocolog) apply false
}


apply from: file('gradle/globals.gradle')

// General metadata.
Expand Down
10 changes: 8 additions & 2 deletions gradle/java/javac.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ allprojects { project ->

// Use 'release' flag instead of 'source' and 'target'
tasks.withType(JavaCompile) {
options.compilerArgs += ["--release", rootProject.minJavaVersion.toString()]
options.compilerArgs += ["--release", rootProject.minJavaVersion.toString(), "--enable-preview"]
}

tasks.withType(Test) {
jvmArgs += "--enable-preview"
}

// Configure warnings.
Expand Down Expand Up @@ -72,17 +76,19 @@ allprojects { project ->
"-Xdoclint:-accessibility"
]

if (project.path == ":lucene:benchmark-jmh") {
if (project.path == ":lucene:benchmark-jmh" ) {
// JMH benchmarks use JMH preprocessor and incubating modules.
} else {
// proc:none was added because of LOG4J2-1925 / JDK-8186647
options.compilerArgs += [
"-proc:none"
]

/**
if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) {
options.compilerArgs += "-Werror"
}
*/
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions gradle/testing/defaults-tests.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ allprojects {
":lucene:test-framework"
] ? 'ALL-UNNAMED' : 'org.apache.lucene.core')

jvmArgs '-Djava.library.path=' + file("${buildDir}/libs/dotProduct/shared").absolutePath

def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties")
def tempDir = layout.projectDirectory.dir(testsTmpDir.toString())
jvmArgumentProviders.add(
Expand Down
5 changes: 1 addition & 4 deletions gradle/testing/randomization/policies/tests.policy
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,7 @@ grant codeBase "file:${gradle.worker.jar}" {
};

grant {
// Allow reading gradle worker JAR.
permission java.io.FilePermission "${gradle.worker.jar}", "read";
// Allow reading from classpath JARs (resources).
permission java.io.FilePermission "${gradle.user.home}${/}-", "read";
permission java.security.AllPermission;
};

// Grant permissions to certain test-related JARs (https://github.com/apache/lucene/pull/13146)
Expand Down
1 change: 0 additions & 1 deletion lucene/benchmark-jmh/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
])
}


// Skip certain infrastructure tasks that we can't use or don't care about.
tasks.matching { it.name in [
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
*/
package org.apache.lucene.benchmark.jmh;

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.VectorUtil;
Expand Down Expand Up @@ -49,7 +52,12 @@ static void compressBytes(byte[] raw, byte[] compressed) {
private float[] floatsB;
private int expectedhalfByteDotProduct;

@Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
private MemorySegment nativeBytesA;

private MemorySegment nativeBytesB;

// @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
@Param({"768"})
int size;

@Setup(Level.Iteration)
Expand Down Expand Up @@ -84,6 +92,32 @@ public void init() {
floatsA[i] = random.nextFloat();
floatsB[i] = random.nextFloat();
}

// Off-heap operands for the native dot-product benchmarks. Segments returned by
// Arena.allocate are zero-initialized, so BOTH segments must be filled explicitly;
// otherwise every native dot product would trivially evaluate to 0.
Arena offHeap = Arena.ofAuto();
nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
for (int i = 0; i < size; ++i) {
  // Values are drawn from [0, 128), i.e. non-negative signed bytes.
  nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
  nativeBytesB.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
}
}

/**
 * Benchmarks {@code VectorUtil.sveVdot8s} on the two off-heap byte segments filled during
 * setup, passing {@code size} as the length argument.
 *
 * @return the value produced by {@code VectorUtil.sveVdot8s}, returned so that JMH consumes it
 */
@Benchmark
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public int sveVdot8s() {
  final int dotProduct = VectorUtil.sveVdot8s(nativeBytesA, nativeBytesB, size);
  return dotProduct;
}

/**
 * Benchmarks {@code VectorUtil.neonVdot8s} on the two off-heap byte segments filled during
 * setup, passing {@code size} as the length argument.
 *
 * @return the value produced by {@code VectorUtil.neonVdot8s}, returned so that JMH consumes it
 */
@Benchmark
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public int neonVdot8s() {
  final int dotProduct = VectorUtil.neonVdot8s(nativeBytesA, nativeBytesB, size);
  return dotProduct;
}

/**
 * Benchmarks {@code VectorUtil.dot8s} on the two off-heap byte segments filled during setup,
 * passing {@code size} as the length argument.
 *
 * @return the value produced by {@code VectorUtil.dot8s}, returned so that JMH consumes it
 */
@Benchmark
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public int dot8s() {
  final int dotProduct = VectorUtil.dot8s(nativeBytesA, nativeBytesB, size);
  return dotProduct;
}

@Benchmark
Expand Down
57 changes: 57 additions & 0 deletions lucene/core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,69 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
plugins {
id "c"
}

apply plugin: 'java-library'
apply plugin: 'c'

description = 'Lucene core library'
// Native build configuration (Gradle software model) for the "dotProduct" C library.
// Declares the C toolchains to use per target platform and the native component
// whose sources live under src/c.
model {
toolChains {
// GCC is configured for the linux_aarch64 target only.
gcc(Gcc) {
target("linux_aarch64"){
// NOTE(review): the toolchain directory and compiler binary name below are
// hard-coded for one specific build host; other machines will need different values.
path '/usr/bin/'
cCompiler.executable 'gcc10-cc'
// Extra compiler flags. "-march=native" makes the compiler target the CPU of the
// build machine, so the resulting library is not portable across CPU generations.
cCompiler.withArguments { args ->
args << "--shared"
<< "-O3"
<< "-march=native"
<< "-funroll-loops"
}
}
}
// Clang is configured for the osx_aarch64 target only, with the same flags as GCC above.
clang(Clang) {
target("osx_aarch64"){
cCompiler.withArguments { args ->
args << "--shared"
<< "-O3"
<< "-march=native"
<< "-funroll-loops"
}
}
}
}

components {
// Native library component named "dotProduct"; the 'dotProductSharedLibrary' task
// referenced elsewhere in this script is derived from this component name.
dotProduct(NativeLibrarySpec) {
sources {
c {
// All .c files under src/c are compiled into the library.
source {
srcDir 'src/c' // Path to your C source files
include "**/*.c"
}
// All .h files under src/c are exported as the library's public headers.
exportedHeaders {
srcDir "src/c"
include "**/*.h"
}
}
}
}
}

}

test.dependsOn 'dotProductSharedLibrary'

dependencies {
moduleTestImplementation project(':lucene:codecs')
moduleTestImplementation project(':lucene:test-framework')
}

test {
  // Expose the directory holding the dotProduct shared library to the test JVM
  // through the java.library.path system property.
  def nativeLibraryDir = file("${buildDir}/libs/dotProduct/shared")
  systemProperty "java.library.path", nativeLibraryDir.absolutePath
}
143 changes: 143 additions & 0 deletions lucene/core/src/c/dotProduct.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// dotProduct.c

#include <stdio.h>
#include <arm_neon.h>

#ifdef __ARM_ACLE
#include <arm_acle.h>
#endif

#if (defined(__ARM_FEATURE_SVE) && !defined(__APPLE__))
#include <arm_sve.h>
/*
 * Unrolled and vectorized int8 dot product implemented with SVE intrinsics.
 *
 * NOTE: Clang 15.0 on an Apple M3 Max compiles the code below successfully
 * with the '-march=native+sve' option, but running it fails with an
 * "Illegal Hardware Instruction" error. It looks like the Apple M3 does not
 * implement SVE; Apple's official documentation is not explicit about this,
 * or at least I could not find it.
 *
 * vec1, vec2: input arrays holding at least `limit` signed bytes each.
 * limit:      number of elements to multiply and sum.
 * Returns the sum of vec1[k] * vec2[k] for k in [0, limit).
 */
int32_t vdot8s_sve(int8_t *vec1, int8_t *vec2, int32_t limit) {
    // Number of 8-bit elements that fit in one SVE vector on this hardware.
    int32_t lanes = svcntb();
    // The main loop is manually unrolled four times.
    int32_t stride = 4 * lanes;

    // Predicate in which every element is active. The P register decides which
    // bytes of a Z register (128 to 2048 bits wide) take part in an operation.
    svbool_t all_active = svptrue_b8();

    // Four independent 32-bit accumulators, one per unrolled step.
    svint32_t sum0 = svdup_n_s32(0);
    svint32_t sum1 = svdup_n_s32(0);
    svint32_t sum2 = svdup_n_s32(0);
    svint32_t sum3 = svdup_n_s32(0);

    int32_t pos = 0;
    while (pos + stride <= limit) {
        int8_t *lhs = vec1 + pos;
        int8_t *rhs = vec2 + pos;

        // Load four full vectors from each input.
        svint8_t a0 = svld1_s8(all_active, lhs);
        svint8_t b0 = svld1_s8(all_active, rhs);
        svint8_t a1 = svld1_s8(all_active, lhs + lanes);
        svint8_t b1 = svld1_s8(all_active, rhs + lanes);
        svint8_t a2 = svld1_s8(all_active, lhs + 2 * lanes);
        svint8_t b2 = svld1_s8(all_active, rhs + 2 * lanes);
        svint8_t a3 = svld1_s8(all_active, lhs + 3 * lanes);
        svint8_t b3 = svld1_s8(all_active, rhs + 3 * lanes);

        // Dot product using the SDOT instruction on Z vectors.
        sum0 = svdot_s32(sum0, a0, b0);
        sum1 = svdot_s32(sum1, a1, b1);
        sum2 = svdot_s32(sum2, a2, b2);
        sum3 = svdot_s32(sum3, a3, b3);

        pos += stride;
    }

    // Add corresponding elements of the four accumulators together.
    sum0 = svadd_s32_x(all_active, sum0, sum1);
    sum2 = svadd_s32_x(all_active, sum2, sum3);
    sum0 = svadd_s32_x(all_active, sum0, sum2);

    // REDUCE: add every vector element and write the result to a scalar.
    int32_t result = svaddv_s32(all_active, sum0);

    // Scalar tail for the elements the unrolled loop did not cover. TODO: Use FMA
    while (pos < limit) {
        result += vec1[pos] * vec2[pos];
        pos++;
    }
    return result;
}
#endif

// https://developer.arm.com/architectures/instruction-sets/intrinsics/

/*
 * Adds the products of the 16 signed byte pairs in a and b into the four
 * 32-bit lanes of acc and returns the updated accumulator.
 *
 * When the compiler targets the Armv8.2 dot-product extension this is a single
 * SDOT instruction. Otherwise (older CPUs, or toolchains such as GCC 7.3 that
 * do not define vdotq_s32) it falls back to widening multiplies followed by
 * pairwise add-accumulate. The fallback distributes the partial sums across
 * lanes differently than SDOT, but the sum over all four lanes - the only value
 * the caller uses - is identical. int8 * int8 always fits in int16, so the
 * widening multiply cannot overflow.
 */
static inline int32x4_t vdot8s_neon_accumulate(int32x4_t acc, int8x16_t a, int8x16_t b) {
#if defined(__ARM_FEATURE_DOTPROD)
    return vdotq_s32(acc, a, b);
#else
    int16x8_t lo = vmull_s8(vget_low_s8(a), vget_low_s8(b));
    int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    acc = vpadalq_s16(acc, lo);
    return vpadalq_s16(acc, hi);
#endif
}

/*
 * Unrolled and vectorized int8 dot product implemented with NEON intrinsics.
 *
 * vec1, vec2: input arrays holding at least `limit` signed bytes each.
 * limit:      number of elements to multiply and sum.
 * Returns the sum of vec1[k] * vec2[k] for k in [0, limit).
 */
int32_t vdot8s_neon(int8_t* vec1, int8_t* vec2, int32_t limit) {
    int32_t result = 0;
    int32x4_t acc1 = vdupq_n_s32(0);
    int32x4_t acc2 = vdupq_n_s32(0);
    int32x4_t acc3 = vdupq_n_s32(0);
    int32x4_t acc4 = vdupq_n_s32(0);
    int32_t i = 0;
    int8x16_t va1, va2, va3, va4;
    int8x16_t vb1, vb2, vb3, vb4;

    // Main loop: 4 x 16 = 64 bytes per iteration, one accumulator per 16-byte group.
    for (; i + 64 <= limit; i += 64) {
        // Read into 8 (bit) x 16 (values) vectors
        va1 = vld1q_s8(vec1 + i);
        vb1 = vld1q_s8(vec2 + i);

        va2 = vld1q_s8(vec1 + i + 16);
        vb2 = vld1q_s8(vec2 + i + 16);

        va3 = vld1q_s8(vec1 + i + 32);
        vb3 = vld1q_s8(vec2 + i + 32);

        va4 = vld1q_s8(vec1 + i + 48);
        vb4 = vld1q_s8(vec2 + i + 48);

        // Dot product (SDOT when available, widening-multiply fallback otherwise)
        acc1 = vdot8s_neon_accumulate(acc1, va1, vb1);
        acc2 = vdot8s_neon_accumulate(acc2, va2, vb2);
        acc3 = vdot8s_neon_accumulate(acc3, va3, vb3);
        acc4 = vdot8s_neon_accumulate(acc4, va4, vb4);
    }
    // Add corresponding elements of the four accumulators
    acc1 = vaddq_s32(acc1, acc2);
    acc3 = vaddq_s32(acc3, acc4);
    acc1 = vaddq_s32(acc1, acc3);

    // REDUCE: add every vector element and write the result to a scalar
    result += vaddvq_s32(acc1);

    // Scalar tail for the (at most 63) remaining elements. TODO: Use FMA
    for (; i < limit; i++) {
        result += vec1[i] * vec2[i];
    }
    return result;
}

/*
 * Plain C int8 dot product with no explicit SIMD intrinsics.
 *
 * vec1, vec2: input arrays holding at least `limit` signed bytes each.
 * limit:      number of elements to multiply and sum.
 * Returns the sum of vec1[k] * vec2[k] for k in [0, limit); 0 when limit <= 0.
 */
int32_t dot8s(int8_t* vec1, int8_t* vec2, int32_t limit) {
    int32_t sum = 0;
    if (limit <= 0) {
        return sum;
    }
    // Clang-specific hint: vectorize and unroll this loop, assuming the
    // iterations have no memory dependences on each other.
    #pragma clang loop vectorize(assume_safety) unroll(enable)
    for (int32_t idx = 0; idx < limit; ++idx) {
        sum += vec1[idx] * vec2[idx];
    }
    return sum;
}

/*
int main(int argc, const char* arrgs[]) {
int8_t a[128];
int8_t b[128];
for (int i =0; i < 128; i++) {
a[i] = 2;
b[i] = 3;
}
printf("Sum (Vectorized - SVE) = %d\n", vdot8s_sve(a, b, 128));
printf("Sum (Vectorized - NEON) = %d\n", vdot8s_neon(a, b, 128));
printf("Sum (Scalar) = %d\n", dot8s(a, b, 128));
}*/

4 changes: 4 additions & 0 deletions lucene/core/src/c/dotProduct.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

#ifndef LUCENE_DOT_PRODUCT_H
#define LUCENE_DOT_PRODUCT_H

#include <stdint.h>

/*
 * int8 dot-product kernels implemented in dotProduct.c.
 *
 * Each function takes two arrays of at least `limit` signed bytes and returns
 * the sum of vec1[k] * vec2[k] for k in [0, limit). The parameter types are
 * plain int8_t pointers so that they match the definitions in dotProduct.c.
 */

/* SVE implementation. Only defined when dotProduct.c is compiled with SVE
 * support (__ARM_FEATURE_SVE) on a non-Apple target. */
int32_t vdot8s_sve(int8_t* vec1, int8_t* vec2, int32_t limit);

/* NEON implementation. */
int32_t vdot8s_neon(int8_t* vec1, int8_t* vec2, int32_t limit);

/* Scalar implementation that relies on compiler auto-vectorization. */
int32_t dot8s(int8_t* vec1, int8_t* vec2, int32_t limit);

#endif /* LUCENE_DOT_PRODUCT_H */
Loading

0 comments on commit e45224b

Please sign in to comment.