Skip to content

Commit

Permalink
Removed intrinsics.
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzak committed Aug 16, 2020
1 parent f44a092 commit 08bbb15
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 67 deletions.
70 changes: 5 additions & 65 deletions LZJD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,9 @@
#include <algorithm>
#include <mutex> // std::call_once, std::once_flag
#include <math.h> // round()
#include <boost/function_output_iterator.hpp>


#include <x86intrin.h>
#include <stdint.h>


#include "LZJD.h"
#include "MurmurHash3.h"

using namespace std;

#ifdef __cplusplus
Expand Down Expand Up @@ -71,67 +64,14 @@ std::vector<int32_t> digest(uint64_t k, std::vector<char>& bytes)
return ints;
}


//Faster vectorized list intersection taken from https://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
//Needed b/c C++ compiler output was over 5x slower than Java at this simple task!
//Surprisingly, still more than 2x slower than the Java version! Go JIT go!
/**
 * Counts the values that occur in both A and B.
 *
 * Both arrays are expected to be sorted in ascending order and free of
 * duplicates; the block-wise comparison below relies on that ordering to
 * decide which side to advance.
 *
 * Only SSE2 instructions are used, and the loads are unaligned, so the
 * buffers do not need any particular alignment.
 *
 * @param A   first sorted array
 * @param B   second sorted array
 * @param s_a number of elements in A
 * @param s_b number of elements in B
 * @return number of elements common to A and B
 */
size_t intersect_vector(int32_t *A, int32_t *B, size_t s_a, size_t s_b)
{
    size_t count = 0;
    size_t i_a = 0, i_b = 0;

    // Only whole groups of four elements are handled by the SIMD loop.
    const size_t st_a = (s_a / 4) * 4;
    const size_t st_b = (s_b / 4) * 4;

    // Rotates the four 32-bit lanes by one position.
    const int cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1);

    while (i_a < st_a && i_b < st_b)
    {
        // Load one group of four 32-bit elements from each side.
        const __m128i v_a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&A[i_a]));
        __m128i v_b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&B[i_b]));

        // Advance whichever group has the smaller maximum (both on a tie).
        // The maxima are read before the indices move.
        const int32_t a_max = A[i_a + 3];
        const int32_t b_max = B[i_b + 3];
        i_a += (a_max <= b_max) * 4;
        i_b += (a_max >= b_max) * 4;

        // Compare every lane of v_a against every lane of v_b by rotating
        // v_b three times, OR-ing the four equality masks together.
        __m128i cmp_mask = _mm_cmpeq_epi32(v_a, v_b);
        v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
        cmp_mask = _mm_or_si128(cmp_mask, _mm_cmpeq_epi32(v_a, v_b));
        v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
        cmp_mask = _mm_or_si128(cmp_mask, _mm_cmpeq_epi32(v_a, v_b));
        v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
        cmp_mask = _mm_or_si128(cmp_mask, _mm_cmpeq_epi32(v_a, v_b));

        // Collapse the 128-bit mask to one bit per lane; each set bit is a
        // lane of v_a that found a partner in v_b.
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask));
        count += static_cast<size_t>(__builtin_popcount(static_cast<unsigned>(mask)));
    }

    // Intersect whatever is left (fewer than four elements on at least one
    // side) with a plain scalar merge.
    while (i_a < s_a && i_b < s_b)
    {
        if (A[i_a] < B[i_b])
        {
            ++i_a;
        }
        else if (B[i_b] < A[i_a])
        {
            ++i_b;
        }
        else
        {
            ++count;
            ++i_a;
            ++i_b;
        }
    }

    return count;
}

/**
 * Jaccard similarity of two digests, scaled to a whole-number percentage.
 *
 * Both inputs must be sorted in ascending order; the shared elements are
 * counted with a single merge pass over the two vectors.
 *
 * @param x_minset first sorted digest
 * @param y_minset second sorted digest
 * @return round(100 * |X n Y| / |X u Y|), a value from 0 to 100. Two empty
 *         digests are reported as 100 (identical) rather than dividing
 *         zero by zero.
 */
int32_t similarity(const std::vector<int32_t>& x_minset, const std::vector<int32_t>& y_minset)
{
    // Count common elements without materialising the intersection.
    size_t same = 0;
    auto x_it = x_minset.begin();
    auto y_it = y_minset.begin();
    while (x_it != x_minset.end() && y_it != y_minset.end())
    {
        if (*x_it < *y_it)
        {
            ++x_it;
        }
        else if (*y_it < *x_it)
        {
            ++y_it;
        }
        else
        {
            ++same;
            ++x_it;
            ++y_it;
        }
    }

    const size_t union_size = x_minset.size() + y_minset.size() - same;
    if (union_size == 0)
    {
        // Both digests are empty: avoid 0/0 (NaN), whose conversion to an
        // integer is undefined.
        return 100;
    }

    const double sim = same / static_cast<double>(union_size);
    return static_cast<int32_t>(round(100 * sim));
}
Expand Down
2 changes: 1 addition & 1 deletion lzjd.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package golzjd

// #cgo CXXFLAGS: -Wall -ggdb -std=c++11 -msse4.2 -I .
// #cgo CXXFLAGS: -Wall -std=c++11 -I .
// #cgo LDFLAGS: -lstdc++
/*
#include "lzjd_helper.h"
Expand Down
1 change: 0 additions & 1 deletion lzjd_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include <cstdint>
#include <string>
#include <cstring>
#include <x86intrin.h>
#include <boost/archive/iterators/base64_from_binary.hpp>
#include <boost/archive/iterators/binary_from_base64.hpp>
#include <boost/archive/iterators/transform_width.hpp>
Expand Down

0 comments on commit 08bbb15

Please sign in to comment.