Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PSQL support #3

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
This is a fork of Edward Raff's [LZJD](https://github.com/EdwardRaff/LZJD). The added feature is a plugin for [Postgres](https://www.postgresql.org/) to allow similarity matching in SQL queries.

### Requirements:
* GCC
* CMake
* Boost libraries
* Postgres requirements:
* Server development package (for example, `postgresql-server-dev-12`).

### Compile instructions:
* Check out the code
* `cd src`
* `mkdir build && cd build`
* `cmake ..`
* `make`

This creates:
* `liblzjd.shared.so`: Shared library for linking against.
* `liblzjd.static.a`: Static library for building against.
* `lzjd`: Command line application.
* If the Postgres server development package was installed, you should also have:
* `lzjd_psql.so`: PostgreSQL plugin


### Installation instructions:
* Copy `lzjd` to `/usr/local/bin/` or similar, if desired.
* Copy `liblzjd.shared.so` to `/usr/local/lib/liblzjd.so`, or similar, if desired. Note the file name change.
* Copy `liblzjd.static.a` to `/usr/local/lib/liblzjd.a`, or similar, if desired. Note the file name change here too.
* For the Postgres plugin:
* Run the command `pg_config --pkglibdir`; its output is the installation directory.
* Copy `lzjd_psql.so` to the directory shown in the above command. Should be `/usr/lib/postgresql/XX/lib/` where XX is the version number.
* As the administrative user for your Postgres environment, run this SQL command to load the plugin: `CREATE OR REPLACE FUNCTION lzjd_compare(TEXT, TEXT) RETURNS INTEGER AS 'lzjd_psql.so', 'pg_lzjd_compare' LANGUAGE 'c';`. No restart required, the new function `lzjd_compare()` is available.
17 changes: 16 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
project(lzjd CXX)
project(lzjd)
cmake_minimum_required(VERSION 2.8)
FIND_PACKAGE( Boost 1.50 COMPONENTS program_options system filesystem REQUIRED )
INCLUDE_DIRECTORIES( ${Boost_INCLUDE_DIR} )
Expand All @@ -13,5 +13,20 @@ target_link_libraries(lzjd.shared)

add_library(lzjd.static STATIC ../src/LZJD.cpp ../src/MurmurHash3.cpp)

# Ask pg_config where the PostgreSQL headers live. The trailing newline that
# pg_config prints is stripped so the captured value is a usable directory path.
execute_process(COMMAND pg_config --includedir-server
                RESULT_VARIABLE PG_INCLUDESRV_RESULT
                OUTPUT_VARIABLE PG_INCLUDESRV_DIR
                OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND pg_config --includedir
                RESULT_VARIABLE PG_INCLUDE_RESULT
                OUTPUT_VARIABLE PG_INCLUDE_DIR
                OUTPUT_STRIP_TRAILING_WHITESPACE)

# Build the PostgreSQL plugin only when pg_config ran successfully.
# The variable is referenced by name (not expanded with ${}) because, when
# pg_config cannot be started, the result is an error string containing spaces
# and an expanded form would make this if() a configure-time error instead of
# simply skipping the optional plugin.
if(PG_INCLUDESRV_RESULT EQUAL 0)
    MESSAGE( STATUS "PG Server Include: ${PG_INCLUDESRV_DIR}" )
    MESSAGE( STATUS "PG Include: ${PG_INCLUDE_DIR}" )

    add_library(lzjd_psql SHARED ../src/pglzjd.c ../src/pg_lzjd_helper.cpp ../src/LZJD.cpp ../src/MurmurHash3.cpp)
    target_link_libraries(lzjd_psql ${Boost_LIBRARIES})
    target_include_directories(lzjd_psql PRIVATE ${PG_INCLUDESRV_DIR})
    #target_include_directories(lzjd_psql.so PRIVATE ${PG_INCLUDE_DIR})
    # Drop the "lib" prefix so the output file is named lzjd_psql.so.
    set_target_properties(lzjd_psql PROPERTIES PREFIX "")
    set_target_properties(lzjd_psql PROPERTIES OUTPUT_NAME "lzjd_psql")
endif()

set(CMAKE_BUILD_TYPE Release)

10 changes: 9 additions & 1 deletion src/LZJD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cstdint>
#include <algorithm>
#include <mutex> // std::call_once, std::once_flag

#include <cmath> // round()
#include <boost/function_output_iterator.hpp>


Expand All @@ -17,6 +17,10 @@

using namespace std;

#ifdef __cplusplus
extern "C" {
#endif

// Default constructor: the body is empty, so it performs no work of its own.
LZJD::LZJD() {
}

Expand Down Expand Up @@ -131,3 +135,7 @@ int32_t similarity(const std::vector<int32_t>& x_minset, const std::vector<int32
double sim = same / (double) (x_minset.size() + y_minset.size() - same);
return (int) (round(100*sim));
}

#ifdef __cplusplus
}
#endif
8 changes: 8 additions & 0 deletions src/LZJD.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@

using namespace std;

#ifdef __cplusplus
extern "C" {
#endif

class LZJD
{
public:
Expand All @@ -23,4 +27,8 @@ std::vector<int32_t> digest(uint64_t k, std::vector<char>& bytes);

int32_t similarity(const std::vector<int32_t>& x_minset, const std::vector<int32_t>& y_minset);

#ifdef __cplusplus
}
#endif

#endif
7 changes: 7 additions & 0 deletions src/MurmurHash3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include "MurmurHash3.h"
using namespace std;

#ifdef __cplusplus
extern "C" {
#endif

void MurmurHash3::reset() {
_len = 0;
_h1 = _seed;
Expand Down Expand Up @@ -65,3 +69,6 @@ MurmurHash3::MurmurHash3(int32_t _seed) {
this->reset();
}

#ifdef __cplusplus
}
#endif
8 changes: 7 additions & 1 deletion src/MurmurHash3.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#include <string>
#include <cstdint>


#ifdef __cplusplus
extern "C" {
#endif

#define FORCE_INLINE inline __attribute__((always_inline))

Expand Down Expand Up @@ -68,4 +70,8 @@ class MurmurHash3

};

#ifdef __cplusplus
}
#endif

#endif
68 changes: 68 additions & 0 deletions src/pg_lzjd_helper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <cstdlib>
#include <cstdint>
#include <string>
#include <x86intrin.h>
#include <boost/archive/iterators/base64_from_binary.hpp>
#include <boost/archive/iterators/binary_from_base64.hpp>
#include <boost/archive/iterators/transform_width.hpp>
#include <boost/archive/iterators/ostream_iterator.hpp>
#include <boost/archive/iterators/remove_whitespace.hpp>

#include "LZJD.h"

using namespace std;
namespace bi = boost::archive::iterators;

extern "C" {

/**
 * Parses a textual LZJD digest into its list of 32-bit hash values.
 *
 * The text is split on its first two ':' characters. Everything after the
 * second colon is treated as base64: trailing '=' padding is removed, the
 * remainder is decoded to bytes, and the bytes are combined four at a time
 * (big-endian) into int32_t values.
 *
 * @param hash NUL-terminated digest text. A null pointer yields an empty
 *             result instead of being handed to the std::string constructor.
 * @return The decoded values. A trailing group of fewer than four bytes is
 *         ignored.
 *
 * NOTE(review): the boost decoding iterators presumably throw on characters
 * outside the base64 alphabet -- confirm; nothing here catches that, so any
 * such exception propagates to the caller.
 */
vector<int32_t> cstring_to_lzjd(char* hash) {
    if (hash == nullptr)
        return vector<int32_t>();

    const string line = hash;

    // When a colon is missing, find() returns npos and npos + 1 wraps to 0,
    // so the following search / substring simply starts at the beginning.
    const auto first_colon = line.find(":", 0);
    const auto second_colon = line.find(":", first_colon + 1);
    string base64ints = line.substr(second_colon + 1);

    // Strip trailing '=' padding before decoding. find_last_not_of() returns
    // npos for an empty or all-padding string, and npos + 1 == 0 erases it all.
    base64ints.erase(base64ints.find_last_not_of('=') + 1);

    //TODO this is not 100% kosher, but C++ is a pain.
    typedef
        bi::transform_width<
            bi::binary_from_base64<bi::remove_whitespace<string::const_iterator>>,
            8, 6
        >
        base64_dec;

    vector<uint8_t> int_parts;
    copy(
        base64_dec(base64ints.cbegin()),
        base64_dec(base64ints.cend()),
        std::back_inserter(int_parts)
    );

    // Only complete 4-byte groups are converted, so neither int_parts nor
    // decoded_ints is ever indexed past its end.
    const size_t whole_ints = int_parts.size() / 4;
    vector<int32_t> decoded_ints(whole_ints);
    for (size_t k = 0; k < whole_ints; ++k) {
        const size_t base = 4 * k;
        // Big-endian assembly, done in unsigned arithmetic so that a set top
        // bit never shifts into the sign bit of a promoted int.
        const uint32_t word =
            (static_cast<uint32_t>(int_parts[base + 0]) << 24) |
            (static_cast<uint32_t>(int_parts[base + 1]) << 16) |
            (static_cast<uint32_t>(int_parts[base + 2]) << 8) |
            (static_cast<uint32_t>(int_parts[base + 3]) << 0);
        decoded_ints[k] = static_cast<int32_t>(word);
    }
    return decoded_ints;
}

int32_t lzjd_similarity(char *hash1, char *hash2) {
try {
vector<int32_t> l1 = cstring_to_lzjd(hash1);
vector<int32_t> l2 = cstring_to_lzjd(hash2);
return similarity(l1, l2);
} catch(...) {
return 0;
}
return 0;
}

} // End Extern C
14 changes: 14 additions & 0 deletions src/pg_lzjd_helper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef PGFIX_H
#define PGFIX_H

// int32_t appears in the prototype below; include its definition here so the
// header compiles on its own, regardless of what the includer pulled in first.
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Compares two textual LZJD digests and returns their similarity score.
// Declared with C linkage so it can be called from C sources.
int32_t lzjd_similarity(char *hash1, char *hash2);

#ifdef __cplusplus
}
#endif

#endif
32 changes: 32 additions & 0 deletions src/pglzjd.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// PostgreSQL includes
#include <postgres.h>
#include <utils/builtins.h> // in postgresql server includes, for text_to_cstring()

// Project includes
#include "pg_lzjd_helper.h"

PG_MODULE_MAGIC;

//
// CREATE OR REPLACE FUNCTION lzjd_compare(TEXT, TEXT) RETURNS INTEGER AS 'lzjd_psql.so', 'pg_lzjd_compare' LANGUAGE 'c';
//

PG_FUNCTION_INFO_V1(pg_lzjd_compare);
Datum pg_lzjd_compare(PG_FUNCTION_ARGS);

//
// SQL-callable entry point: lzjd_compare(TEXT, TEXT) RETURNS INTEGER.
//
// Converts both TEXT arguments to C strings, scores them with
// lzjd_similarity() and returns the score as an int32.
// If either argument is SQL NULL the function returns 0.
//
Datum pg_lzjd_compare(PG_FUNCTION_ARGS) {
    text *arg1;
    text *arg2;
    char *hash1;
    char *hash2;
    int32 score;

    if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) {
        PG_RETURN_INT32(0);
    }

    // The _PP variant accepts the value in its packed form and so avoids the
    // extra copy that PG_GETARG_TEXT_P can make; text_to_cstring() handles
    // packed values.
    arg1 = PG_GETARG_TEXT_PP(0);
    arg2 = PG_GETARG_TEXT_PP(1);
    hash1 = text_to_cstring(arg1);
    hash2 = text_to_cstring(arg2);

    score = lzjd_similarity(hash1, hash2);

    // Release the C-string copies once the score has been computed.
    pfree(hash1);
    pfree(hash2);

    PG_RETURN_INT32(score);
}
1 change: 1 addition & 0 deletions src/pglzjd.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Registers the C function pg_lzjd_compare from lzjd_psql.so as the SQL function lzjd_compare(TEXT, TEXT) returning INTEGER.
CREATE OR REPLACE FUNCTION lzjd_compare(TEXT, TEXT) RETURNS INTEGER AS 'lzjd_psql.so', 'pg_lzjd_compare' LANGUAGE 'c';