diff --git a/.github/workflows/python-ci-single.yml b/.github/workflows/python-ci-single.yml index af7e3a9e22..4803dfe260 100644 --- a/.github/workflows/python-ci-single.yml +++ b/.github/workflows/python-ci-single.yml @@ -53,7 +53,7 @@ jobs: - name: Check C++ Format shell: bash - run: ./scripts/run-clang-format.sh . clang-format 0 $(find libtiledbsoma -name "*.cc" -or -name "*.h") + run: ./scripts/run-clang-format.sh . clang-format 0 $(find libtiledbsoma -name "*.cc" -or -name "*.h" | grep -v external) build: runs-on: ${{ inputs.os }} diff --git a/apis/python/src/tiledbsoma/_experiment.py b/apis/python/src/tiledbsoma/_experiment.py index d42bd386a9..25c31e28d5 100644 --- a/apis/python/src/tiledbsoma/_experiment.py +++ b/apis/python/src/tiledbsoma/_experiment.py @@ -5,16 +5,18 @@ """Implementation of a SOMA Experiment. """ +import functools +from typing import Any, Optional -from typing import Any - -from somacore import experiment +from somacore import experiment, query +from typing_extensions import Self from ._collection import Collection, CollectionBase from ._dataframe import DataFrame from ._measurement import Measurement from ._tdb_handles import Wrapper from ._tiledb_object import AnyTileDBObject +from .utils import build_index class Experiment( # type: ignore[misc] # __eq__ false positive @@ -74,3 +76,23 @@ def _set_create_metadata(cls, handle: Wrapper[Any]) -> None: # TileDB Cloud UI to detect that they are SOMA datasets. handle.metadata["dataset_type"] = "soma" return super()._set_create_metadata(handle) + + def axis_query( # type: ignore + self, + measurement_name: str, + *, + obs_query: Optional[query.AxisQuery] = None, + var_query: Optional[query.AxisQuery] = None, + ) -> query.ExperimentAxisQuery[Self]: # type: ignore + """Creates an axis query over this experiment. + Lifecycle: maturing + """ + # mypy doesn't quite understand descriptors so it issues a spurious + # error here. + return query.ExperimentAxisQuery( # type: ignore + self, + measurement_name, + obs_query=obs_query or query.AxisQuery(), + var_query=var_query or query.AxisQuery(), + index_factory=functools.partial(build_index, context=self.context), + ) diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index bd41c0b414..9d25d018aa 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -37,6 +37,7 @@ #include #include +#include #include "query_condition.cc" @@ -693,5 +694,63 @@ PYBIND11_MODULE(pytiledbsoma, m) { .def("get_enum_is_ordered", get_enum_is_ordered) .def("get_enum_label_on_attr", &SOMAArray::get_enum_label_on_attr); + // Efficient C++ re-indexing (aka hashing unique key values to an index + // between 0 and number of keys - 1) based on khash + py::class_(m, "IntIndexer") + .def(py::init<>()) + .def(py::init&, int>()) + .def( + "map_locations", + [](IntIndexer& indexer, + py::array_t keys, + int num_threads) { + auto buffer = keys.request(); + int64_t* data = static_cast(buffer.ptr); + size_t length = buffer.shape[0]; + indexer.map_locations(keys.data(), keys.size(), num_threads); + }) + .def( + "map_locations", + [](IntIndexer& indexer, + std::vector keys, + int num_threads) { + indexer.map_locations(keys.data(), keys.size(), num_threads); + }) + // Perform lookup for a large input array of keys and return the looked + // up value array (passing ownership from C++ to python) + .def( + "get_indexer", + [](IntIndexer& indexer, py::array_t lookups) { + auto input_buffer = lookups.request(); + int64_t* input_ptr = static_cast(input_buffer.ptr); + size_t size = input_buffer.shape[0]; + auto results = py::array_t(size); + auto results_buffer = results.request(); + size_t results_size = results_buffer.shape[0]; + + int64_t* results_ptr = static_cast( + results_buffer.ptr); + + indexer.lookup(input_ptr, results_ptr, size); + return results; + }) + // Perform lookup for a large input array of keys and writes the looked + // up values into previously allocated array (works for the cases in + // which python and R pre-allocate the array) + .def( + "get_indexer", + [](IntIndexer& indexer, + py::array_t lookups, + py::array_t& results) { + auto input_buffer = lookups.request(); + int64_t* input_ptr = static_cast(input_buffer.ptr); + size_t size = input_buffer.shape[0]; + + auto results_buffer = results.request(); + int64_t* results_ptr = static_cast( + results_buffer.ptr); + size_t results_size = input_buffer.shape[0]; + indexer.lookup(input_ptr, input_ptr, size); + }); } } // namespace tiledbsoma diff --git a/apis/python/src/tiledbsoma/utils.py b/apis/python/src/tiledbsoma/utils.py new file mode 100644 index 0000000000..0901b8efe2 --- /dev/null +++ b/apis/python/src/tiledbsoma/utils.py @@ -0,0 +1,24 @@ +import numpy as np +import pandas as pd + +from tiledbsoma import pytiledbsoma as clib + +from .options import SOMATileDBContext + +"""Builds an indexer object compatible with :meth:`pd.Index.get_indexer`.""" + + +def build_index( + keys: np.typing.NDArray[np.int64], context: SOMATileDBContext +) -> clib.IntIndexer: + if len(np.unique(keys)) != len(keys): + raise pd.errors.InvalidIndexError( + "Reindexing only valid with uniquely valued Index objects" + ) + tdb_concurrency = int( + context.tiledb_ctx.config().get("sm.compute_concurrency_level", 10) + ) + thread_count = tdb_concurrency // 2 + reindexer = clib.IntIndexer() + reindexer.map_locations(keys, thread_count) + return reindexer diff --git a/apis/r/inst/include/tiledbsoma_types.h b/apis/r/inst/include/tiledbsoma_types.h index 973f567fe8..af27a1feb8 100644 --- a/apis/r/inst/include/tiledbsoma_types.h +++ b/apis/r/inst/include/tiledbsoma_types.h @@ -19,6 +19,7 @@ #include // for QueryCondition etc #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 #include +#include #include "rutilities.h" namespace tdbs = tiledbsoma; diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index f39d096623..ea62affc5f 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -50,6 +50,7 @@ message(STATUS "Building with commit hash ${BUILD_COMMIT_HASH}") # Common object library # ########################################################### add_library(TILEDB_SOMA_OBJECTS OBJECT + ${CMAKE_CURRENT_SOURCE_DIR}/reindexer/reindexer.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/managed_query.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_group.cc @@ -67,9 +68,8 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.cc - # TODO: uncomment when thread_pool is enabled - # ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/thread_pool.cc - # ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/status.cc + ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/thread_pool.cc + ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/status.cc ) message(STATUS "Building TileDB without deprecation warnings") @@ -87,6 +87,9 @@ set_property(TARGET TILEDB_SOMA_OBJECTS PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(TILEDB_SOMA_OBJECTS PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/vendor + ${CMAKE_CURRENT_SOURCE_DIR}/soma + ${CMAKE_CURRENT_SOURCE_DIR}/external/khash ${CMAKE_CURRENT_SOURCE_DIR}/external/include $ $ @@ -197,11 +200,16 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.h DESTINATION "include/tiledbsoma/soma" ) -install(FILES +install(FILES + ${CMAKE_CURRENT_SOURCE_DIR}/reindexer/reindexer.h + DESTINATION "include/tiledbsoma/reindexer/" +) + +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/tiledbsoma/tiledbsoma DESTINATION "include/tiledbsoma" ) @@ -213,10 +221,11 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.h + DESTINATION "include/tiledbsoma/utils" ) -install(FILES +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/external/include/span/span.hpp DESTINATION "include/tiledbsoma/soma/span" ) @@ -294,6 +303,7 @@ if(TILEDBSOMA_BUILD_CLI) PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/external/include + ${CMAKE_CURRENT_SOURCE_DIR}/external/khash ${TILEDB_SOMA_EXPORT_HEADER_DIR} ${pybind11_INCLUDE_DIRS} ) diff --git a/libtiledbsoma/src/external/include/thread_pool/producer_consumer_queue.h b/libtiledbsoma/src/external/include/thread_pool/producer_consumer_queue.h index 89b2be3665..5b1dfdbc74 100644 --- a/libtiledbsoma/src/external/include/thread_pool/producer_consumer_queue.h +++ b/libtiledbsoma/src/external/include/thread_pool/producer_consumer_queue.h @@ -42,7 +42,7 @@ #include #include -#include "soma/logger_public.h" +#include namespace tiledbsoma { diff --git a/libtiledbsoma/src/external/khash/khash.h b/libtiledbsoma/src/external/khash/khash.h new file mode 100644 index 0000000000..f75f3474c1 --- /dev/null +++ b/libtiledbsoma/src/external/khash/khash.h @@ -0,0 +1,627 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/libtiledbsoma/src/external/khash/khashl.h b/libtiledbsoma/src/external/khash/khashl.h new file mode 100644 index 0000000000..93ce31354c --- /dev/null +++ b/libtiledbsoma/src/external/khash/khashl.h @@ -0,0 +1,351 @@ +/* The MIT License + + Copyright (c) 2019 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef __AC_KHASHL_H +#define __AC_KHASHL_H + +#define AC_VERSION_KHASHL_H "0.1" + +#include +#include +#include + +/************************************ + * Compiler specific configurations * + ************************************/ + +#if UINT_MAX == 0xffffffffu +typedef int32_t khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef int32_t khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef int64_t khint64_t; +#else +typedef int64_t khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define KH_LOCAL static kh_inline klib_unused + +typedef khint32_t khint_t; + +/****************** + * malloc aliases * + ******************/ + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +/**************************** + * Simple private functions * + ****************************/ + +#define __kh_used(flag, i) (flag[i>>5] >> (i&0x1fU) & 1U) +#define __kh_set_used(flag, i) (flag[i>>5] |= 1U<<(i&0x1fU)) +#define __kh_set_unused(flag, i) (flag[i>>5] &= ~(1U<<(i&0x1fU))) + +#define __kh_fsize(m) ((m) < 32? 1 : (m)>>5) + +static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); } + +/******************* + * Hash table base * + *******************/ + +#define __KHASHL_TYPE(HType, khkey_t) \ + typedef struct HType { \ + khint_t bits, count; \ + khint32_t *used; \ + khkey_t *keys; \ + } HType; + +#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \ + extern HType *prefix##_init(void); \ + extern void prefix##_destroy(HType *h); \ + extern void prefix##_clear(HType *h); \ + extern khint_t prefix##_getp(const HType *h, const khkey_t *key); \ + extern int prefix##_resize(HType *h, khint_t new_n_buckets); \ + extern khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent); \ + extern void prefix##_del(HType *h, khint_t k); + +#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + SCOPE HType *prefix##_init(void) { \ + return (HType*)kcalloc(1, sizeof(HType)); \ + } \ + SCOPE void prefix##_destroy(HType *h) { \ + if (!h) return; \ + kfree((void *)h->keys); kfree(h->used); \ + kfree(h); \ + } \ + SCOPE void prefix##_clear(HType *h) { \ + if (h && h->used) { \ + uint32_t n_buckets = 1U << h->bits; \ + memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \ + h->count = 0; \ + } \ + } + +#define __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE khint_t prefix##_getp(const HType *h, const khkey_t *key) { \ + khint_t i, last, n_buckets, mask; \ + if (h->keys == 0) return 0; \ + n_buckets = 1U << h->bits; \ + mask = n_buckets - 1U; \ + i = last = __kh_h2b(__hash_fn(*key), h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) return n_buckets; \ + } \ + return !__kh_used(h->used, i)? n_buckets : i; \ + } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { return prefix##_getp(h, &key); } + +#define __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE int prefix##_resize(HType *h, khint_t new_n_buckets) { \ + khint32_t *new_used = 0; \ + khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \ + while ((x >>= 1) != 0) ++j; \ + if (new_n_buckets & (new_n_buckets - 1)) ++j; \ + new_bits = j > 2? j : 2; \ + new_n_buckets = 1U << new_bits; \ + if (h->count > (new_n_buckets>>1) + (new_n_buckets>>2)) return 0; /* requested size is too small */ \ + new_used = (khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_used, 0, __kh_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_used) return -1; /* not enough memory */ \ + n_buckets = h->keys? 1U<bits : 0U; \ + if (n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void*)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_used); return -1; } \ + h->keys = new_keys; \ + } /* otherwise shrink */ \ + new_mask = new_n_buckets - 1; \ + for (j = 0; j != n_buckets; ++j) { \ + khkey_t key; \ + if (!__kh_used(h->used, j)) continue; \ + key = h->keys[j]; \ + __kh_set_unused(h->used, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t i; \ + i = __kh_h2b(__hash_fn(key), new_bits); \ + while (__kh_used(new_used, i)) i = (i + 1) & new_mask; \ + __kh_set_used(new_used, i); \ + if (i < n_buckets && __kh_used(h->used, i)) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + __kh_set_unused(h->used, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + break; \ + } \ + } \ + } \ + if (n_buckets > new_n_buckets) /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + kfree(h->used); /* free the working space */ \ + h->used = new_used, h->bits = new_bits; \ + return 0; \ + } + +#define __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent) { \ + khint_t n_buckets, i, last, mask; \ + n_buckets = h->keys? 1U<bits : 0U; \ + *absent = -1; \ + if (h->count >= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ \ + if (prefix##_resize(h, n_buckets + 1U) < 0) \ + return n_buckets; \ + n_buckets = 1U<bits; \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + mask = n_buckets - 1; \ + i = last = __kh_h2b(__hash_fn(*key), h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) break; \ + } \ + if (!__kh_used(h->used, i)) { /* not present at all */ \ + h->keys[i] = *key; \ + __kh_set_used(h->used, i); \ + ++h->count; \ + *absent = 1; \ + } else *absent = 0; /* Don't touch h->keys[i] if present */ \ + return i; \ + } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { return prefix##_putp(h, &key, absent); } + +#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \ + SCOPE int prefix##_del(HType *h, khint_t i) { \ + khint_t j = i, k, mask, n_buckets; \ + if (h->keys == 0) return 0; \ + n_buckets = 1U<bits; \ + mask = n_buckets - 1U; \ + while (1) { \ + j = (j + 1U) & mask; \ + if (j == i || !__kh_used(h->used, j)) break; /* j==i only when the table is completely full */ \ + k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \ + if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \ + h->keys[i] = h->keys[j], i = j; \ + } \ + __kh_set_unused(h->used, i); \ + --h->count; \ + return 1; \ + } + +#define KHASHL_DECLARE(HType, prefix, khkey_t) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_PROTOTYPES(HType, prefix, khkey_t) + +#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) + +/***************************** + * More convenient interface * + *****************************/ + +#define __kh_packed __attribute__ ((__packed__)) +#define __kh_cached_hash(x) ((x).hash) + +#define KHASHL_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; } __kh_packed HType##_s_bucket_t; \ + static kh_inline khint_t prefix##_s_hash(HType##_s_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_s_eq(HType##_s_bucket_t x, HType##_s_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_s, HType##_s_bucket_t, prefix##_s_hash, prefix##_s_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_s_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_s_destroy(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_s_resize(h, new_n_buckets); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_s_bucket_t t; t.key = key; return prefix##_s_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_s_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_s_bucket_t t; t.key = key; return prefix##_s_putp(h, &t, absent); } + +#define KHASHL_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \ + static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_m_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } + +#define KHASHL_CSET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; khint_t hash; } __kh_packed HType##_cs_bucket_t; \ + static kh_inline int prefix##_cs_eq(HType##_cs_bucket_t x, HType##_cs_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_cs, HType##_cs_bucket_t, __kh_cached_hash, prefix##_cs_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_cs_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_cs_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cs_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cs_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cs_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cs_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cs_putp(h, &t, absent); } + +#define KHASHL_CMAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; khint_t hash; } __kh_packed HType##_cm_bucket_t; \ + static kh_inline int prefix##_cm_eq(HType##_cm_bucket_t x, HType##_cm_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_cm, HType##_cm_bucket_t, __kh_cached_hash, prefix##_cm_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_cm_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_cm_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cm_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cm_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cm_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cm_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cm_putp(h, &t, absent); } + +/************************** + * Public macro functions * + **************************/ + +#define kh_bucket(h, x) ((h)->keys[x]) +#define kh_size(h) ((h)->count) +#define kh_capacity(h) ((h)->keys? 1U<<(h)->bits : 0U) +#define kh_end(h) kh_capacity(h) + +#define kh_key(h, x) ((h)->keys[x].key) +#define kh_val(h, x) ((h)->keys[x].val) +#define kh_exist(h, x) __kh_used((h)->used, (x)) + +/************************************** + * Common hash and equality functions * + **************************************/ + +#define kh_eq_generic(a, b) ((a) == (b)) +#define kh_eq_str(a, b) (strcmp((a), (b)) == 0) +#define kh_hash_dummy(x) ((khint_t)(x)) + +static kh_inline khint_t kh_hash_uint32(khint_t key) { + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +static kh_inline khint_t kh_hash_uint64(khint64_t key) { + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return (khint_t)key; +} + +static kh_inline khint_t kh_hash_str(const char *s) { + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} + +#endif /* __AC_KHASHL_H */ diff --git a/libtiledbsoma/src/reindexer/example.py b/libtiledbsoma/src/reindexer/example.py new file mode 100644 index 0000000000..3b2a8b9c8e --- /dev/null +++ b/libtiledbsoma/src/reindexer/example.py @@ -0,0 +1,22 @@ +from time import perf_counter + +import cellxgene_census + +import tiledbsoma as soma + + +def main(): + census_s3 = dict(census_version="latest") + t1 = perf_counter() + with cellxgene_census.open_soma(**census_s3) as census: + with census["census_data"]["homo_sapiens"].axis_query( + measurement_name="RNA", + obs_query=soma.AxisQuery(value_filter="tissue_general == 'eye'"), + ) as query: + query.to_anndata(X_name="raw") + t2 = perf_counter() + print(f"End to end time {t2 - t1}") + + +if __name__ == "__main__": + main() diff --git a/libtiledbsoma/src/reindexer/reindexer.cc b/libtiledbsoma/src/reindexer/reindexer.cc new file mode 100644 index 0000000000..68331f7587 --- /dev/null +++ b/libtiledbsoma/src/reindexer/reindexer.cc @@ -0,0 +1,156 @@ +/** + * @file reindexer.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the a pybind11 api in the SOMA C++ library. + */ + +#include "reindexer.h" +#include +#include +#include +#include "khash.h" +#include "soma/enums.h" +#include "soma/soma_array.h" +#include "utils/arrow_adapter.h" +#include "utils/common.h" +#include "utils/logger.h" + +// Typedef for a 64-bit khash table +KHASH_MAP_INIT_INT64(m64, int64_t) + +namespace tiledbsoma { + +void IntIndexer::map_locations(const int64_t* keys, int size, int threads) { + map_size_ = size; + if (size == 0) { + return; + } + if (size < 10) { + threads = 1; + } + if (size < threads) + throw std::runtime_error( + "The number of keys " + std::to_string(size) + + " must be larger than the number of threads " + + std::to_string(threads) + " ."); + + LOG_DEBUG(fmt::format( + "End of Map locations of size {} and {} threads", size, threads)); + + LOG_DEBUG(fmt::format( + "[Re-indexer] Start of Map locations with {} keys and {} threads", + size, + threads)); + hash_ = kh_init(m64); + kh_resize(m64, hash_, size * 1.25); + LOG_DEBUG( + fmt::format("[Re-indexer] Thread pool started and hash table created")); + int ret; + khint64_t k; + int64_t counter = 0; + // Hash map construction + for (int i = 0; i < size; i++) { + k = kh_put(m64, hash_, keys[i], &ret); + assert(k != kh_end(hash_)); + kh_val(hash_, k) = counter; + counter++; + } + auto hsize = kh_size(hash_); + LOG_DEBUG(fmt::format("[Re-indexer] khash size = {}", hsize)); + tiledb_thread_pool_ = std::make_unique(threads); +} + +void IntIndexer::lookup(const int64_t* keys, int64_t* results, int size) { + if (size == 0) { + return; + } + LOG_DEBUG(fmt::format( + "Lookup with thread concurrency {} on data size {}", + tiledb_thread_pool_->concurrency_level(), + size)); + if (tiledb_thread_pool_->concurrency_level() == 1) { + for (int i = 0; i < size; i++) { + auto k = kh_get(m64, hash_, keys[i]); + if (k == kh_end(hash_)) { + // According to pandas behavior + results[i] = -1; + } else { + results[i] = kh_val(hash_, k); + } + } + return; + } + + LOG_DEBUG(fmt::format("Creating tileDB tasks for the size of {}", size)); + std::vector tasks; + + size_t thread_chunk_size = size / tiledb_thread_pool_->concurrency_level(); + + for (size_t i = 0; i < size_t(size); i += thread_chunk_size) { + size_t start = i; + size_t end = i + thread_chunk_size; + if (end > size_t(size)) { + end = size; + } + LOG_DEBUG(fmt::format( + "Creating tileDB task for the range from {} to {} ", start, end)); + tiledbsoma::ThreadPool::Task task = tiledb_thread_pool_->execute( + [this, start, end, &results, &keys]() { + for (size_t i = start; i < end; i++) { + auto k = kh_get(m64, hash_, keys[i]); + if (k == kh_end(hash_)) { + // According to pandas behavior + results[i] = -1; + } else { + results[i] = kh_val(hash_, k); + } + } + return tiledbsoma::Status::Ok(); + }); + assert(task.valid()); + tasks.emplace_back(std::move(task)); + LOG_DEBUG(fmt::format( + "Task for the range from {} to {} inserted in the queue", + start, + end)); + } + tiledb_thread_pool_->wait_all(tasks); +} + +IntIndexer::~IntIndexer() { + if (map_size_ > 0) { + kh_destroy(m64, this->hash_); + } +} + +IntIndexer::IntIndexer(const int64_t* keys, int size, int threads) { + map_locations(keys, size, threads); +} + +} // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/reindexer/reindexer.h b/libtiledbsoma/src/reindexer/reindexer.h new file mode 100644 index 0000000000..a83305bcac --- /dev/null +++ b/libtiledbsoma/src/reindexer/reindexer.h @@ -0,0 +1,103 @@ +/** + * @file reindexer.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the a pybind11 api into SOMA C++ library. + */ + +#ifndef TILEDBSOMA_REINDEXER_H +#define TILEDBSOMA_REINDEXER_H + +#include +#include +#include +#include +#include + +struct kh_m64_s; + +namespace tiledbsoma { + +class ThreadPool; + +class IntIndexer { + public: + /** + * Perform intitalization of hash and threadpool + * @param keys pointer to key array of 64bit integers + * @param size yhr number of keys in the put + * @param threads number of threads in the thread pool + */ + void map_locations(const int64_t* keys, int size, int threads = 4); + void map_locations(const std::vector& keys, int threads = 4) { + map_locations(keys.data(), keys.size(), threads); + } + /** + * Used for parallel lookup using khash + * @param keys array of keys to lookup + * @param result array for lookup results + * @param size // Number of key array + * @return and array of looked up value (same size as keys) + */ + void lookup(const int64_t* keys, int64_t* results, int size); + void lookup( + const std::vector& keys, std::vector& results) { + if (keys.size() != results.size()) + throw std::runtime_error( + "The size of input and results arrays must be the same."); + + lookup(keys.data(), results.data(), keys.size()); + } + IntIndexer(){}; + /** + * Constructor with the same arguments as map_locations + */ + IntIndexer(const int64_t* keys, int size, int threads); + IntIndexer(const std::vector& keys, int threads) + : IntIndexer(keys.data(), keys.size(), threads) { + } + virtual ~IntIndexer(); + + private: + /* + * The created 64bit hash table + */ + kh_m64_s* hash_; + /* + * TileDB threadpool + */ + std::shared_ptr tiledb_thread_pool_ = nullptr; + /* + * Number of elements in the map set by map_locations + */ + int map_size_ = 0; +}; + +} // namespace tiledbsoma + +#endif // TILEDBSOMA_REINDEXER_H \ No newline at end of file diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index f0f7a6b995..b58a8952bc 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -35,6 +35,7 @@ add_executable(unit_soma unit_soma_dense_ndarray.cc unit_soma_sparse_ndarray.cc unit_soma_collection.cc + test_indexer.cc # TODO: uncomment when thread_pool is enabled # unit_thread_pool.cc ) diff --git a/libtiledbsoma/test/test_indexer.cc b/libtiledbsoma/test/test_indexer.cc new file mode 100644 index 0000000000..dc324eb1ee --- /dev/null +++ b/libtiledbsoma/test/test_indexer.cc @@ -0,0 +1,132 @@ +/** + * @file test_indexer.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages unit tests for re-indexer + */ + +#include +#include +#include +#include +#include +#include +#include +#include "external/khash/khash.h" + +namespace { + +/** + * @brief Create multiple 64 bit vector and try the reindexer compared to khash + * as baseline. + * + */ + +KHASH_MAP_INIT_INT64(m64, int64_t) + +bool run_test(std::vector keys, std::vector lookups) { + std::vector indexer_results; + indexer_results.resize(lookups.size()); + + tiledbsoma::IntIndexer indexer; + indexer.map_locations(keys); + auto* hash = kh_init(m64); + int ret; + khint64_t k; + + for (size_t i = 0; i < keys.size(); i++) { + k = kh_put(m64, hash, keys[i], &ret); + assert(k != kh_end(hash)); + kh_val(hash, k) = i; + } + + indexer.lookup(lookups, indexer_results); + std::vector kh_results; + kh_results.resize(lookups.size()); + for (size_t i = 0; i < lookups.size(); i++) { + auto k = kh_get(m64, hash, lookups[i]); + if (k == kh_end(hash)) { + // According to pandas behavior + kh_results[i] = -1; + } else { + kh_results[i] = kh_val(hash, k); + } + } + kh_destroy(m64, hash); + return indexer_results == kh_results; +} + +std::vector test_pass = {false, false, true, true, true}; +std::vector>> test_data = { + { + {"keys", {-1, -1, -1, 0, 0, 0}}, + {"lookups", + {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}}, + }, + {{"keys", + { + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + }}, + {"lookups", {-10000, 1, 2, 3, 5, 6}}}, + { + {"keys", {-1, 1, 2, 3, 4, 5}}, + {"lookups", + { + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + }}, + }, + { + {"keys", {-10000, -100000, 200000, 5, 1, 7}}, + {"lookups", + { + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + }}, + }, + { + {"keys", {-10000, -200000, 1000, 3000, 1, 2}}, + {"lookups", + { + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + -1, 1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5, + }}, + }}; + +TEST_CASE("C++ re-indexer") { + for (size_t test = 0; test < test_data.size(); test++) { + bool result = run_test( + test_data[test]["keys"], test_data[test]["lookups"]); + if (!result) { + throw std::runtime_error( + "Test " + std::to_string(test) + " failed"); + } + } +} +} // namespace diff --git a/libtiledbsoma/test/test_indexer.py b/libtiledbsoma/test/test_indexer.py new file mode 100644 index 0000000000..3bbd9ec520 --- /dev/null +++ b/libtiledbsoma/test/test_indexer.py @@ -0,0 +1,187 @@ +import numpy as np +import pandas as pd +import tiledb + +from tiledbsoma.options import SOMATileDBContext +from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context +from tiledbsoma.pytiledbsoma import config_logging +from tiledbsoma.utils import build_index + +config_logging("debug") + + +def indexer_test(keys: np.array, lookups: np.array, fail: bool): + if fail: + indexer_test_fail(keys, lookups) + else: + indexer_test_pass(keys, lookups) + + +def indexer_test_fail(keys: np.array, lookups: np.array): + try: + context = _validate_soma_tiledb_context(SOMATileDBContext(tiledb.default_ctx())) + index = build_index(keys, context) + index.get_indexer(lookups) + raise AssertionError("should have failed") + except pd.errors.InvalidIndexError: + pass + + try: + pd_index = pd.Index(keys) + pd_index.get_indexer(lookups) + raise AssertionError("should have failed") + except pd.errors.InvalidIndexError: + pass + + +def indexer_test_pass(keys: np.array, lookups: np.array): + context = _validate_soma_tiledb_context(SOMATileDBContext(tiledb.default_ctx())) + indexer = build_index(keys, context) + results = indexer.get_indexer(lookups) + panda_indexer = pd.Index(keys) + panda_results = panda_indexer.get_indexer(lookups) + assert np.equal(results.all(), panda_results.all()) + + +test_data = [ + { + "keys": [-1, -1, -1, 0, 0, 0], + "lookups": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5], + "pass": False, + }, + { + "keys": [ + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + ], + "lookups": [-10000, 1, 2, 3, 5, 6], + "pass": False, + }, + { + "keys": [-1, 1, 2, 3, 4, 5], + "lookups": [ + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + ], + "pass": True, + }, + { + "keys": [-10000, -100000, 200000, 5, 1, 7], + "lookups": [ + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + ], + "pass": True, + }, + { + "keys": [-10000, -200000, 1000, 3000, 1, 2], + "lookups": [ + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + -1, + 1, + 2, + 3, + 4, + 5, + ], + "pass": True, + }, + { + "keys": [i for i in range(1, 10000)], + "lookups": [i for i in range(1, 10)], + "pass": True, + }, + { + "keys": [1, 2, 3, 4, 5, 2**63 - 1], + "lookups": [i for i in range(2**63 - 1000, 2**63 - 1)], + "pass": True, + }, +] + + +def test_indexer(): + for data in test_data: + indexer_test(data["keys"], data["lookups"], not data["pass"])