Skip to content

Commit

Permalink
Move to minimalkv
Browse files Browse the repository at this point in the history
  • Loading branch information
xhochy committed Apr 21, 2021
1 parent d027ca2 commit 3d6f160
Show file tree
Hide file tree
Showing 56 changed files with 144 additions and 152 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
Changelog
=========

Kartothek 4.1.0 (2021-04-xx)
============================

* Switch from ``simplekv`` and ``storefact`` to their successor ``minimalkv`` as the library providing the store implementations.

Kartothek 4.0.2 (2021-04-xx)
============================
Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
import uuid
from functools import lru_cache

import minimalkv
import numpy as np
import pandas as pd
import pyarrow as pa
import storefact

from kartothek.core.index import ExplicitSecondaryIndex
from kartothek.io_components.metapartition import MetaPartition
Expand Down Expand Up @@ -44,7 +44,7 @@ def setup(self, number_values, number_partitions, dtype):
column=self.column_name, index_dct=index_dct, dtype=arrow_type
)
self.tmp_dir = tempfile.mkdtemp()
self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
self.store = minimalkv.get_store_from_url("hfs://{}".format(self.tmp_dir))
self.dataset_uuid = "some_uuid"
self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/predicate_pushdown.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from storefact import get_store_from_url
from minimalkv import get_store_from_url

from kartothek.serialization import ParquetSerializer
from kartothek.serialization.testing import get_dataframe_not_nested
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tempfile
import uuid

from storefact import get_store_from_url
from minimalkv import get_store_from_url

from kartothek.core.common_metadata import make_meta
from kartothek.core.testing import get_dataframe_alltypes
Expand Down
3 changes: 1 addition & 2 deletions conda-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
dask[dataframe]
decorator
minimalkv
msgpack-python>=0.5.2
# Currently dask and numpy==1.16.0 clash
numpy!=1.15.0,!=1.16.0
pandas>=0.23.0, !=1.0.0
pyarrow>=0.17.1,!=1.0.0, <4
simplejson
simplekv
storefact
toolz
typing_extensions # Some backports of the py3.8 typing module
urlquote>=1.1.3
Expand Down
4 changes: 1 addition & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,11 @@

intersphinx_mapping = {
"pandas": ("https://pandas.pydata.org/docs/", None),
"simplekv": ("https://simplekv.readthedocs.io/en/stable/", None),
"minimalkv": ("https://minimalkv.readthedocs.io/en/stable/", None),
"pyarrow": ("https://arrow.apache.org/docs/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"python": ("https://docs.python.org/3", None),
"dask": ("https://docs.dask.org/en/stable/", None),
# Storefact isn't exposing any sphinx refs
# "storefact": ("https://storefact.readthedocs.io/en/stable", None),
}

# In particular type annotations are rendered as its full path to the class but
Expand Down
3 changes: 1 addition & 2 deletions docs/environment-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@ channels:
dependencies:
- dask[dataframe]
- decorator
- minimalkv
- msgpack-python>=0.5.2
# Currently dask and numpy==1.16.0 clash
- numpy!=1.15.0,!=1.16.0
- pandas>=0.23.0, !=1.0.0
- pyarrow>=0.17.1,!=1.0.0, <4
- simplejson
- simplekv
- storefact
- toolz
- typing_extensions # Some backports of the py3.8 typing module
- urlquote>=1.1.3
Expand Down
4 changes: 2 additions & 2 deletions docs/guide/cube/command_line_features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Command Line Features
</style>

Kartothek Cube also features a command line interface (CLI) for some cube operations. To use it, create a ``skv.yml`` file that
describes `storefact`_ stores:
describes `minimalkv`_ stores:

.. code-block:: yaml
Expand Down Expand Up @@ -147,5 +147,5 @@ Some information is not available when reading the schema information and requir

Use ``kartothek_cube --help`` to get a list of all commands, or see :mod:`~kartothek.cli`.

.. _storefact: https://github.com/blue-yonder/storefact
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv

8 changes: 4 additions & 4 deletions docs/guide/cube/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ First, we want to create a cube for geodata:
... partition_columns=["country"],
... )

Apart from an abstract cube definition, we need a `simplekv`_-based storage backend:
Apart from an abstract cube definition, we need a `minimalkv`_-based storage backend:

>>> from functools import partial
>>> import tempfile
>>> import storefact
>>> import minimalkv
>>> store_location = tempfile.mkdtemp()
>>> store_factory = partial(
... storefact.get_store_from_url,
... minimalkv.get_store_from_url,
... "hfs://" + store_location,
... )
>>> store = store_factory()
Expand Down Expand Up @@ -424,4 +424,4 @@ geodata++time/table/_common_metadata
.. _Dask: https://docs.dask.org/
.. _Dask.Bag: https://docs.dask.org/en/latest/bag.html
.. _Dask.DataFrame: https://docs.dask.org/en/latest/dataframe.html
.. _simplekv: https://simplekv.readthedocs.io/
.. _minimalkv: https://minimalkv.readthedocs.io/
6 changes: 3 additions & 3 deletions docs/guide/cube/glossary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ Glossary
Dataset that provides the groundtruth about which :term:`Cell` are in a :term:`Cube`.

Store Factory
A callable that does not take any arguments and creates a new `simplekv`_ store when being called. Its type is
``Callable[[], simplekv.KeyValueStore]``.
A callable that does not take any arguments and creates a new `minimalkv`_ store when being called. Its type is
``Callable[[], minimalkv.KeyValueStore]``.

Query
A request for data from the cube, including things like "payload columns", "conditions", and more.
Expand All @@ -76,4 +76,4 @@ Glossary

.. _Data Cubes: https://en.wikipedia.org/wiki/Data_cube
.. _Parquet: https://parquet.apache.org/
.. _simplekv: https://simplekv.readthedocs.io/
.. _minimalkv: https://minimalkv.readthedocs.io/
20 changes: 8 additions & 12 deletions docs/guide/getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,24 +47,24 @@ We want to store this DataFrame now as a dataset. Therefore, we first need
to connect to a storage location.

We define a store factory as a callable which contains the storage information.
We will use `storefact`_ in this example to construct such a store factory
We will use `minimalkv`_ in this example to construct such a store factory
for the local filesystem (``hfs://`` indicates we are using the local filesystem and
what follows is the filepath).

.. ipython:: python
from functools import partial
from tempfile import TemporaryDirectory
from storefact import get_store_from_url
from minimalkv import get_store_from_url
dataset_dir = TemporaryDirectory()
store_url = f"hfs://{dataset_dir.name}"
.. admonition:: Storage locations

`storefact`_ offers support for several stores in Kartothek, these can be created using the
function `storefact.get_store_from_url` with one of the following prefixes:
`minimalkv`_ offers support for several stores in Kartothek; these can be created using the
function `minimalkv.get_store_from_url` with one of the following prefixes:

- ``hfs``: Local filesystem
- ``hazure``: AzureBlockBlobStorage
Expand All @@ -74,15 +74,11 @@ Interface
---------

Kartothek can write to any location that
fulfills the `simplekv.KeyValueStore interface
<https://simplekv.readthedocs.io/en/latest/#simplekv.KeyValueStore>`_ as long as they
support `ExtendedKeyspaceMixin
<https://github.com/mbr/simplekv/search?q=%22class+ExtendedKeyspaceMixin%22&unscoped_q=%22class+ExtendedKeyspaceMixin%22>`_
fulfills the `minimalkv.KeyValueStore interface
<https://minimalkv.readthedocs.io/en/latest/#minimalkv.KeyValueStore>`_ as long as they
support ``ExtendedKeyspaceMixin``
(this is necessary so that ``/`` can be used in the storage key name).

For more information, take a look out at the `storefact documentation
<https://storefact.readthedocs.io/en/latest/reference/storefact.html>`_.


Writing data to storage
=======================
Expand Down Expand Up @@ -232,5 +228,5 @@ function but returns a collection of ``dask.delayed`` objects.
read_table("a_unique_dataset_identifier", store_url, predicates=[[("A", "<", 2.5)]])
.. _storefact: https://github.com/blue-yonder/storefact
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv
.. _dask: https://docs.dask.org/en/latest/
6 changes: 3 additions & 3 deletions docs/guide/mutating_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ some data there with Kartothek.
import pandas as pd
from functools import partial
from tempfile import TemporaryDirectory
from storefact import get_store_from_url
from minimalkv import get_store_from_url
from kartothek.api.dataset import store_dataframes_as_dataset
Expand Down Expand Up @@ -236,7 +236,7 @@ When garbage collection is called, the files are removed.
.. ipython:: python
from kartothek.api.dataset import garbage_collect_dataset
from storefact import get_store_from_url
from minimalkv import get_store_from_url
store = get_store_from_url(store_url)
Expand All @@ -246,7 +246,7 @@ When garbage collection is called, the files are removed.
files_before.difference(store.keys()) # Show files removed
.. _storefact: https://github.com/blue-yonder/storefact
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv


Mutating indexed datasets
Expand Down
2 changes: 1 addition & 1 deletion docs/guide/partitioning.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ first and store the data there with Kartothek:
import pandas as pd
from functools import partial
from tempfile import TemporaryDirectory
from storefact import get_store_from_url
from minimalkv import get_store_from_url
from kartothek.api.dataset import store_dataframes_as_dataset
Expand Down
9 changes: 4 additions & 5 deletions docs/spec/store_interface.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
KeyValueStore Interface
=======================

All storage interaction use ``simplekv.KeyValueStore`` as an storage layer
All storage interactions use ``minimalkv.KeyValueStore`` as a storage layer
abstraction. This allows convenient access to many different common Key-Value
stores (ABS, S3, GCS, local filesystem, etc.) and allows an easy switch between
the storage backends to facilitate a simpler test setup.
Expand All @@ -13,7 +13,7 @@ Generally, all of our public functions accepting a ``store`` argument accept a
multitude of different input types and we generally accept all kinds of stores
inheriting from ``KeyValueStore``, assuming they implement the pickle protocol.
However, there are storages which simply cannot be distributed across processes
or network nodes sensibly. A prime Example is the ``simplekv.memory.DictStore``
or network nodes sensibly. A prime example is the ``minimalkv.memory.DictStore``
which uses a simple python dictionary as a backend store. It is technically
possible to (de-)serialize the store but once it is deserialized in another
process, or another node, the store loses its meaning since the stores are
Expand All @@ -25,9 +25,8 @@ protocol, or some more complex logic is required to initialize it, kartothek
also accepts _factories_ which must be a callable returning a ``KeyValueStore``
(see also ``kartothek.core.typing.StoreFactory``).

For convenience we also offer a `storefact`_ integration and accept store urls
For convenience we also offer an integration that accepts store URLs,
which provides another easy level of access and is well suited for ad-hoc
investigations.

.. _simplekv: https://simplekv.readthedocs.io/
.. _storefact: https://storefact.readthedocs.io/
.. _minimalkv: https://minimalkv.readthedocs.io/
4 changes: 2 additions & 2 deletions kartothek/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
.. important::
This module does not contain any public APIs.
Kartothek comes with a CLI tool named ``kartothek_cube``. To use it, create an YAML file that contains a dictionary of `storefact`_
Kartothek comes with a CLI tool named ``kartothek_cube``. To use it, create a YAML file that contains a dictionary of `minimalkv`_
stores (keys are names of the store and the values are dicts that contain the store config). ``Kartothek`` uses a `YAML`_
file called ``skv.yml`` and a store called ``dataset`` by default, but you may pass ``--skv`` and ``--store`` to change
these. An example file could look like:
Expand All @@ -30,7 +30,7 @@
.. _Dask: https://docs.dask.org/
.. _storefact: https://github.com/blue-yonder/storefact
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv
.. _YAML: https://yaml.org/
"""
import logging
Expand Down
12 changes: 6 additions & 6 deletions kartothek/cli/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import partial

import click
import storefact
import minimalkv
import yaml

from kartothek.api.discover import discover_cube
Expand All @@ -18,7 +18,7 @@ def get_cube(store, uuid_prefix):
----------
uuid_prefix: str
Dataset UUID prefix.
store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
store: Union[Callable[[], minimalkv.KeyValueStore], minimalkv.KeyValueStore]
KV store.
Returns
Expand All @@ -41,18 +41,18 @@ def get_cube(store, uuid_prefix):

def get_store(skv, store):
"""
Get simplekv store from storefact config file.
Get minimalkv store from minimalkv config file.
Parameters
----------
skv: str
Name of the storefact yaml. Normally ``'skv.yml'``.
Name of the minimalkv yaml. Normally ``'skv.yml'``.
store: str
ID of the store.
Returns
-------
store_factory: Callable[[], simplekv.KeyValueStore]
store_factory: Callable[[], minimalkv.KeyValueStore]
Store object.
Raises
Expand All @@ -73,7 +73,7 @@ def get_store(skv, store):
"Could not find store {store} in {skv}".format(store=store, skv=skv)
)

return partial(storefact.get_store, **store_cfg[store])
return partial(minimalkv.get_store, **store_cfg[store])


def _match_pattern(what, items, pattern):
Expand Down
2 changes: 1 addition & 1 deletion kartothek/core/common_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pyarrow as pa
import pyarrow.parquet as pq
import simplejson
from simplekv import KeyValueStore
from minimalkv import KeyValueStore

from kartothek.core import naming
from kartothek.core._compat import load_json
Expand Down
12 changes: 6 additions & 6 deletions kartothek/core/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@

_PARAMETER_MAPPING = {
"store": """
store: Callable or str or simplekv.KeyValueStore
store: Callable or str or minimalkv.KeyValueStore
The store where we can find or store the dataset.
Can be either ``simplekv.KeyValueStore``, a storefact store url or a
generic Callable producing a ``simplekv.KeyValueStore``""",
Can be either ``minimalkv.KeyValueStore``, a minimalkv store url or a
generic Callable producing a ``minimalkv.KeyValueStore``""",
"overwrite": """
overwrite: Optional[bool]
If True, allow overwrite of an existing dataset.""",
Expand Down Expand Up @@ -70,12 +70,12 @@
`merge_datasets__pipeline` key that contains the source dataset uuids for
the merge.""",
"output_store": """
output_store : Union[Callable, str, simplekv.KeyValueStore]
output_store : Union[Callable, str, minimalkv.KeyValueStore]
If given, the resulting dataset is written to this store. By default
the input store.
Can be either `simplekv.KeyValueStore`, a storefact store url or a
generic Callable producing a ``simplekv.KeyValueStore``""",
Can be either `minimalkv.KeyValueStore`, a minimalkv store url or a
generic Callable producing a ``minimalkv.KeyValueStore``""",
"metadata": """
metadata : Optional[Dict]
A dictionary used to update the dataset metadata.""",
Expand Down
4 changes: 2 additions & 2 deletions kartothek/core/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from kartothek.core.utils import lazy_store

if TYPE_CHECKING:
from simplekv import KeyValueStore
from minimalkv import KeyValueStore

__all__ = ("DatasetFactory",)

Expand Down Expand Up @@ -38,7 +38,7 @@ def __init__(
.. code::
from functools import partial
from storefact import get_store_from_url
from minimalkv import get_store_from_url
from kartothek.io.eager import read_table
ds_factory = DatasetFactory(
Expand Down
Loading

0 comments on commit 3d6f160

Please sign in to comment.