Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Content descriptors integration #963

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,28 @@ include ports.mk
PWD := $(shell pwd)
BUILD ?= build
ARCH ?= $(HOSTARCH)
OBJ := $(BUILD)/$(ARCH)/obj
BIN := $(BUILD)/$(ARCH)/bin
LIB := $(BUILD)/$(ARCH)/lib
TESTS := $(BUILD)/$(ARCH)/tests
TMPBIN := $(BUILD)/$(ARCH)/tmp
INC := $(BUILD)/$(ARCH)/include
OBJ := $(BUILD)/$(ARCH)-$(OSNAME)/obj
BIN := $(BUILD)/$(ARCH)-$(OSNAME)/bin
LIB := $(BUILD)/$(ARCH)-$(OSNAME)/lib
TESTS := $(BUILD)/$(ARCH)-$(OSNAME)/tests
TMPBIN := $(BUILD)/$(ARCH)-$(OSNAME)/tmp
INC := $(BUILD)/$(ARCH)-$(OSNAME)/include
SRC := .
TMP ?= $(PWD)/$(BUILD)/$(ARCH)/tmp
TMP ?= $(PWD)/$(BUILD)/$(ARCH)-$(OSNAME)/tmp

# These are for cross-compilation, where binaries used in the build need
# to be built for the host.
HOSTARCH ?= $(ARCH)
HOSTBIN ?= $(BUILD)/$(HOSTARCH)/bin
HOSTLIB ?= $(BUILD)/$(HOSTARCH)/lib
HOSTINC ?= $(BUILD)/$(HOSTARCH)/include
HOSTOSNAME ?= $(OSNAME)
HOSTBIN ?= $(BUILD)/$(HOSTARCH)-$(HOSTOSNAME)/bin
HOSTLIB ?= $(BUILD)/$(HOSTARCH)-$(HOSTOSNAME)/lib
HOSTINC ?= $(BUILD)/$(HOSTARCH)-$(HOSTOSNAME)/include

TEST_TMP := $(TESTS)
# Vars for configuration files or files that live outside bin and lib
ALTROOT := $(BUILD)/$(ARCH)/altroot
ETC := $(ALTROOT)/etc
PLUGINS := $(BUILD)/$(ARCH)/mldb_plugins
PLUGINS := $(BUILD)/$(ARCH)-$(OSNAME)/mldb_plugins

JML_BUILD := mldb/jml-build
INCLUDE := -Imldb
Expand Down
22 changes: 16 additions & 6 deletions block/content_descriptor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -460,13 +460,22 @@ getStream(const std::map<Utf8String, Any> & options) const
//cerr << "url = " << descriptor.getUrlStringUtf8() << " compression = "
// << compression << " mapped = " << isMapped << endl;

if (isMapped) {
while (isMapped) { // actually an if, but now we can break out
// Just get one single big block
auto contentHandler = getContent(descriptor);

// State bundle for the single-big-block path: an FsObjectInfo together with
// the FrozenMemoryRegion whose length is consulted further down.  An instance
// is allocated with std::make_shared immediately after this definition.
// NOTE(review): presumably shared so the region outlives this scope -- confirm
// against the handler code that captures it.
struct Vals {
FsObjectInfo info;
FrozenMemoryRegion mem;

// Debug-only destructor, compiled out by default.  When enabled it prints a
// banner to cerr at the moment the Vals instance is destroyed.
#if 0
~Vals()
{
cerr << endl << endl << endl;
cerr << "NO MORE MAPPING VALS" << endl;
cerr << endl << endl << endl;
}
#endif
};

auto vals = std::make_shared<Vals>();
Expand All @@ -493,7 +502,10 @@ getStream(const std::map<Utf8String, Any> & options) const
vals->mem.length());

if (outputSize < 0) {
throw Exception("decompressed size unknown");
if (outputSize == Decompressor::LENGTH_UNKNOWN)
break; // do as an istream as we can't run the splitting algorithm

throw Exception("decompressed size unknown: %i", (int)outputSize);
}

static MemorySerializer serializer;
Expand Down Expand Up @@ -546,10 +558,8 @@ getStream(const std::map<Utf8String, Any> & options) const
filter_istream stream(handler, descriptor.getUrlStringUtf8(), options2);
return stream;
}
else {
// Not mapped. We go block by block.
}


// Not mapped. We go block by block.
filter_istream result(descriptor.getUrlStringUtf8(), options2);
return result;
}
Expand Down
15 changes: 15 additions & 0 deletions block/testing/content_descriptor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,18 @@ BOOST_AUTO_TEST_CASE( test_compressed_random_access )
{

}

// Smoke test: walk a zstd-compressed test file block by block through the
// decompressed content handler.  The callback accepts every block and makes
// no assertions, so the test passes as long as the traversal completes
// without throwing.
BOOST_AUTO_TEST_CASE( test_parallel_decompress_zstd )
{
    string compressedPath = "mldb_test_data/Books_5.json.zstd";

    ContentDescriptor source
        = jsonDecode<ContentDescriptor>("file://" + compressedPath);
    std::shared_ptr<ContentHandler> content = getDecompressedContent(source);

    // Traversal parameters, named here instead of annotated inline.
    const int startOffset = 0;
    const int requestedBlockSize = 1024 * 1024;
    const int maxParallelism = 1;

    // Accept each block unconditionally; returning true keeps iterating.
    auto acceptBlock = [&] (size_t blockNum, uint64_t blockOffset,
                            FrozenMemoryRegion block)
    {
        return true;
    };

    content->forEachBlockParallel(startOffset, requestedBlockSize,
                                  maxParallelism, acceptBlock);
}
2 changes: 1 addition & 1 deletion jml-build/os/Darwin.mk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ READLINK:=readlink
linker_rpath=
SO_EXTENSION:=.dylib

VIRTUALENV ?= virtualenv
VIRTUALENV ?= virtualenv-$(ARCH)-$(OSNAME)-$(PYTHON_VERSION)
PYTHON ?= $(VIRTUALENV)/bin/python
PIP ?= $(VIRTUALENV)/bin/pip
PYTHON_DEPENDENCIES_PRE_CMD ?= $(PIP) install -U pip==21.2.3
Expand Down
2 changes: 1 addition & 1 deletion jml-build/os/Linux.mk
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ DIST_CODENAME:=$(shell lsb_release -sc)
MACHINE_NAME:=$(shell uname -n)
READLINK:=readlink -f

VIRTUALENV ?= virtualenv
VIRTUALENV ?= virtualenv-$(ARCH)-$(OSNAME)-$(PYTHON_VERSION)
PYTHON ?= $(VIRTUALENV)/bin/python
PIP ?= $(VIRTUALENV)/bin/pip
PYTHON_DEPENDENCIES_PRE_CMD ?= $(PIP) install -U pip==21.1.3
Expand Down
22 changes: 12 additions & 10 deletions makefile-main-plugin.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# Empty default suffixes to speed up initialization
.SUFFIXES:
OSNAME:=$(shell uname -s)

toolchain ?= gcc
PYTHON_ENABLED:=1
Expand Down Expand Up @@ -35,27 +36,28 @@ default: all

BUILD ?= build
ARCH ?= $(shell uname -m)
OBJ := $(BUILD)/$(ARCH)/obj
BIN := $(BUILD)/$(ARCH)/bin
OBJ := $(BUILD)/$(ARCH)-$(OSNAME)/obj
BIN := $(BUILD)/$(ARCH)-$(OSNAME)/bin
# Keep LIB in the same $(ARCH)-$(OSNAME) tree as OBJ, BIN, TESTS, INC and
# HOSTLIB; otherwise libraries land in a different directory from everything
# else and are shared between builds for different operating systems.
LIB := $(BUILD)/$(ARCH)-$(OSNAME)/lib
TESTS := $(BUILD)/$(ARCH)/tests
TMPBIN := $(BUILD)/$(ARCH)/tmp
INC := $(BUILD)/$(ARCH)/include
TESTS := $(BUILD)/$(ARCH)-$(OSNAME)/tests
TMPBIN := $(BUILD)/$(ARCH)-$(OSNAME)/tmp
INC := $(BUILD)/$(ARCH)-$(OSNAME)/include
SRC := .
TMP ?= $(BUILD)/$(ARCH)/tmp
TMP ?= $(BUILD)/$(ARCH)-$(OSNAME)/tmp

# These are for cross-compilation, where binaries used in the build need
# to be built for the host.
HOSTARCH ?= $(ARCH)
HOSTBIN ?= $(BUILD)/$(HOSTARCH)/bin
HOSTLIB ?= $(BUILD)/$(HOSTARCH)/lib
HOSTINC ?= $(BUILD)/$(HOSTARCH)/include
HOSTOSNAME ?= $(OSNAME)
HOSTBIN ?= $(BUILD)/$(HOSTARCH)-$(HOSTOSNAME)/bin
HOSTLIB ?= $(BUILD)/$(HOSTARCH)-$(HOSTOSNAME)/lib
HOSTINC ?= $(BUILD)/$(HOSTARCH)-$(HOSTOSNAME)/include

TEST_TMP := $(TESTS)
# Vars for configuration files or files that live outside bin and lib
ALTROOT := $(BUILD)/$(ARCH)/altroot
ETC := $(ALTROOT)/etc
PLUGINS := $(BUILD)/$(ARCH)/mldb_plugins
PLUGINS := $(BUILD)/$(ARCH)-$(OSNAME)/mldb_plugins

JML_BUILD := mldb/jml-build
INCLUDE := -I. -Imldb
Expand Down
2 changes: 1 addition & 1 deletion mldb_test_data
Submodule mldb_test_data updated from 7db85b to a5bc07
7 changes: 6 additions & 1 deletion plugins/jml/randomforest.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,11 +321,16 @@ struct PartitionData {

bool ordinal = features.at(featureToSplitOn).ordinal;

// Density of example numbers within our set of rows. When this
// gets too low, we do essentially random accesses and it kills
// our cache performance. In that case we can re-index to reduce
// the size.
double useRatio = 1.0 * rows.size() / rows.back().exampleNum;

//todo: Re-index when usable data fits inside cache
bool reIndex = useRatio < 0.1;
bool reIndex = useRatio < 0.25;
//reIndex = false;
//using namespace std;
//cerr << "useRatio = " << useRatio << endl;

if (!reIndex) {
Expand Down
Loading