From 7453847214833b205de8173b0b2e673e16dbc31e Mon Sep 17 00:00:00 2001
From: zhangyansheng
Date: Thu, 24 Dec 2020 17:40:20 +0800
Subject: [PATCH 1/2] add logging of increased key count

---
 core/ps/table/sparse_table.cc | 23 +++++++++++++++--------
 core/ps/table/sparse_table.h  |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/core/ps/table/sparse_table.cc b/core/ps/table/sparse_table.cc
index db9a77b..4d4ef09 100644
--- a/core/ps/table/sparse_table.cc
+++ b/core/ps/table/sparse_table.cc
@@ -79,7 +79,7 @@ void SparseTable::Push(const SparsePushRequest* req, butil::IOBuf& grad_buf, Spa
     }
 }
 
-void SparseTable::Save(const std::string& filepath) const {
+void SparseTable::Save(const std::string& filepath) {
     butil::Timer timer(butil::Timer::STARTED);
 
     std::string file = filepath + "/sparse_table/";
@@ -96,14 +96,19 @@ void SparseTable::Save(const std::string& filepath) const {
 
     timer.stop();
 
+    int new_key_count = op_kernel_->KeyCount();
+
     LOG(INFO) << "SparseTable save. rank:" << self_shard_id_
-              << " table_name:" << name_
-              << " table_id:" << GetHandle()
+              << " name:" << name_
+              << " handle:" << GetHandle()
               << " latency:" << timer.s_elapsed() << "s"
-              << " keys_count:" << op_kernel_->KeyCount();
+              << " key_count:" << new_key_count
+              << " increased key_count:" << new_key_count - saved_key_count_;
+
+    saved_key_count_ = new_key_count;
 }
 
-void SparseTable::Load(const std::string& filepath) const {
+void SparseTable::Load(const std::string& filepath) {
     butil::Timer timer(butil::Timer::STARTED);
 
     std::string file = filepath + "/sparse_table/";
@@ -120,11 +125,13 @@ void SparseTable::Load(const std::string& filepath) const {
 
     timer.stop();
 
+    saved_key_count_ = op_kernel_->KeyCount();
+
     LOG(INFO) << "SparseTable load. rank:" << self_shard_id_
-              << " table_name:" << name_
-              << " table_id:" << GetHandle()
+              << " name:" << name_
+              << " handle:" << GetHandle()
               << " latency:" << timer.s_elapsed() << "s"
-              << " keys_count:" << op_kernel_->KeyCount();
+              << " key_count:" << saved_key_count_;
 }
 
 void SparseTable::ShowDecay() const {
diff --git a/core/ps/table/sparse_table.h b/core/ps/table/sparse_table.h
index 6cb4760..81024e2 100644
--- a/core/ps/table/sparse_table.h
+++ b/core/ps/table/sparse_table.h
@@ -46,9 +46,9 @@ class SparseTable {
         return handle_;
     }
 
-    void Save(const std::string& filepath) const;
+    void Save(const std::string& filepath);
 
-    void Load(const std::string& filepath) const;
+    void Load(const std::string& filepath);
 
     void ShowDecay() const;
 
@@ -59,6 +59,7 @@ class SparseTable {
     const OptimizerBase* opt_ = nullptr;
    std::shared_ptr op_kernel_;
     int dim_;
+    int saved_key_count_ = 0;
     std::string name_;
 };
 

From 3548339b7b768b8c087eee1eed94da2002ef3bb4 Mon Sep 17 00:00:00 2001
From: zhangyansheng
Date: Fri, 25 Dec 2020 10:34:34 +0800
Subject: [PATCH 2/2] add feature drop show threshold and update show decay with moving avg

---
 core/main/py_wrapper.cc              | 138 +++++++--------------------
 core/ps/optimizer/ada_grad_kernel.cc |  10 +-
 core/ps/optimizer/ada_grad_kernel.h  |   6 +-
 core/ps/optimizer/adam_kernel.cc     |   6 +-
 core/ps/optimizer/adam_kernel.h      |   6 +-
 core/ps/optimizer/data_struct.h      |  10 ++
 core/ps/optimizer/ftrl_kernel.cc     |  12 +--
 core/ps/optimizer/ftrl_kernel.h      |   6 +-
 core/ps/optimizer/optimizer.h        |  48 +---------
 core/ps/optimizer/optimizer_kernel.h |  14 +--
 10 files changed, 72 insertions(+), 184 deletions(-)

diff --git a/core/main/py_wrapper.cc b/core/main/py_wrapper.cc
index 53eb27d..5fedb38 100644
--- a/core/main/py_wrapper.cc
+++ b/core/main/py_wrapper.cc
@@ -33,6 +33,15 @@ using
std::string; using namespace tensornet; +#define PYDICT_PARSE_KWARGS(kwargs, name, default_value) \ + opt->name = default_value; \ + { \ + PyObject* item = PyDict_GetItemString(kwargs.ptr(), #name); \ + if (NULL != item) { \ + opt->name = PyFloat_AsDouble(item); \ + } \ + } + PYBIND11_MODULE(_pywrap_tn, m) { m.def("init", []() { PsCluster* cluster = PsCluster::Instance(); @@ -54,47 +63,17 @@ PYBIND11_MODULE(_pywrap_tn, m) { return true; }) .def("AdaGrad", [](py::kwargs kwargs) { - float learning_rate = 0.01; - float initial_g2sum = 0; - float initial_scale = 1; - float epsilon = 1e-8; - float grad_decay_rate = 1.0; - float mom_decay_rate = 1.0; - float show_decay_rate = 0.98; - - PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate"); - if (NULL != item) { - learning_rate = PyFloat_AsDouble(item); - } + auto opt = new AdaGrad(); - item = PyDict_GetItemString(kwargs.ptr(), "initial_g2sum"); - if (NULL != item) { - initial_g2sum = PyFloat_AsDouble(item); - } + PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.01); + PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98); + PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate); - item = PyDict_GetItemString(kwargs.ptr(), "initial_scale"); - if (NULL != item) { - initial_scale = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "epsilon"); - if (NULL != item) { - epsilon = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "grad_decay_rate"); - if (NULL != item) { - grad_decay_rate = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "mom_decay_rate"); - if (NULL != item) { - mom_decay_rate = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "show_decay_rate"); - if (NULL != item) { - show_decay_rate = PyFloat_AsDouble(item); - } - - auto opt = new AdaGrad(learning_rate, initial_g2sum, initial_scale, epsilon, - grad_decay_rate, mom_decay_rate, show_decay_rate); + PYDICT_PARSE_KWARGS(kwargs, initial_g2sum, 0); + PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1); + PYDICT_PARSE_KWARGS(kwargs, epsilon, 1e-8); + PYDICT_PARSE_KWARGS(kwargs, grad_decay_rate, 1.0); + PYDICT_PARSE_KWARGS(kwargs, mom_decay_rate, 1.0); // NOTICE! 
opt will not delete until system exist PyObject* obj = PyCapsule_New(opt, nullptr, nullptr); @@ -102,37 +81,16 @@ PYBIND11_MODULE(_pywrap_tn, m) { return py::reinterpret_steal(obj); }) .def("Adam", [](py::kwargs kwargs) { - float learning_rate = 0.001; - float beta1 = 0.9; - float beta2 = 0.999; - float epsilon = 1e-8; - float initial_scale = 1.0; - - PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate"); - if (NULL != item) { - learning_rate = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "beta1"); - if (NULL != item) { - beta1 = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "beta2"); - if (NULL != item) { - beta2 = PyFloat_AsDouble(item); - } + auto opt = new Adam(); - item = PyDict_GetItemString(kwargs.ptr(), "epsilon"); - if (NULL != item) { - epsilon = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "initial_scale"); - if (NULL != item) { - initial_scale = PyFloat_AsDouble(item); - } + PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.001); + PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98); + PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate); - auto opt = new Adam(learning_rate, beta1, beta2, epsilon, initial_scale); + PYDICT_PARSE_KWARGS(kwargs, beta1, 0.9); + PYDICT_PARSE_KWARGS(kwargs, beta2, 0.999); + PYDICT_PARSE_KWARGS(kwargs, epsilon, 1e-8); + PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1.0); // NOTICE! opt will not delete until system exist PyObject* obj = PyCapsule_New(opt, nullptr, nullptr); @@ -140,44 +98,16 @@ PYBIND11_MODULE(_pywrap_tn, m) { return py::reinterpret_steal(obj); }) .def("Ftrl", [](py::kwargs kwargs) { - float learning_rate = 0.05; - float initial_range = 0; - float beta = 1; - float lambda1 = 0.1; - float lambda2 = 1; - float show_decay_rate = 0.98; - - PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate"); - if (NULL != item) { - learning_rate = PyFloat_AsDouble(item); - } + auto opt = new Ftrl(); - item = PyDict_GetItemString(kwargs.ptr(), "initial_range"); - if (NULL != item) { - initial_range = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "beta"); - if (NULL != item) { - beta = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "lambda1"); - if (NULL != item) { - lambda1 = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "lambda2"); - if (NULL != item) { - lambda2 = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "show_decay_rate"); - if (NULL != item) { - show_decay_rate = PyFloat_AsDouble(item); - } + PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.05); + PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98); + PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate); - auto opt = new Ftrl(learning_rate, initial_range, beta, lambda1, lambda2, show_decay_rate); + PYDICT_PARSE_KWARGS(kwargs, beta, 1); + PYDICT_PARSE_KWARGS(kwargs, lambda1, 0.1); + PYDICT_PARSE_KWARGS(kwargs, lambda2, 1); + PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1.0); // NOTICE! 
opt will not delete until system exist PyObject* obj = PyCapsule_New(opt, nullptr, nullptr); diff --git a/core/ps/optimizer/ada_grad_kernel.cc b/core/ps/optimizer/ada_grad_kernel.cc index f4b50fc..24c7faa 100644 --- a/core/ps/optimizer/ada_grad_kernel.cc +++ b/core/ps/optimizer/ada_grad_kernel.cc @@ -89,7 +89,7 @@ SparseAdaGradValue::SparseAdaGradValue(int dim, const AdaGrad* opt) { } void SparseAdaGradValue::Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim) { - show_ += grad_info.batch_show; + delta_show += grad_info.batch_show; float* w = Weight(); @@ -112,7 +112,7 @@ void SparseAdaGradValue::Serialize(std::ostream& os, int dim) { } os << g2sum_ << "\t"; - os << show_; + os << show; } void SparseAdaGradValue::DeSerialize(std::istream& is, int dim) { @@ -121,11 +121,7 @@ void SparseAdaGradValue::DeSerialize(std::istream& is, int dim) { } is >> g2sum_; - is >> show_; -} - -void SparseAdaGradValue::ShowDecay(const AdaGrad* opt) { - show_ *= opt->show_decay_rate; + is >> show; } } // namespace tensornet diff --git a/core/ps/optimizer/ada_grad_kernel.h b/core/ps/optimizer/ada_grad_kernel.h index 3814a36..864d3c5 100644 --- a/core/ps/optimizer/ada_grad_kernel.h +++ b/core/ps/optimizer/ada_grad_kernel.h @@ -53,7 +53,8 @@ class DenseAdaGradValue { std::ostream& operator<<(std::ostream& os, const DenseAdaGradValue& value); std::istream& operator>>(std::istream& is, DenseAdaGradValue& value); -struct alignas(4) SparseAdaGradValue { +struct alignas(4) SparseAdaGradValue + : public SparseOptValue { public: SparseAdaGradValue(int dim, const AdaGrad* opt); @@ -73,15 +74,12 @@ struct alignas(4) SparseAdaGradValue { void Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim); - void ShowDecay(const AdaGrad* opt); - void Serialize(std::ostream& os, int dim); void DeSerialize(std::istream& is, int dim); private: float g2sum_; - float show_ = 0.0; float data_[0]; }; diff --git a/core/ps/optimizer/adam_kernel.cc b/core/ps/optimizer/adam_kernel.cc index 48f058e..9020e38 100644 --- a/core/ps/optimizer/adam_kernel.cc +++ b/core/ps/optimizer/adam_kernel.cc @@ -104,7 +104,7 @@ SparseAdamValue::SparseAdamValue(int dim, const Adam* opt) { } void SparseAdamValue::Apply(const Adam* opt, SparseGradInfo& grad_info, int dim) { - show_ += grad_info.batch_show; + delta_show += grad_info.batch_show; float* w = Weight(); float* m = M(dim); @@ -129,7 +129,7 @@ void SparseAdamValue::Serialize(std::ostream& os, int dim) { os << v[i] << "\t"; } - os << show_; + os << show; } void SparseAdamValue::DeSerialize(std::istream& is, int dim) { @@ -143,7 +143,7 @@ void SparseAdamValue::DeSerialize(std::istream& is, int dim) { is >> v[i]; } - is >> show_; + is >> show; } } // namespace tensornet { diff --git a/core/ps/optimizer/adam_kernel.h b/core/ps/optimizer/adam_kernel.h index 1a5d2f0..7f5311f 100644 --- a/core/ps/optimizer/adam_kernel.h +++ b/core/ps/optimizer/adam_kernel.h @@ -56,7 +56,8 @@ class DenseAdamValue { std::ostream& operator<<(std::ostream& os, const DenseAdamValue& value); std::istream& operator>>(std::istream& is, DenseAdamValue& value); -struct alignas(4) SparseAdamValue { +struct alignas(4) SparseAdamValue + : public SparseOptValue { public: SparseAdamValue(int dim, const Adam* opt); ~SparseAdamValue() = default; @@ -75,8 +76,6 @@ struct alignas(4) SparseAdamValue { void Apply(const Adam* opt, SparseGradInfo& grad_info, int dim); - void ShowDecay(const Adam* opt) {} - void Serialize(std::ostream& os, int dim); void DeSerialize(std::istream& is, int dim); @@ -100,7 +99,6 @@ struct alignas(4) 
SparseAdamValue { } private: - float show_ = 0.0; float data_[0]; }; diff --git a/core/ps/optimizer/data_struct.h b/core/ps/optimizer/data_struct.h index 266ada3..e5ab2b5 100644 --- a/core/ps/optimizer/data_struct.h +++ b/core/ps/optimizer/data_struct.h @@ -22,6 +22,16 @@ struct SparseGradInfo { int batch_show; }; +struct alignas(4) SparseOptValue { + float show = 0.0; + int delta_show = 0; + + void ShowDecay(float decay_rate) { + show = (1 - decay_rate) * delta_show + decay_rate * show; + delta_show = 0; + } +}; + } // namespace tensornet { #endif // !TENSORNET_OPTIMIZER_DATA_STRUCT_H_ diff --git a/core/ps/optimizer/ftrl_kernel.cc b/core/ps/optimizer/ftrl_kernel.cc index f3ecd14..621444a 100644 --- a/core/ps/optimizer/ftrl_kernel.cc +++ b/core/ps/optimizer/ftrl_kernel.cc @@ -50,13 +50,15 @@ SparseFtrlValue::SparseFtrlValue(int dim, const Ftrl* opt) { float* n = N(dim); for (int i = 0; i < dim; ++i) { - w[i] = distribution(reng) * opt->initial_range; + w[i] = distribution(reng) * opt->initial_scale; z[i] = 0; n[i] = 0; } } void SparseFtrlValue::Apply(const Ftrl* opt, SparseGradInfo& grad_info, int dim) { + delta_show += grad_info.batch_show; + float* w = Weight(); float* z = Z(dim); float* n = N(dim); @@ -90,7 +92,7 @@ void SparseFtrlValue::Serialize(std::ostream& os, int dim) { os << n[i] << "\t"; } - os << show_; + os << show; } void SparseFtrlValue::DeSerialize(std::istream& is, int dim) { @@ -104,11 +106,7 @@ void SparseFtrlValue::DeSerialize(std::istream& is, int dim) { is >> n[i]; } - is >> show_; -} - -void SparseFtrlValue::ShowDecay(const Ftrl* opt) { - show_ *= opt->show_decay_rate; + is >> show; } } // namespace tensornet diff --git a/core/ps/optimizer/ftrl_kernel.h b/core/ps/optimizer/ftrl_kernel.h index 47e52c4..0b5c782 100644 --- a/core/ps/optimizer/ftrl_kernel.h +++ b/core/ps/optimizer/ftrl_kernel.h @@ -52,7 +52,8 @@ class DenseFtrlValue { std::ostream& operator<<(std::ostream& os, const DenseFtrlValue& value); std::istream& operator>>(std::istream& is, DenseFtrlValue& value); -struct alignas(4) SparseFtrlValue { +struct alignas(4) SparseFtrlValue + : public SparseOptValue { public: SparseFtrlValue(int dim, const Ftrl* opt); @@ -72,8 +73,6 @@ struct alignas(4) SparseFtrlValue { void Apply(const Ftrl* opt, SparseGradInfo& grad_info, int dim); - void ShowDecay(const Ftrl* opt); - void Serialize(std::ostream& os, int dim); void DeSerialize(std::istream& is, int dim); @@ -96,7 +95,6 @@ struct alignas(4) SparseFtrlValue { } private: - float show_ = 0.0; float data_[0]; }; diff --git a/core/ps/optimizer/optimizer.h b/core/ps/optimizer/optimizer.h index ebfdca7..3a67798 100644 --- a/core/ps/optimizer/optimizer.h +++ b/core/ps/optimizer/optimizer.h @@ -29,12 +29,6 @@ typedef std::shared_ptr SparseOptKernelSharedPtr; class OptimizerBase { public: - OptimizerBase(float lr) - : learning_rate(lr) { - } - - virtual ~OptimizerBase() { } - virtual DenseOptKernelSharedPtr CreateDenseOptKernel( int offset_begin, int offset_end) const = 0; @@ -44,20 +38,12 @@ class OptimizerBase { public: float learning_rate = 0.01; + float show_decay_rate = 0.98; + float feature_drop_show = 0.02; }; class Adam : public OptimizerBase { public: - Adam(float lr, float b1, float b2, float eps, float initial_scale) - : OptimizerBase(lr) - , beta1(b1) - , beta2(b2) - , epsilon(eps) - , initial_scale(initial_scale) { - } - - ~Adam() { } - virtual DenseOptKernelSharedPtr CreateDenseOptKernel( int offset_begin, int offset_end) const; @@ -76,20 +62,6 @@ class Adam : public OptimizerBase { class AdaGrad : public 
OptimizerBase {
 public:
-    AdaGrad(float lr, float initial_g2sum, float initial_scale,
-            float epsilon, float grad_decay_rate, float mom_decay_rate,
-            float show_decay_rate)
-        : OptimizerBase(lr)
-        , initial_g2sum(initial_g2sum)
-        , initial_scale(initial_scale)
-        , epsilon(epsilon)
-        , grad_decay_rate(grad_decay_rate)
-        , mom_decay_rate(mom_decay_rate)
-        , show_decay_rate(show_decay_rate) {
-    }
-
-    ~AdaGrad() { }
-
     virtual DenseOptKernelSharedPtr CreateDenseOptKernel(
         int offset_begin, int offset_end) const;
 
@@ -105,23 +77,10 @@ class AdaGrad : public OptimizerBase {
     float epsilon = 1e-08;
     float grad_decay_rate = 1.0;
     float mom_decay_rate = 0.9;
-    float show_decay_rate = 0.98;
 };
 
 class Ftrl : public OptimizerBase {
 public:
-    Ftrl(float lr, float initial_range, float beta,
-         float lambda1, float lambda2, float show_decay_rate)
-        : OptimizerBase(lr)
-        , initial_range(initial_range)
-        , beta(beta)
-        , lambda1(lambda1)
-        , lambda2(lambda2)
-        ,show_decay_rate(show_decay_rate) {
-    }
-
-    ~Ftrl() {}
-
     virtual DenseOptKernelSharedPtr CreateDenseOptKernel(
         int offset_begin, int offset_end) const;
 
@@ -132,11 +91,10 @@ class Ftrl : public OptimizerBase {
     }
 
 public:
-    float initial_range = 0;
+    float initial_scale = 0;
     float beta = 1;
     float lambda1 = 0.1;
     float lambda2 = 1;
-    float show_decay_rate = 0.98;
 };
 
 } // namespace tensornet {
diff --git a/core/ps/optimizer/optimizer_kernel.h b/core/ps/optimizer/optimizer_kernel.h
index 7a28ab6..e87b3f9 100644
--- a/core/ps/optimizer/optimizer_kernel.h
+++ b/core/ps/optimizer/optimizer_kernel.h
@@ -343,9 +343,11 @@ class SparseKernelBlock {
         os << "dim:" << block.dim_ << std::endl;
 
         for (const auto& value : block.values_) {
-            os << value.first << "\t";
-            value.second->Serialize(os, block.dim_);
-            os << std::endl;
+            if (value.second->show > block.opt_->feature_drop_show) {
+                os << value.first << "\t";
+                value.second->Serialize(os, block.dim_);
+                os << std::endl;
+            }
         }
 
         return os;
@@ -376,7 +378,7 @@ class SparseKernelBlock {
     void ShowDecay() {
         for (auto& iter : values_) {
             ValueType* value = iter.second;
-            value->ShowDecay(opt_);
+            value->ShowDecay(opt_->show_decay_rate);
         }
     }
 
@@ -419,7 +421,7 @@ class SparseOptimizerKernel : public SparseOptimizerKernelBase {
         for (size_t i = 0; i < SPARSE_KERNEL_BLOCK_NUM; ++i) {
             threads.push_back(std::thread([this, i, &filepath]() {
                 std::string file = filepath;
-                file.append("/sparse_block_").append(std::to_string(i)).append(".gz");
+                file.append("/block_").append(std::to_string(i)).append(".gz");
 
                 FileWriterSink writer_sink(file, FCT_ZLIB);
 
@@ -441,7 +443,7 @@ class SparseOptimizerKernel : public SparseOptimizerKernelBase {
         for (size_t i = 0; i < SPARSE_KERNEL_BLOCK_NUM; ++i) {
             threads.push_back(std::thread([this, i, &filepath]() {
                 std::string file = filepath;
-                file.append("/sparse_block_").append(std::to_string(i)).append(".gz");
+                file.append("/block_").append(std::to_string(i)).append(".gz");
 
                 FileReaderSource reader_source(file, FCT_ZLIB);
                 boost::iostreams::stream in_stream(reader_source);
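
Reviewer note, not part of the patches themselves: the standalone C++ sketch below illustrates how the new moving-average ShowDecay() from core/ps/optimizer/data_struct.h is expected to interact with the feature_drop_show check that SparseKernelBlock's operator<< now applies before serializing a key. The SparseOptValue struct is copied from the patch and the default values come from OptimizerBase (show_decay_rate = 0.98, feature_drop_show = 1 - 0.98 = 0.02); the main() driver and the sample numbers are hypothetical and only show the expected behaviour.

// Standalone sketch -- not part of the patch. SparseOptValue is copied from
// core/ps/optimizer/data_struct.h; the main() driver and sample numbers are
// hypothetical.
#include <iostream>

struct SparseOptValue {
    float show = 0.0;
    int delta_show = 0;

    // Moving average: fold the shows accumulated since the last decay pass
    // into the running value, then reset the per-pass counter.
    void ShowDecay(float decay_rate) {
        show = (1 - decay_rate) * delta_show + decay_rate * show;
        delta_show = 0;
    }
};

int main() {
    SparseOptValue value;
    const float show_decay_rate = 0.98;   // OptimizerBase default
    const float feature_drop_show = 0.02; // OptimizerBase default (1 - 0.98)

    // A feature hit 5 times in the current pass stays above the threshold,
    // so SparseKernelBlock::operator<< would still serialize it on Save.
    value.delta_show = 5;
    value.ShowDecay(show_decay_rate);
    std::cout << value.show << std::endl;                       // 0.1

    // With no further shows the value decays geometrically; after roughly
    // 80 decay passes it falls below feature_drop_show and the key would be
    // skipped at the next save.
    for (int i = 0; i < 100; ++i) {
        value.ShowDecay(show_decay_rate);
    }
    std::cout << (value.show > feature_drop_show) << std::endl; // 0
    return 0;
}

Under these defaults a key has to keep receiving shows to survive successive save cycles, which is the pruning behaviour the commit message of patch 2 describes.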