From 7453847214833b205de8173b0b2e673e16dbc31e Mon Sep 17 00:00:00 2001
From: zhangyansheng
Date: Thu, 24 Dec 2020 17:40:20 +0800
Subject: [PATCH 1/2] add logging of increased key count

---
 core/ps/table/sparse_table.cc | 23 +++++++++++++++--------
 core/ps/table/sparse_table.h  |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/core/ps/table/sparse_table.cc b/core/ps/table/sparse_table.cc
index db9a77b..4d4ef09 100644
--- a/core/ps/table/sparse_table.cc
+++ b/core/ps/table/sparse_table.cc
@@ -79,7 +79,7 @@ void SparseTable::Push(const SparsePushRequest* req, butil::IOBuf& grad_buf, Spa
     }
 }
 
-void SparseTable::Save(const std::string& filepath) const {
+void SparseTable::Save(const std::string& filepath) {
     butil::Timer timer(butil::Timer::STARTED);
 
     std::string file = filepath + "/sparse_table/";
@@ -96,14 +96,19 @@ void SparseTable::Save(const std::string& filepath) const {
 
     timer.stop();
 
+    int new_key_count = op_kernel_->KeyCount();
+
     LOG(INFO) << "SparseTable save. rank:" << self_shard_id_
-              << " table_name:" << name_
-              << " table_id:" << GetHandle()
+              << " name:" << name_
+              << " handle:" << GetHandle()
               << " latency:" << timer.s_elapsed() << "s"
-              << " keys_count:" << op_kernel_->KeyCount();
+              << " key_count:" << new_key_count
+              << " increased key_count:" << new_key_count - saved_key_count_;
+
+    saved_key_count_ = new_key_count;
 }
 
-void SparseTable::Load(const std::string& filepath) const {
+void SparseTable::Load(const std::string& filepath) {
     butil::Timer timer(butil::Timer::STARTED);
 
     std::string file = filepath + "/sparse_table/";
@@ -120,11 +125,13 @@ void SparseTable::Load(const std::string& filepath) const {
 
     timer.stop();
 
+    saved_key_count_ = op_kernel_->KeyCount();
+
     LOG(INFO) << "SparseTable load. rank:" << self_shard_id_
-              << " table_name:" << name_
-              << " table_id:" << GetHandle()
+              << " name:" << name_
+              << " handle:" << GetHandle()
               << " latency:" << timer.s_elapsed() << "s"
-              << " keys_count:" << op_kernel_->KeyCount();
+              << " key_count:" << saved_key_count_;
 }
 
 void SparseTable::ShowDecay() const {
diff --git a/core/ps/table/sparse_table.h b/core/ps/table/sparse_table.h
index 6cb4760..81024e2 100644
--- a/core/ps/table/sparse_table.h
+++ b/core/ps/table/sparse_table.h
@@ -46,9 +46,9 @@ class SparseTable {
         return handle_;
     }
 
-    void Save(const std::string& filepath) const;
+    void Save(const std::string& filepath);
 
-    void Load(const std::string& filepath) const;
+    void Load(const std::string& filepath);
 
     void ShowDecay() const;
 
@@ -59,6 +59,7 @@ class SparseTable {
     const OptimizerBase* opt_ = nullptr;
    std::shared_ptr op_kernel_;
     int dim_;
+    int saved_key_count_ = 0;
     std::string name_;
 };
 

From 3548339b7b768b8c087eee1eed94da2002ef3bb4 Mon Sep 17 00:00:00 2001
From: zhangyansheng
Date: Fri, 25 Dec 2020 10:34:34 +0800
Subject: [PATCH 2/2] add feature drop show threshold and update show decay with moving avg

---
 core/main/py_wrapper.cc              | 138 +++++++--------------------
 core/ps/optimizer/ada_grad_kernel.cc |  10 +-
 core/ps/optimizer/ada_grad_kernel.h  |   6 +-
 core/ps/optimizer/adam_kernel.cc     |   6 +-
 core/ps/optimizer/adam_kernel.h      |   6 +-
 core/ps/optimizer/data_struct.h      |  10 ++
 core/ps/optimizer/ftrl_kernel.cc     |  12 +--
 core/ps/optimizer/ftrl_kernel.h      |   6 +-
 core/ps/optimizer/optimizer.h        |  48 +---------
 core/ps/optimizer/optimizer_kernel.h |  14 +--
 10 files changed, 72 insertions(+), 184 deletions(-)

diff --git a/core/main/py_wrapper.cc b/core/main/py_wrapper.cc
index 53eb27d..5fedb38 100644
--- a/core/main/py_wrapper.cc
+++ b/core/main/py_wrapper.cc
@@ -33,6 +33,15 @@ using
std::string; using namespace tensornet; +#define PYDICT_PARSE_KWARGS(kwargs, name, default_value) \ + opt->name = default_value; \ + { \ + PyObject* item = PyDict_GetItemString(kwargs.ptr(), #name); \ + if (NULL != item) { \ + opt->name = PyFloat_AsDouble(item); \ + } \ + } + PYBIND11_MODULE(_pywrap_tn, m) { m.def("init", []() { PsCluster* cluster = PsCluster::Instance(); @@ -54,47 +63,17 @@ PYBIND11_MODULE(_pywrap_tn, m) { return true; }) .def("AdaGrad", [](py::kwargs kwargs) { - float learning_rate = 0.01; - float initial_g2sum = 0; - float initial_scale = 1; - float epsilon = 1e-8; - float grad_decay_rate = 1.0; - float mom_decay_rate = 1.0; - float show_decay_rate = 0.98; - - PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate"); - if (NULL != item) { - learning_rate = PyFloat_AsDouble(item); - } + auto opt = new AdaGrad(); - item = PyDict_GetItemString(kwargs.ptr(), "initial_g2sum"); - if (NULL != item) { - initial_g2sum = PyFloat_AsDouble(item); - } + PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.01); + PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98); + PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate); - item = PyDict_GetItemString(kwargs.ptr(), "initial_scale"); - if (NULL != item) { - initial_scale = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "epsilon"); - if (NULL != item) { - epsilon = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "grad_decay_rate"); - if (NULL != item) { - grad_decay_rate = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "mom_decay_rate"); - if (NULL != item) { - mom_decay_rate = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "show_decay_rate"); - if (NULL != item) { - show_decay_rate = PyFloat_AsDouble(item); - } - - auto opt = new AdaGrad(learning_rate, initial_g2sum, initial_scale, epsilon, - grad_decay_rate, mom_decay_rate, show_decay_rate); + PYDICT_PARSE_KWARGS(kwargs, initial_g2sum, 0); + PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1); + PYDICT_PARSE_KWARGS(kwargs, epsilon, 1e-8); + PYDICT_PARSE_KWARGS(kwargs, grad_decay_rate, 1.0); + PYDICT_PARSE_KWARGS(kwargs, mom_decay_rate, 1.0); // NOTICE! 
opt will not delete until system exist PyObject* obj = PyCapsule_New(opt, nullptr, nullptr); @@ -102,37 +81,16 @@ PYBIND11_MODULE(_pywrap_tn, m) { return py::reinterpret_steal(obj); }) .def("Adam", [](py::kwargs kwargs) { - float learning_rate = 0.001; - float beta1 = 0.9; - float beta2 = 0.999; - float epsilon = 1e-8; - float initial_scale = 1.0; - - PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate"); - if (NULL != item) { - learning_rate = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "beta1"); - if (NULL != item) { - beta1 = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "beta2"); - if (NULL != item) { - beta2 = PyFloat_AsDouble(item); - } + auto opt = new Adam(); - item = PyDict_GetItemString(kwargs.ptr(), "epsilon"); - if (NULL != item) { - epsilon = PyFloat_AsDouble(item); - } - item = PyDict_GetItemString(kwargs.ptr(), "initial_scale"); - if (NULL != item) { - initial_scale = PyFloat_AsDouble(item); - } + PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.001); + PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98); + PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate); - auto opt = new Adam(learning_rate, beta1, beta2, epsilon, initial_scale); + PYDICT_PARSE_KWARGS(kwargs, beta1, 0.9); + PYDICT_PARSE_KWARGS(kwargs, beta2, 0.999); + PYDICT_PARSE_KWARGS(kwargs, epsilon, 1e-8); + PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1.0); // NOTICE! opt will not delete until system exist PyObject* obj = PyCapsule_New(opt, nullptr, nullptr); @@ -140,44 +98,16 @@ PYBIND11_MODULE(_pywrap_tn, m) { return py::reinterpret_steal(obj); }) .def("Ftrl", [](py::kwargs kwargs) { - float learning_rate = 0.05; - float initial_range = 0; - float beta = 1; - float lambda1 = 0.1; - float lambda2 = 1; - float show_decay_rate = 0.98; - - PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate"); - if (NULL != item) { - learning_rate = PyFloat_AsDouble(item); - } + auto opt = new Ftrl(); - item = PyDict_GetItemString(kwargs.ptr(), "initial_range"); - if (NULL != item) { - initial_range = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "beta"); - if (NULL != item) { - beta = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "lambda1"); - if (NULL != item) { - lambda1 = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "lambda2"); - if (NULL != item) { - lambda2 = PyFloat_AsDouble(item); - } - - item = PyDict_GetItemString(kwargs.ptr(), "show_decay_rate"); - if (NULL != item) { - show_decay_rate = PyFloat_AsDouble(item); - } + PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.05); + PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98); + PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate); - auto opt = new Ftrl(learning_rate, initial_range, beta, lambda1, lambda2, show_decay_rate); + PYDICT_PARSE_KWARGS(kwargs, beta, 1); + PYDICT_PARSE_KWARGS(kwargs, lambda1, 0.1); + PYDICT_PARSE_KWARGS(kwargs, lambda2, 1); + PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1.0); // NOTICE! 
opt will not delete until system exist PyObject* obj = PyCapsule_New(opt, nullptr, nullptr); diff --git a/core/ps/optimizer/ada_grad_kernel.cc b/core/ps/optimizer/ada_grad_kernel.cc index f4b50fc..24c7faa 100644 --- a/core/ps/optimizer/ada_grad_kernel.cc +++ b/core/ps/optimizer/ada_grad_kernel.cc @@ -89,7 +89,7 @@ SparseAdaGradValue::SparseAdaGradValue(int dim, const AdaGrad* opt) { } void SparseAdaGradValue::Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim) { - show_ += grad_info.batch_show; + delta_show += grad_info.batch_show; float* w = Weight(); @@ -112,7 +112,7 @@ void SparseAdaGradValue::Serialize(std::ostream& os, int dim) { } os << g2sum_ << "\t"; - os << show_; + os << show; } void SparseAdaGradValue::DeSerialize(std::istream& is, int dim) { @@ -121,11 +121,7 @@ void SparseAdaGradValue::DeSerialize(std::istream& is, int dim) { } is >> g2sum_; - is >> show_; -} - -void SparseAdaGradValue::ShowDecay(const AdaGrad* opt) { - show_ *= opt->show_decay_rate; + is >> show; } } // namespace tensornet diff --git a/core/ps/optimizer/ada_grad_kernel.h b/core/ps/optimizer/ada_grad_kernel.h index 3814a36..864d3c5 100644 --- a/core/ps/optimizer/ada_grad_kernel.h +++ b/core/ps/optimizer/ada_grad_kernel.h @@ -53,7 +53,8 @@ class DenseAdaGradValue { std::ostream& operator<<(std::ostream& os, const DenseAdaGradValue& value); std::istream& operator>>(std::istream& is, DenseAdaGradValue& value); -struct alignas(4) SparseAdaGradValue { +struct alignas(4) SparseAdaGradValue + : public SparseOptValue { public: SparseAdaGradValue(int dim, const AdaGrad* opt); @@ -73,15 +74,12 @@ struct alignas(4) SparseAdaGradValue { void Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim); - void ShowDecay(const AdaGrad* opt); - void Serialize(std::ostream& os, int dim); void DeSerialize(std::istream& is, int dim); private: float g2sum_; - float show_ = 0.0; float data_[0]; }; diff --git a/core/ps/optimizer/adam_kernel.cc b/core/ps/optimizer/adam_kernel.cc index 48f058e..9020e38 100644 --- a/core/ps/optimizer/adam_kernel.cc +++ b/core/ps/optimizer/adam_kernel.cc @@ -104,7 +104,7 @@ SparseAdamValue::SparseAdamValue(int dim, const Adam* opt) { } void SparseAdamValue::Apply(const Adam* opt, SparseGradInfo& grad_info, int dim) { - show_ += grad_info.batch_show; + delta_show += grad_info.batch_show; float* w = Weight(); float* m = M(dim); @@ -129,7 +129,7 @@ void SparseAdamValue::Serialize(std::ostream& os, int dim) { os << v[i] << "\t"; } - os << show_; + os << show; } void SparseAdamValue::DeSerialize(std::istream& is, int dim) { @@ -143,7 +143,7 @@ void SparseAdamValue::DeSerialize(std::istream& is, int dim) { is >> v[i]; } - is >> show_; + is >> show; } } // namespace tensornet { diff --git a/core/ps/optimizer/adam_kernel.h b/core/ps/optimizer/adam_kernel.h index 1a5d2f0..7f5311f 100644 --- a/core/ps/optimizer/adam_kernel.h +++ b/core/ps/optimizer/adam_kernel.h @@ -56,7 +56,8 @@ class DenseAdamValue { std::ostream& operator<<(std::ostream& os, const DenseAdamValue& value); std::istream& operator>>(std::istream& is, DenseAdamValue& value); -struct alignas(4) SparseAdamValue { +struct alignas(4) SparseAdamValue + : public SparseOptValue { public: SparseAdamValue(int dim, const Adam* opt); ~SparseAdamValue() = default; @@ -75,8 +76,6 @@ struct alignas(4) SparseAdamValue { void Apply(const Adam* opt, SparseGradInfo& grad_info, int dim); - void ShowDecay(const Adam* opt) {} - void Serialize(std::ostream& os, int dim); void DeSerialize(std::istream& is, int dim); @@ -100,7 +99,6 @@ struct alignas(4) 
SparseAdamValue { } private: - float show_ = 0.0; float data_[0]; }; diff --git a/core/ps/optimizer/data_struct.h b/core/ps/optimizer/data_struct.h index 266ada3..e5ab2b5 100644 --- a/core/ps/optimizer/data_struct.h +++ b/core/ps/optimizer/data_struct.h @@ -22,6 +22,16 @@ struct SparseGradInfo { int batch_show; }; +struct alignas(4) SparseOptValue { + float show = 0.0; + int delta_show = 0; + + void ShowDecay(float decay_rate) { + show = (1 - decay_rate) * delta_show + decay_rate * show; + delta_show = 0; + } +}; + } // namespace tensornet { #endif // !TENSORNET_OPTIMIZER_DATA_STRUCT_H_ diff --git a/core/ps/optimizer/ftrl_kernel.cc b/core/ps/optimizer/ftrl_kernel.cc index f3ecd14..621444a 100644 --- a/core/ps/optimizer/ftrl_kernel.cc +++ b/core/ps/optimizer/ftrl_kernel.cc @@ -50,13 +50,15 @@ SparseFtrlValue::SparseFtrlValue(int dim, const Ftrl* opt) { float* n = N(dim); for (int i = 0; i < dim; ++i) { - w[i] = distribution(reng) * opt->initial_range; + w[i] = distribution(reng) * opt->initial_scale; z[i] = 0; n[i] = 0; } } void SparseFtrlValue::Apply(const Ftrl* opt, SparseGradInfo& grad_info, int dim) { + delta_show += grad_info.batch_show; + float* w = Weight(); float* z = Z(dim); float* n = N(dim); @@ -90,7 +92,7 @@ void SparseFtrlValue::Serialize(std::ostream& os, int dim) { os << n[i] << "\t"; } - os << show_; + os << show; } void SparseFtrlValue::DeSerialize(std::istream& is, int dim) { @@ -104,11 +106,7 @@ void SparseFtrlValue::DeSerialize(std::istream& is, int dim) { is >> n[i]; } - is >> show_; -} - -void SparseFtrlValue::ShowDecay(const Ftrl* opt) { - show_ *= opt->show_decay_rate; + is >> show; } } // namespace tensornet diff --git a/core/ps/optimizer/ftrl_kernel.h b/core/ps/optimizer/ftrl_kernel.h index 47e52c4..0b5c782 100644 --- a/core/ps/optimizer/ftrl_kernel.h +++ b/core/ps/optimizer/ftrl_kernel.h @@ -52,7 +52,8 @@ class DenseFtrlValue { std::ostream& operator<<(std::ostream& os, const DenseFtrlValue& value); std::istream& operator>>(std::istream& is, DenseFtrlValue& value); -struct alignas(4) SparseFtrlValue { +struct alignas(4) SparseFtrlValue + : public SparseOptValue { public: SparseFtrlValue(int dim, const Ftrl* opt); @@ -72,8 +73,6 @@ struct alignas(4) SparseFtrlValue { void Apply(const Ftrl* opt, SparseGradInfo& grad_info, int dim); - void ShowDecay(const Ftrl* opt); - void Serialize(std::ostream& os, int dim); void DeSerialize(std::istream& is, int dim); @@ -96,7 +95,6 @@ struct alignas(4) SparseFtrlValue { } private: - float show_ = 0.0; float data_[0]; }; diff --git a/core/ps/optimizer/optimizer.h b/core/ps/optimizer/optimizer.h index ebfdca7..3a67798 100644 --- a/core/ps/optimizer/optimizer.h +++ b/core/ps/optimizer/optimizer.h @@ -29,12 +29,6 @@ typedef std::shared_ptr SparseOptKernelSharedPtr; class OptimizerBase { public: - OptimizerBase(float lr) - : learning_rate(lr) { - } - - virtual ~OptimizerBase() { } - virtual DenseOptKernelSharedPtr CreateDenseOptKernel( int offset_begin, int offset_end) const = 0; @@ -44,20 +38,12 @@ class OptimizerBase { public: float learning_rate = 0.01; + float show_decay_rate = 0.98; + float feature_drop_show = 0.02; }; class Adam : public OptimizerBase { public: - Adam(float lr, float b1, float b2, float eps, float initial_scale) - : OptimizerBase(lr) - , beta1(b1) - , beta2(b2) - , epsilon(eps) - , initial_scale(initial_scale) { - } - - ~Adam() { } - virtual DenseOptKernelSharedPtr CreateDenseOptKernel( int offset_begin, int offset_end) const; @@ -76,20 +62,6 @@ class Adam : public OptimizerBase { class AdaGrad : public 
OptimizerBase {
 public:
-    AdaGrad(float lr, float initial_g2sum, float initial_scale,
-            float epsilon, float grad_decay_rate, float mom_decay_rate,
-            float show_decay_rate)
-        : OptimizerBase(lr)
-        , initial_g2sum(initial_g2sum)
-        , initial_scale(initial_scale)
-        , epsilon(epsilon)
-        , grad_decay_rate(grad_decay_rate)
-        , mom_decay_rate(mom_decay_rate)
-        , show_decay_rate(show_decay_rate) {
-    }
-
-    ~AdaGrad() { }
-
     virtual DenseOptKernelSharedPtr CreateDenseOptKernel(
         int offset_begin, int offset_end) const;
 
@@ -105,23 +77,10 @@ class AdaGrad : public OptimizerBase {
     float epsilon = 1e-08;
     float grad_decay_rate = 1.0;
     float mom_decay_rate = 0.9;
-    float show_decay_rate = 0.98;
 };
 
 class Ftrl : public OptimizerBase {
 public:
-    Ftrl(float lr, float initial_range, float beta,
-         float lambda1, float lambda2, float show_decay_rate)
-        : OptimizerBase(lr)
-        , initial_range(initial_range)
-        , beta(beta)
-        , lambda1(lambda1)
-        , lambda2(lambda2)
-        ,show_decay_rate(show_decay_rate) {
-    }
-
-    ~Ftrl() {}
-
     virtual DenseOptKernelSharedPtr CreateDenseOptKernel(
         int offset_begin, int offset_end) const;
 
@@ -132,11 +91,10 @@ class Ftrl : public OptimizerBase {
     }
 
 public:
-    float initial_range = 0;
+    float initial_scale = 0;
     float beta = 1;
     float lambda1 = 0.1;
     float lambda2 = 1;
-    float show_decay_rate = 0.98;
 };
 
 } // namespace tensornet {
diff --git a/core/ps/optimizer/optimizer_kernel.h b/core/ps/optimizer/optimizer_kernel.h
index 7a28ab6..e87b3f9 100644
--- a/core/ps/optimizer/optimizer_kernel.h
+++ b/core/ps/optimizer/optimizer_kernel.h
@@ -343,9 +343,11 @@ class SparseKernelBlock {
         os << "dim:" << block.dim_ << std::endl;
 
         for (const auto& value : block.values_) {
-            os << value.first << "\t";
-            value.second->Serialize(os, block.dim_);
-            os << std::endl;
+            if (value.second->show > block.opt_->feature_drop_show) {
+                os << value.first << "\t";
+                value.second->Serialize(os, block.dim_);
+                os << std::endl;
+            }
         }
 
         return os;
@@ -376,7 +378,7 @@ class SparseKernelBlock {
     void ShowDecay() {
         for (auto& iter : values_) {
             ValueType* value = iter.second;
-            value->ShowDecay(opt_);
+            value->ShowDecay(opt_->show_decay_rate);
         }
     }
 
@@ -419,7 +421,7 @@ class SparseOptimizerKernel : public SparseOptimizerKernelBase {
         for (size_t i = 0; i < SPARSE_KERNEL_BLOCK_NUM; ++i) {
             threads.push_back(std::thread([this, i, &filepath]() {
                 std::string file = filepath;
-                file.append("/sparse_block_").append(std::to_string(i)).append(".gz");
+                file.append("/block_").append(std::to_string(i)).append(".gz");
 
                 FileWriterSink writer_sink(file, FCT_ZLIB);
 
@@ -441,7 +443,7 @@ class SparseOptimizerKernel : public SparseOptimizerKernelBase {
         for (size_t i = 0; i < SPARSE_KERNEL_BLOCK_NUM; ++i) {
             threads.push_back(std::thread([this, i, &filepath]() {
                 std::string file = filepath;
-                file.append("/sparse_block_").append(std::to_string(i)).append(".gz");
+                file.append("/block_").append(std::to_string(i)).append(".gz");
 
                 FileReaderSource reader_source(file, FCT_ZLIB);
                 boost::iostreams::stream in_stream(reader_source);
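
Reviewer note, not part of the patches themselves: the standalone C++ sketch below illustrates how the new moving-average ShowDecay() from core/ps/optimizer/data_struct.h is expected to interact with the feature_drop_show check that SparseKernelBlock's operator<< now applies before serializing a key. The SparseOptValue struct is copied from the patch and the default values come from OptimizerBase (show_decay_rate = 0.98, feature_drop_show = 1 - 0.98 = 0.02); the main() driver and the sample numbers are hypothetical and only show the expected behaviour.

// Standalone sketch -- not part of the patch. SparseOptValue is copied from
// core/ps/optimizer/data_struct.h; the main() driver and sample numbers are
// hypothetical.
#include <iostream>

struct SparseOptValue {
    float show = 0.0;
    int delta_show = 0;

    // Moving average: fold the shows accumulated since the last decay pass
    // into the running value, then reset the per-pass counter.
    void ShowDecay(float decay_rate) {
        show = (1 - decay_rate) * delta_show + decay_rate * show;
        delta_show = 0;
    }
};

int main() {
    SparseOptValue value;
    const float show_decay_rate = 0.98;   // OptimizerBase default
    const float feature_drop_show = 0.02; // OptimizerBase default (1 - 0.98)

    // A feature hit 5 times in the current pass stays above the threshold,
    // so SparseKernelBlock::operator<< would still serialize it on Save.
    value.delta_show = 5;
    value.ShowDecay(show_decay_rate);
    std::cout << value.show << std::endl;                       // 0.1

    // With no further shows the value decays geometrically; after roughly
    // 80 decay passes it falls below feature_drop_show and the key would be
    // skipped at the next save.
    for (int i = 0; i < 100; ++i) {
        value.ShowDecay(show_decay_rate);
    }
    std::cout << (value.show > feature_drop_show) << std::endl; // 0
    return 0;
}

Under these defaults a key has to keep receiving shows to survive successive save cycles, which is the pruning behaviour the commit message of patch 2 describes.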