Merge pull request #44 from Qihoo360/zhangys
show decay refine
zhangys-lucky authored Dec 25, 2020
2 parents b899dca + 3548339 commit 43521f7
Showing 12 changed files with 90 additions and 194 deletions.
138 changes: 34 additions & 104 deletions core/main/py_wrapper.cc
@@ -33,6 +33,15 @@ using std::string;

using namespace tensornet;

#define PYDICT_PARSE_KWARGS(kwargs, name, default_value) \
opt->name = default_value; \
{ \
PyObject* item = PyDict_GetItemString(kwargs.ptr(), #name); \
if (NULL != item) { \
opt->name = PyFloat_AsDouble(item); \
} \
}

PYBIND11_MODULE(_pywrap_tn, m) {
m.def("init", []() {
PsCluster* cluster = PsCluster::Instance();
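The new PYDICT_PARSE_KWARGS macro above replaces the hand-written "look up the key, keep the default if absent" blocks that the removed lines below repeat for every hyperparameter; like the real macro, it assumes a pointer named `opt` is in scope and stringizes the field name as the dict key. A minimal, self-contained sketch of the same pattern, using a plain std::map in place of the pybind11 kwargs dict (FakeOpt and PARSE_KWARG are illustrative stand-ins, not tensornet types):

```cpp
#include <iostream>
#include <map>
#include <string>

// Stand-in for an optimizer whose fields the kwargs parsing fills in.
struct FakeOpt {
    double learning_rate = 0.0;
    double show_decay_rate = 0.0;
};

// Same shape as one PYDICT_PARSE_KWARGS expansion: assign the default first,
// then overwrite it only if the key is present in the kwargs dict.
// Like the real macro, this relies on a pointer named `opt` being in scope.
#define PARSE_KWARG(kwargs, name, default_value)   \
    opt->name = default_value;                     \
    {                                              \
        auto it = (kwargs).find(#name);            \
        if (it != (kwargs).end()) {                \
            opt->name = it->second;                \
        }                                          \
    }

int main() {
    std::map<std::string, double> kwargs = {{"learning_rate", 0.05}};
    FakeOpt* opt = new FakeOpt();

    PARSE_KWARG(kwargs, learning_rate, 0.01);    // key present -> 0.05
    PARSE_KWARG(kwargs, show_decay_rate, 0.98);  // key absent  -> default 0.98

    std::cout << opt->learning_rate << " " << opt->show_decay_rate << "\n";
    delete opt;
    return 0;
}
```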
@@ -54,130 +63,51 @@ PYBIND11_MODULE(_pywrap_tn, m) {
return true;
})
.def("AdaGrad", [](py::kwargs kwargs) {
float learning_rate = 0.01;
float initial_g2sum = 0;
float initial_scale = 1;
float epsilon = 1e-8;
float grad_decay_rate = 1.0;
float mom_decay_rate = 1.0;
float show_decay_rate = 0.98;

PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate");
if (NULL != item) {
learning_rate = PyFloat_AsDouble(item);
}
auto opt = new AdaGrad();

item = PyDict_GetItemString(kwargs.ptr(), "initial_g2sum");
if (NULL != item) {
initial_g2sum = PyFloat_AsDouble(item);
}
PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.01);
PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98);
PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate);

item = PyDict_GetItemString(kwargs.ptr(), "initial_scale");
if (NULL != item) {
initial_scale = PyFloat_AsDouble(item);
}
item = PyDict_GetItemString(kwargs.ptr(), "epsilon");
if (NULL != item) {
epsilon = PyFloat_AsDouble(item);
}
item = PyDict_GetItemString(kwargs.ptr(), "grad_decay_rate");
if (NULL != item) {
grad_decay_rate = PyFloat_AsDouble(item);
}
item = PyDict_GetItemString(kwargs.ptr(), "mom_decay_rate");
if (NULL != item) {
mom_decay_rate = PyFloat_AsDouble(item);
}
item = PyDict_GetItemString(kwargs.ptr(), "show_decay_rate");
if (NULL != item) {
show_decay_rate = PyFloat_AsDouble(item);
}

auto opt = new AdaGrad(learning_rate, initial_g2sum, initial_scale, epsilon,
grad_decay_rate, mom_decay_rate, show_decay_rate);
PYDICT_PARSE_KWARGS(kwargs, initial_g2sum, 0);
PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1);
PYDICT_PARSE_KWARGS(kwargs, epsilon, 1e-8);
PYDICT_PARSE_KWARGS(kwargs, grad_decay_rate, 1.0);
PYDICT_PARSE_KWARGS(kwargs, mom_decay_rate, 1.0);

// NOTICE! opt will not be deleted until the system exits
PyObject* obj = PyCapsule_New(opt, nullptr, nullptr);

return py::reinterpret_steal<py::object>(obj);
})
.def("Adam", [](py::kwargs kwargs) {
float learning_rate = 0.001;
float beta1 = 0.9;
float beta2 = 0.999;
float epsilon = 1e-8;
float initial_scale = 1.0;

PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate");
if (NULL != item) {
learning_rate = PyFloat_AsDouble(item);
}

item = PyDict_GetItemString(kwargs.ptr(), "beta1");
if (NULL != item) {
beta1 = PyFloat_AsDouble(item);
}

item = PyDict_GetItemString(kwargs.ptr(), "beta2");
if (NULL != item) {
beta2 = PyFloat_AsDouble(item);
}
auto opt = new Adam();

item = PyDict_GetItemString(kwargs.ptr(), "epsilon");
if (NULL != item) {
epsilon = PyFloat_AsDouble(item);
}
item = PyDict_GetItemString(kwargs.ptr(), "initial_scale");
if (NULL != item) {
initial_scale = PyFloat_AsDouble(item);
}
PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.001);
PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98);
PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate);

auto opt = new Adam(learning_rate, beta1, beta2, epsilon, initial_scale);
PYDICT_PARSE_KWARGS(kwargs, beta1, 0.9);
PYDICT_PARSE_KWARGS(kwargs, beta2, 0.999);
PYDICT_PARSE_KWARGS(kwargs, epsilon, 1e-8);
PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1.0);

// NOTICE! opt will not be deleted until the system exits
PyObject* obj = PyCapsule_New(opt, nullptr, nullptr);

return py::reinterpret_steal<py::object>(obj);
})
.def("Ftrl", [](py::kwargs kwargs) {
float learning_rate = 0.05;
float initial_range = 0;
float beta = 1;
float lambda1 = 0.1;
float lambda2 = 1;
float show_decay_rate = 0.98;

PyObject* item = PyDict_GetItemString(kwargs.ptr(), "learning_rate");
if (NULL != item) {
learning_rate = PyFloat_AsDouble(item);
}
auto opt = new Ftrl();

item = PyDict_GetItemString(kwargs.ptr(), "initial_range");
if (NULL != item) {
initial_range = PyFloat_AsDouble(item);
}

item = PyDict_GetItemString(kwargs.ptr(), "beta");
if (NULL != item) {
beta = PyFloat_AsDouble(item);
}

item = PyDict_GetItemString(kwargs.ptr(), "lambda1");
if (NULL != item) {
lambda1 = PyFloat_AsDouble(item);
}

item = PyDict_GetItemString(kwargs.ptr(), "lambda2");
if (NULL != item) {
lambda2 = PyFloat_AsDouble(item);
}

item = PyDict_GetItemString(kwargs.ptr(), "show_decay_rate");
if (NULL != item) {
show_decay_rate = PyFloat_AsDouble(item);
}
PYDICT_PARSE_KWARGS(kwargs, learning_rate, 0.05);
PYDICT_PARSE_KWARGS(kwargs, show_decay_rate, 0.98);
PYDICT_PARSE_KWARGS(kwargs, feature_drop_show, 1 - opt->show_decay_rate);

auto opt = new Ftrl(learning_rate, initial_range, beta, lambda1, lambda2, show_decay_rate);
PYDICT_PARSE_KWARGS(kwargs, beta, 1);
PYDICT_PARSE_KWARGS(kwargs, lambda1, 0.1);
PYDICT_PARSE_KWARGS(kwargs, lambda2, 1);
PYDICT_PARSE_KWARGS(kwargs, initial_scale, 1.0);

// NOTICE! opt will not be deleted until the system exits
PyObject* obj = PyCapsule_New(opt, nullptr, nullptr);
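Each optimizer factory above wraps the newly constructed object in a PyCapsule with a null name and null destructor, which is why the NOTICE comment warns that the object is never freed. A hedged sketch of how the pointer can be recovered from such a capsule on the C++ side (UnwrapOptimizer and OptimizerBase are illustrative names, not tensornet's actual consumer):

```cpp
#include <Python.h>

struct OptimizerBase;  // opaque placeholder for whatever type the capsule carries

// Illustrative only: recover the raw pointer stored by
// PyCapsule_New(opt, nullptr, nullptr). The name passed here must match the
// name used at creation time, which is nullptr in py_wrapper.cc.
OptimizerBase* UnwrapOptimizer(PyObject* capsule) {
    void* raw = PyCapsule_GetPointer(capsule, nullptr);
    if (raw == nullptr) {
        PyErr_Print();  // PyCapsule_GetPointer sets an exception on mismatch
        return nullptr;
    }
    return static_cast<OptimizerBase*>(raw);
}
```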
10 changes: 3 additions & 7 deletions core/ps/optimizer/ada_grad_kernel.cc
@@ -89,7 +89,7 @@ SparseAdaGradValue::SparseAdaGradValue(int dim, const AdaGrad* opt) {
}

void SparseAdaGradValue::Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim) {
show_ += grad_info.batch_show;
delta_show += grad_info.batch_show;

float* w = Weight();

@@ -112,7 +112,7 @@ void SparseAdaGradValue::Serialize(std::ostream& os, int dim) {
}

os << g2sum_ << "\t";
os << show_;
os << show;
}

void SparseAdaGradValue::DeSerialize(std::istream& is, int dim) {
@@ -121,11 +121,7 @@ void SparseAdaGradValue::DeSerialize(std::istream& is, int dim) {
}

is >> g2sum_;
is >> show_;
}

void SparseAdaGradValue::ShowDecay(const AdaGrad* opt) {
show_ *= opt->show_decay_rate;
is >> show;
}

} // namespace tensornet
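Serialize and DeSerialize above write the per-dimension weights, g2sum and the now-inherited show counter as tab-separated text. A small round-trip sketch of that format using a toy struct (ToyValue is illustrative; the real class handles a dim-sized weight array through its trailing data_ storage):

```cpp
#include <iostream>
#include <sstream>

// Toy stand-in that mirrors the tab-separated text format used by
// SparseAdaGradValue::Serialize / DeSerialize.
struct ToyValue {
    float w[2] = {0, 0};
    float g2sum = 0;
    float show = 0;

    void Serialize(std::ostream& os) const {
        for (float x : w) { os << x << "\t"; }
        os << g2sum << "\t";
        os << show;
    }

    void DeSerialize(std::istream& is) {
        for (float& x : w) { is >> x; }
        is >> g2sum;
        is >> show;
    }
};

int main() {
    ToyValue a;
    a.w[0] = 0.5f; a.w[1] = -0.25f; a.g2sum = 1.5f; a.show = 3.0f;

    std::stringstream ss;
    a.Serialize(ss);

    ToyValue b;
    b.DeSerialize(ss);
    std::cout << b.w[0] << " " << b.w[1] << " " << b.g2sum << " " << b.show << "\n";
    return 0;
}
```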
6 changes: 2 additions & 4 deletions core/ps/optimizer/ada_grad_kernel.h
@@ -53,7 +53,8 @@ class DenseAdaGradValue {
std::ostream& operator<<(std::ostream& os, const DenseAdaGradValue& value);
std::istream& operator>>(std::istream& is, DenseAdaGradValue& value);

struct alignas(4) SparseAdaGradValue {
struct alignas(4) SparseAdaGradValue
: public SparseOptValue {
public:
SparseAdaGradValue(int dim, const AdaGrad* opt);

@@ -73,15 +74,12 @@ struct alignas(4) SparseAdaGradValue {

void Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim);

void ShowDecay(const AdaGrad* opt);

void Serialize(std::ostream& os, int dim);

void DeSerialize(std::istream& is, int dim);

private:
float g2sum_;
float show_ = 0.0;
float data_[0];
};
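SparseAdaGradValue (like the Adam and Ftrl values below) ends with a zero-length data_[0] array, a GCC/Clang extension: the fixed header fields, including the show/delta_show members now inherited from SparseOptValue, are followed directly by dim per-dimension floats in the same allocation. A hedged sketch of how such a value is typically created (ValueWithTail, AllocValue and FreeValue are illustrative, not tensornet's allocator):

```cpp
#include <new>

// Minimal analogue of a sparse value with a zero-length trailing array.
// data_[0] is a GCC/Clang extension; the payload lives right after the header.
struct alignas(4) ValueWithTail {
    float g2sum = 0.0f;
    float data_[0];

    float* Weight() { return data_; }
};

// Hypothetical helper: reserve the header plus dim trailing floats in one
// block, then construct the header in place.
ValueWithTail* AllocValue(int dim) {
    void* mem = ::operator new(sizeof(ValueWithTail) + sizeof(float) * dim);
    return new (mem) ValueWithTail();
}

void FreeValue(ValueWithTail* value) {
    value->~ValueWithTail();
    ::operator delete(value);
}
```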

6 changes: 3 additions & 3 deletions core/ps/optimizer/adam_kernel.cc
@@ -104,7 +104,7 @@ SparseAdamValue::SparseAdamValue(int dim, const Adam* opt) {
}

void SparseAdamValue::Apply(const Adam* opt, SparseGradInfo& grad_info, int dim) {
show_ += grad_info.batch_show;
delta_show += grad_info.batch_show;

float* w = Weight();
float* m = M(dim);
@@ -129,7 +129,7 @@ void SparseAdamValue::Serialize(std::ostream& os, int dim) {
os << v[i] << "\t";
}

os << show_;
os << show;
}

void SparseAdamValue::DeSerialize(std::istream& is, int dim) {
@@ -143,7 +143,7 @@ void SparseAdamValue::DeSerialize(std::istream& is, int dim) {
is >> v[i];
}

is >> show_;
is >> show;
}

} // namespace tensornet
6 changes: 2 additions & 4 deletions core/ps/optimizer/adam_kernel.h
@@ -56,7 +56,8 @@ class DenseAdamValue {
std::ostream& operator<<(std::ostream& os, const DenseAdamValue& value);
std::istream& operator>>(std::istream& is, DenseAdamValue& value);

struct alignas(4) SparseAdamValue {
struct alignas(4) SparseAdamValue
: public SparseOptValue {
public:
SparseAdamValue(int dim, const Adam* opt);
~SparseAdamValue() = default;
@@ -75,8 +76,6 @@ struct alignas(4) SparseAdamValue {

void Apply(const Adam* opt, SparseGradInfo& grad_info, int dim);

void ShowDecay(const Adam* opt) {}

void Serialize(std::ostream& os, int dim);

void DeSerialize(std::istream& is, int dim);
@@ -100,7 +99,6 @@ struct alignas(4) SparseAdamValue {
}

private:
float show_ = 0.0;
float data_[0];
};

10 changes: 10 additions & 0 deletions core/ps/optimizer/data_struct.h
@@ -22,6 +22,16 @@ struct SparseGradInfo {
int batch_show;
};

struct alignas(4) SparseOptValue {
float show = 0.0;
int delta_show = 0;

void ShowDecay(float decay_rate) {
show = (1 - decay_rate) * delta_show + decay_rate * show;
delta_show = 0;
}
};

} // namespace tensornet

#endif // !TENSORNET_OPTIMIZER_DATA_STRUCT_H_
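The new SparseOptValue base centralizes the show statistics: Apply() in each optimizer only accumulates delta_show, and ShowDecay folds that interval count into an exponentially decayed running show. A self-contained sketch of the update with made-up batch counts (for illustration the decay is applied after every batch; in the real code it is triggered separately):

```cpp
#include <iostream>

// Same update as SparseOptValue::ShowDecay: an exponential moving average of
// the show counts accumulated since the previous decay step.
struct ShowStat {
    float show = 0.0f;
    int delta_show = 0;

    void ShowDecay(float decay_rate) {
        show = (1 - decay_rate) * delta_show + decay_rate * show;
        delta_show = 0;
    }
};

int main() {
    ShowStat stat;
    const float decay_rate = 0.98f;          // default show_decay_rate in py_wrapper.cc
    const int batch_shows[] = {100, 50, 0};  // made-up per-batch show counts

    for (int batch_show : batch_shows) {
        stat.delta_show += batch_show;  // what each optimizer's Apply() now does
        stat.ShowDecay(decay_rate);     // 0.02 * delta_show + 0.98 * show
        std::cout << "show = " << stat.show << "\n";  // 2, 2.96, then ~2.9008
    }
    return 0;
}
```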
12 changes: 5 additions & 7 deletions core/ps/optimizer/ftrl_kernel.cc
@@ -50,13 +50,15 @@ SparseFtrlValue::SparseFtrlValue(int dim, const Ftrl* opt) {
float* n = N(dim);

for (int i = 0; i < dim; ++i) {
w[i] = distribution(reng) * opt->initial_range;
w[i] = distribution(reng) * opt->initial_scale;
z[i] = 0;
n[i] = 0;
}
}

void SparseFtrlValue::Apply(const Ftrl* opt, SparseGradInfo& grad_info, int dim) {
delta_show += grad_info.batch_show;

float* w = Weight();
float* z = Z(dim);
float* n = N(dim);
@@ -90,7 +92,7 @@ void SparseFtrlValue::Serialize(std::ostream& os, int dim) {
os << n[i] << "\t";
}

os << show_;
os << show;
}

void SparseFtrlValue::DeSerialize(std::istream& is, int dim) {
@@ -104,11 +106,7 @@ void SparseFtrlValue::DeSerialize(std::istream& is, int dim) {
is >> n[i];
}

is >> show_;
}

void SparseFtrlValue::ShowDecay(const Ftrl* opt) {
show_ *= opt->show_decay_rate;
is >> show;
}

} // namespace tensornet
6 changes: 2 additions & 4 deletions core/ps/optimizer/ftrl_kernel.h
@@ -52,7 +52,8 @@ class DenseFtrlValue {
std::ostream& operator<<(std::ostream& os, const DenseFtrlValue& value);
std::istream& operator>>(std::istream& is, DenseFtrlValue& value);

struct alignas(4) SparseFtrlValue {
struct alignas(4) SparseFtrlValue
: public SparseOptValue {
public:
SparseFtrlValue(int dim, const Ftrl* opt);

@@ -72,8 +73,6 @@ struct alignas(4) SparseFtrlValue {

void Apply(const Ftrl* opt, SparseGradInfo& grad_info, int dim);

void ShowDecay(const Ftrl* opt);

void Serialize(std::ostream& os, int dim);

void DeSerialize(std::istream& is, int dim);
@@ -96,7 +95,6 @@ struct alignas(4) SparseFtrlValue {
}

private:
float show_ = 0.0;
float data_[0];
};
