tiny_dnn 1.0.0
A header only, dependency-free deep learning framework in C++11
optimizer.h
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
    names of its contributors may be used to endorse or promote products
    derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "tiny_dnn/util/util.h"
#include <unordered_map>

namespace tiny_dnn {

/**
 * base class of optimizer
 * usesHessian : true if an optimizer uses hessian (2nd order derivative of the loss function)
 **/
struct optimizer {
    optimizer() = default;
    optimizer(const optimizer &) = default;
#ifndef CNN_DEFAULT_MOVE_CONSTRUCTOR_UNAVAILABLE
    optimizer(optimizer &&) = default;
#endif
    optimizer &operator =(const optimizer &) = default;
#ifndef CNN_DEFAULT_ASSIGNMENT_OPERATOR_UNAVAILABLE
    optimizer &operator =(optimizer &&) = default;
#endif
    virtual ~optimizer() = default;
    virtual void update(const vec_t& dW, vec_t &W, bool parallelize) = 0;
    virtual void reset() {} // override to implement pre-learning action
};

// helper class to hold N values for each weight
template <int N>
struct stateful_optimizer : public optimizer {
    void reset() override {
        for (auto& e : E_) e.clear();
    }

protected:
    template <int Index>
    vec_t& get(const vec_t& key) {
        static_assert(Index < N, "index out of range");
        if (E_[Index][&key].empty())
            E_[Index][&key].resize(key.size(), float_t());
        return E_[Index][&key];
    }
    std::unordered_map<const vec_t*, vec_t> E_[N];
};
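
// Note: get<Index>(W) keys the state on the address of the weight vector itself, so each
// weight buffer in a network receives its own lazily allocated, zero-initialized state
// vector of the same length as W.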

/**
 * adaptive gradient method
 **/
struct adagrad : public stateful_optimizer<1> {
    adagrad() : alpha(float_t(0.01)), eps(float_t(1e-8)) {}

    void update(const vec_t& dW, vec_t &W, bool parallelize) {
        vec_t& g = get<0>(W);
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            g[i] += dW[i] * dW[i];
            W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps);
        });
    }

    float_t alpha; // learning rate
private:
    float_t eps;
};
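
// For reference, the loop above performs, per element:
//   g[i] += dW[i]^2
//   W[i] -= alpha * dW[i] / (sqrt(g[i]) + eps)
// so the effective step size shrinks for weights with large accumulated squared gradients.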

/**
 * RMSprop
 **/
struct RMSprop : public stateful_optimizer<1> {
    RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        vec_t& g = get<0>(W);

        for_i(parallelize, static_cast<int>(W.size()), [&](int i)
        {
            g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i];
            W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps);
        });
    }

    float_t alpha; // learning rate
    float_t mu;    // decay term
private:
    float_t eps;   // constant value to avoid zero-division
};
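
// For reference, RMSprop replaces adagrad's running sum with an exponential moving
// average of squared gradients:
//   g[i]  = mu * g[i] + (1 - mu) * dW[i]^2
//   W[i] -= alpha * dW[i] / sqrt(g[i] + eps)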

/**
 * Adam: a new optimizer (2015)
 **/
struct adam : public stateful_optimizer<2> {
    adam() : alpha(float_t(0.001)), b1(float_t(0.9)), b2(float_t(0.999)),
             b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        vec_t& mt = get<0>(W);
        vec_t& vt = get<1>(W);

        b1_t *= b1;
        b2_t *= b2;

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i];
            vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i];

            W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps);
        });
    }

    float_t alpha; // learning rate
    float_t b1;    // decay term
    float_t b2;    // decay term
    float_t b1_t;  // decay term power t
    float_t b2_t;  // decay term power t
private:
    float_t eps;   // constant value to avoid zero-division
};
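
// For reference, the loop above keeps first- and second-moment estimates and applies
// bias correction through the accumulated powers b1_t and b2_t:
//   mt[i]  = b1 * mt[i] + (1 - b1) * dW[i]
//   vt[i]  = b2 * vt[i] + (1 - b2) * dW[i]^2
//   W[i]  -= alpha * (mt[i] / (1 - b1_t)) / sqrt(vt[i] / (1 - b2_t) + eps)
// b1_t and b2_t are multiplied by b1 and b2 once per update() call, so they track
// successively higher powers of the decay terms.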

/**
 * SGD without momentum
 **/
struct gradient_descent : public optimizer {
    gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            W[i] = W[i] - alpha * (dW[i] + lambda * W[i]);
        });
    }

    float_t alpha;  // learning rate
    float_t lambda; // weight decay
};
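
// The lambda term adds lambda * W[i] to the gradient, i.e. plain L2 weight decay
// on top of vanilla SGD.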

/**
 * SGD with momentum
 **/
struct momentum : public stateful_optimizer<1> {
public:
    momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        vec_t& dWprev = get<0>(W);

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda);
            W[i]      += V;
            dWprev[i]  = V;
        });
    }

    float_t alpha;  // learning rate
    float_t lambda; // weight decay
    float_t mu;     // momentum
};
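
// Classical momentum: the previous step V is kept per weight buffer through
// stateful_optimizer<1> and read back as dWprev on the next call.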

} // namespace tiny_dnn
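
A minimal usage sketch, separate from the header above. It assumes vec_t behaves like a std::vector of float_t (as declared in tiny_dnn/util/util.h) and that the umbrella header tiny_dnn/tiny_dnn.h pulls in this file. In a real training run the optimizer object is handed to the network's training routine, which calls update() internally; here update() is driven by hand purely to show the interface.

#include <cstdio>
#include "tiny_dnn/tiny_dnn.h"

int main() {
    tiny_dnn::adam opt;                              // any optimizer from this header works here

    tiny_dnn::vec_t W(4, tiny_dnn::float_t(0.5));    // one weight buffer
    tiny_dnn::vec_t dW(4, tiny_dnn::float_t(0.1));   // its gradient

    // apply a few steps; per-buffer state (mt, vt) is kept inside the optimizer
    for (int step = 0; step < 3; ++step)
        opt.update(dW, W, /*parallelize=*/false);

    for (auto w : W) std::printf("%f\n", static_cast<double>(w));
    return 0;
}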