Adagrad Optimizer Implementation (#154)
* Adagrad Implementation

* Resolved comments

* Added test for adagrad

* Comment

* Fix L2 penalty and learning rate decay

* Add Adagrad to the list in README

* Bump minor version

---------

Co-authored-by: milancurcic <[email protected]>
Spnetic-5 and milancurcic authored Aug 6, 2023
1 parent 6adc1c2 commit b119194
Showing 6 changed files with 158 additions and 10 deletions.
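For context, the new optimizer is used like the existing ones: construct an adagrad value and pass it to network % update inside the training loop. A minimal sketch follows; the network layout, training data, and hyperparameter values are illustrative only and assume the usual network([...]) layer-array constructor, while the adagrad constructor arguments and the forward/backward/update pattern mirror the example and test added in this commit.

  ! Illustrative sketch only: layer sizes, data, and loop count are made up;
  ! the adagrad arguments and update call mirror the code added in this commit.
  program adagrad_usage_sketch
    use nf, only: dense, input, network, adagrad
    implicit none
    type(network) :: net
    real :: x, y
    integer :: n

    ! A toy 1-input, 1-output dense network (sizes chosen arbitrarily here).
    net = network([input(1), dense(3), dense(1)])

    do n = 1, 1000
      ! Placeholder sample; a real program would loop over a training set.
      x = 0.5
      y = x**2
      call net % forward([x])
      call net % backward([y])
      ! The new Adagrad optimizer, with optional L2 penalty and learning rate decay.
      call net % update( &
        optimizer=adagrad( &
          learning_rate=0.01, &
          epsilon=1e-8, &
          weight_decay_l2=1e-4, &
          learning_rate_decay=0.99 &
        ) &
      )
    end do

  end program adagrad_usage_sketch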
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
* Training and inference of dense (fully connected) and convolutional neural
networks
* Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
RMSProp, Adam, AdamW
RMSProp, Adagrad, Adam, AdamW
* More than a dozen activation functions and their derivatives
* Loading dense and convolutional models from Keras HDF5 (.h5) files
* Data-based parallelism
77 changes: 75 additions & 2 deletions example/quadratic.f90
@@ -4,10 +4,10 @@ program quadratic_fit
! descent.
use nf, only: dense, input, network
use nf_dense_layer, only: dense_layer
use nf_optimizers, only: sgd, rmsprop, adam
use nf_optimizers, only: sgd, rmsprop, adam, adagrad

implicit none
type(network) :: net(9)
type(network) :: net(11)

! Training parameters
integer, parameter :: num_epochs = 1000
@@ -95,6 +95,17 @@ program quadratic_fit
beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
)

! Adagrad optimizer
call adagrad_optimizer( &
net(10), x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
)

! Adagrad optimizer with L2 regularization and learning rate decay
call adagrad_optimizer( &
net(11), x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
weight_decay_l2=1e-4, learning_rate_decay=0.99 &
)

contains

real elemental function quadratic(x) result(y)
@@ -358,6 +369,68 @@ subroutine adam_optimizer( &

end subroutine adam_optimizer

subroutine adagrad_optimizer( &
net, x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
weight_decay_l2, learning_rate_decay &
)
! Adagrad optimizer for updating weights using the adaptive gradient algorithm
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate, epsilon
real, intent(in), optional :: weight_decay_l2
real, intent(in), optional :: learning_rate_decay
integer, intent(in) :: num_epochs
integer :: i, n
real, allocatable :: ypred(:)
real :: weight_decay_l2_val
real :: learning_rate_decay_val

! Set the default value for weight_decay_l2
if (.not. present(weight_decay_l2)) then
weight_decay_l2_val = 0.0
else
weight_decay_l2_val = weight_decay_l2
end if

! Set the default value for learning_rate_decay
if (.not. present(learning_rate_decay)) then
learning_rate_decay_val = 0.0
else
learning_rate_decay_val = learning_rate_decay
end if

print '(a)', 'Adagrad optimizer'
print '(34("-"))'

do n = 1, num_epochs

do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
end do

call net % update( &
adagrad( &
learning_rate=learning_rate, &
epsilon=epsilon, &
weight_decay_l2=weight_decay_l2_val, &
learning_rate_decay=learning_rate_decay_val &
) &
)

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine adagrad_optimizer

subroutine shuffle(arr)
! Shuffle an array using the Fisher-Yates algorithm.
integer, intent(inout) :: arr(:)
2 changes: 1 addition & 1 deletion fpm.toml
@@ -1,5 +1,5 @@
name = "neural-fortran"
version = "0.14.0"
version = "0.15.0"
license = "MIT"
author = "Milan Curcic"
maintainer = "[email protected]"
2 changes: 1 addition & 1 deletion src/nf.f90
@@ -5,7 +5,7 @@ module nf
use nf_layer_constructors, only: &
conv2d, dense, flatten, input, maxpool2d, reshape
use nf_network, only: network
use nf_optimizers, only: sgd, rmsprop, adam
use nf_optimizers, only: sgd, rmsprop, adam, adagrad
use nf_activation, only: activation_function, elu, exponential, &
gaussian, linear, relu, leaky_relu, &
sigmoid, softmax, softplus, step, tanhf, &
61 changes: 58 additions & 3 deletions src/nf/nf_optimizers.f90
@@ -13,7 +13,7 @@ module nf_optimizers
implicit none

private
public :: optimizer_base_type, sgd, rmsprop, adam
public :: optimizer_base_type, sgd, rmsprop, adam, adagrad

type, abstract :: optimizer_base_type
real :: learning_rate = 0.01
@@ -87,6 +87,23 @@ end subroutine minimize
procedure :: minimize => minimize_adam
end type adam

type, extends(optimizer_base_type) :: adagrad
!! Adagrad optimizer by Duchi et al. (2011)
!!
!! Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient
!! methods for online learning and stochastic optimization. Journal
!! of Machine Learning Research, 12(Jul), pp.2121-2159.
!! http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
real :: epsilon = 1e-8 ! small constant added to the denominator for numerical stability
real :: weight_decay_l2 = 0 ! L2 penalty; 0 disables regularization
real :: learning_rate_decay = 0 ! decay factor d in learning_rate / (1 + (t - 1)*d); 0 disables decay
real, allocatable, private :: sum_squared_gradient(:) ! running sum of squared gradients
integer, private :: t = 0 ! time step counter
contains
procedure :: init => init_adagrad
procedure :: minimize => minimize_adagrad
end type adagrad

contains

impure elemental subroutine init_sgd(self, num_params)
@@ -186,11 +203,49 @@ pure subroutine minimize_adam(self, param, gradient)

! Update parameters.
param = param &
- self % learning_rate * m_hat / (sqrt(v_hat) + self % epsilon) &
- self % weight_decay_decoupled * param
- self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) &
+ self % weight_decay_decoupled * param)

end associate

end subroutine minimize_adam


impure elemental subroutine init_adagrad(self, num_params)
class(adagrad), intent(inout) :: self
integer, intent(in) :: num_params
if (.not. allocated(self % sum_squared_gradient)) then
allocate(self % sum_squared_gradient(num_params))
self % sum_squared_gradient = 0
end if
end subroutine init_adagrad


pure subroutine minimize_adagrad(self, param, gradient)
!! Concrete implementation of an Adagrad optimizer update rule.
class(adagrad), intent(inout) :: self
real, intent(inout) :: param(:)
real, intent(in) :: gradient(:)

! Update the current time step
self % t = self % t + 1

associate( &
! If weight_decay_l2 > 0, use L2 regularization;
! otherwise, default to regular Adagrad.
g => gradient + self % weight_decay_l2 * param, &
! Decay the learning rate as a function of the current time step.
learning_rate => self % learning_rate &
/ (1 + (self % t - 1) * self % learning_rate_decay) &
)

self % sum_squared_gradient = self % sum_squared_gradient + g**2

param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+ self % epsilon)

end associate

end subroutine minimize_adagrad
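
For reference, the update rule implemented by minimize_adagrad above can be written as follows; this is a restatement of the code in this commit, not part of the diff, with lambda = weight_decay_l2, d = learning_rate_decay, and G_t the running sum_squared_gradient:

  g_t = \nabla_\theta L(\theta_{t-1}) + \lambda\,\theta_{t-1}
  G_t = G_{t-1} + g_t^2
  \eta_t = \eta \,/\, \bigl(1 + (t - 1)\,d\bigr)
  \theta_t = \theta_{t-1} - \eta_t\, g_t \,/\, \bigl(\sqrt{G_t} + \epsilon\bigr)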

end module nf_optimizers
24 changes: 22 additions & 2 deletions test/test_optimizers.f90
@@ -1,10 +1,10 @@
program test_optimizers

use nf, only: dense, input, network, rmsprop, sgd, adam
use nf, only: dense, input, network, rmsprop, sgd, adam, adagrad
use iso_fortran_env, only: stderr => error_unit

implicit none
type(network) :: net(5)
type(network) :: net(6)
real, allocatable :: x(:), y(:)
real, allocatable :: ypred(:)
integer, parameter :: num_iterations = 1000
@@ -116,6 +116,26 @@ program test_optimizers
ok = .false.
end if

! Test Adagrad optimizer
converged = .false.

do n = 0, num_iterations

call net(6) % forward(x)
call net(6) % backward(y)
call net(6) % update(optimizer=adagrad(learning_rate=0.01, weight_decay_l2=1e-4, learning_rate_decay=0.99))

ypred = net(6) % predict(x)
converged = check_convergence(y, ypred)
if (converged) exit

end do

if (.not. converged) then
write(stderr, '(a)') 'adagrad should converge in simple training.. failed'
ok = .false.
end if


if (ok) then
print '(a)', 'test_optimizers: All tests passed.'
