Commit

layernorm: rearrange into submodule

OneAdder committed Feb 17, 2025
1 parent 70d0f27 commit f08f804
Showing 2 changed files with 127 additions and 107 deletions.
124 changes: 17 additions & 107 deletions src/nf/nf_layernorm.f90
@@ -41,111 +41,21 @@ module function layernorm_layer_cons() &
end function layernorm_layer_cons
end interface layernorm_layer

contains
module function layernorm_layer_cons() &
result(res)
type(layernorm_layer) :: res

res % eps = 1e-5
end function layernorm_layer_cons

pure module subroutine forward(self, input)
class(layernorm_layer), intent(in out) :: self
real, intent(in) :: input(:, :)
real, allocatable :: normalized(:, :)
integer :: i

allocate(normalized(self % sequence_length, self % model_dimension))

! mu = x - MEAN_last_dim(x)
do concurrent(i = 1: self % model_dimension)
self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
end do

! square root of variance shifted by eps
self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)

! normalize mu by sigma along the first axis
do concurrent(i = 1: self % model_dimension)
normalized(:, i) = self % mu(:, i) / self % sigma
end do

! forward through trainable params gamma and beta
do concurrent(i = 1: self % sequence_length)
self % output(i, :) = normalized(i, :) * self % gamma + self % beta
end do

deallocate(normalized)
end subroutine forward

pure module subroutine backward(self, input, gradient)
class(layernorm_layer), intent(in out) :: self
real, intent(in) :: input(:, :)
real, intent(in) :: gradient(:, :)
real, allocatable :: one_over_sigma(:, :)
real, allocatable :: gradient_by_gamma_over_sigma(:, :)

allocate(one_over_sigma(self % sequence_length, self % model_dimension))
allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension))

one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
gradient_by_gamma_over_sigma = &
gradient &
* spread(self % gamma, dim=1, ncopies=self % sequence_length) &
* one_over_sigma

! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1)

! d_output/d_beta = sum(d_output/d_y) * 1
self % d_beta = sum(gradient, dim=1)

! From this article:
! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
! d_output/d_x = d_output/d_y * gamma/sigma
! - sum(d_output/d_y * gamma/sigma) / len
! - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
self % gradient = &
gradient_by_gamma_over_sigma &
- spread(&
sum(gradient_by_gamma_over_sigma, dim=2),&
dim=2,&
ncopies=self % model_dimension&
) / self % model_dimension &
- self % mu * spread(&
sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),&
dim=2,&
ncopies=self % model_dimension&
) / self % model_dimension

deallocate(one_over_sigma)
deallocate(gradient_by_gamma_over_sigma)
end subroutine backward

module subroutine init(self, input_shape)
class(layernorm_layer), intent(in out) :: self
integer, intent(in) :: input_shape(:)

if (size(input_shape) /= 2) then
error stop "LayerNorm Layer accepts 2D input"
end if
self % sequence_length = input_shape(1)
self % model_dimension = input_shape(2)

! default initialization from PyTorch
allocate(self % gamma(self % model_dimension))
self % gamma = 1.
allocate(self % beta(self % model_dimension))
self % beta = 0.

allocate(self % d_gamma(self % model_dimension))
allocate(self % d_beta(self % model_dimension))
allocate(self % gradient(self % sequence_length, self % model_dimension))

allocate(self % mu(self % sequence_length, self % model_dimension))
allocate(self % sigma(self % sequence_length))

allocate(self % output(self % sequence_length, self % model_dimension))
end subroutine init
interface
pure module subroutine forward(self, input)
class(layernorm_layer), intent(in out) :: self
real, intent(in) :: input(:, :)
end subroutine forward

pure module subroutine backward(self, input, gradient)
class(layernorm_layer), intent(in out) :: self
real, intent(in) :: input(:, :)
real, intent(in) :: gradient(:, :)
end subroutine backward

module subroutine init(self, input_shape)
class(layernorm_layer), intent(in out) :: self
integer, intent(in) :: input_shape(:)
end subroutine init
end interface
end module nf_layernorm_layer
110 changes: 110 additions & 0 deletions src/nf/nf_layernorm_submodule.f90
@@ -0,0 +1,110 @@
submodule(nf_layernorm_layer) nf_layernorm_layer_submodule
implicit none
contains
module function layernorm_layer_cons() &
result(res)
type(layernorm_layer) :: res

res % eps = 1e-5
end function layernorm_layer_cons

pure module subroutine forward(self, input)
class(layernorm_layer), intent(in out) :: self
real, intent(in) :: input(:, :)
real, allocatable :: normalized(:, :)
integer :: i

allocate(normalized(self % sequence_length, self % model_dimension))

! mu = x - MEAN_last_dim(x)
do concurrent(i = 1: self % model_dimension)
self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
end do

! square root of variance shifted by eps
self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)

! normalize mu by sigma along the first axis
do concurrent(i = 1: self % model_dimension)
normalized(:, i) = self % mu(:, i) / self % sigma
end do

! forward through trainable params gamma and beta
do concurrent(i = 1: self % sequence_length)
self % output(i, :) = normalized(i, :) * self % gamma + self % beta
end do

deallocate(normalized)
end subroutine forward

pure module subroutine backward(self, input, gradient)
class(layernorm_layer), intent(in out) :: self
real, intent(in) :: input(:, :)
real, intent(in) :: gradient(:, :)
real, allocatable :: one_over_sigma(:, :)
real, allocatable :: gradient_by_gamma_over_sigma(:, :)

allocate(one_over_sigma(self % sequence_length, self % model_dimension))
allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension))

one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
gradient_by_gamma_over_sigma = &
gradient &
* spread(self % gamma, dim=1, ncopies=self % sequence_length) &
* one_over_sigma

! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1)

! d_output/d_beta = sum(d_output/d_y) * 1
self % d_beta = sum(gradient, dim=1)

! From this article:
! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
! d_output/d_x = d_output/d_y * gamma/sigma
! - sum(d_output/d_y * gamma/sigma) / len
! - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
self % gradient = &
gradient_by_gamma_over_sigma &
- spread(&
sum(gradient_by_gamma_over_sigma, dim=2),&
dim=2,&
ncopies=self % model_dimension&
) / self % model_dimension &
- self % mu * spread(&
sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),&
dim=2,&
ncopies=self % model_dimension&
) / self % model_dimension

deallocate(one_over_sigma)
deallocate(gradient_by_gamma_over_sigma)
end subroutine backward
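
For reference, the three terms assembled into self % gradient above correspond to the following expression, where delta denotes the incoming gradient d_output/d_y, D the model dimension, and sigma the per-position standard deviation computed in forward:

$$
\frac{\partial L}{\partial x_i}
  = \frac{\gamma_i\,\delta_i}{\sigma}
  - \frac{1}{D}\sum_{j=1}^{D}\frac{\gamma_j\,\delta_j}{\sigma}
  - \frac{\mu_i}{D}\sum_{j=1}^{D}\frac{\gamma_j\,\delta_j\,\mu_j}{\sigma^{3}}
$$

The first term is gradient_by_gamma_over_sigma, the second is its mean over the last dimension, and the third is mu times the mean of gradient_by_gamma_over_sigma * mu / sigma**2.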

module subroutine init(self, input_shape)
class(layernorm_layer), intent(in out) :: self
integer, intent(in) :: input_shape(:)

if (size(input_shape) /= 2) then
error stop "LayerNorm Layer accepts 2D input"
end if
self % sequence_length = input_shape(1)
self % model_dimension = input_shape(2)

! default initialization from PyTorch
allocate(self % gamma(self % model_dimension))
self % gamma = 1.
allocate(self % beta(self % model_dimension))
self % beta = 0.

allocate(self % d_gamma(self % model_dimension))
allocate(self % d_beta(self % model_dimension))
allocate(self % gradient(self % sequence_length, self % model_dimension))

allocate(self % mu(self % sequence_length, self % model_dimension))
allocate(self % sigma(self % sequence_length))

allocate(self % output(self % sequence_length, self % model_dimension))
end subroutine init
end submodule nf_layernorm_layer_submodule
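
For context, a minimal usage sketch of the rearranged layer (not part of this commit). It assumes forward and init are exposed as type-bound procedures of layernorm_layer, as in the library's other layers; the program name, the 3x4 shape, and the random input are illustrative only:

program layernorm_example
  use nf_layernorm_layer, only: layernorm_layer
  implicit none
  type(layernorm_layer) :: norm
  real :: x(3, 4)

  ! hypothetical input: sequence_length = 3, model_dimension = 4
  call random_number(x)

  norm = layernorm_layer()      ! constructor via the generic interface above
  call norm % init(shape(x))    ! allocates gamma, beta, mu, sigma and output
  call norm % forward(x)        ! normalized result is stored in norm % output
  print *, norm % output
end program layernorm_example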
