Multihead attention #199
Merged 73 commits on Feb 21, 2025. The file changes below are shown from 64 of the 73 commits.

Commits
- 49e8507 linear2d_layer forward implementation (OneAdder, Feb 2, 2025)
- feb7112 linear2d_layer: temporarily remove api (OneAdder, Feb 14, 2025)
- 8f320f0 Don't expose the concrete layer type via nf (milancurcic, Feb 16, 2025)
- af4a5d7 Plumbing of linear2d with input2d and linear2d (milancurcic, Feb 16, 2025)
- 549d4e6 linear2d_layer: add flatten2d layer (OneAdder, Feb 16, 2025)
- 3218be0 linear2d_layer: make linear2d layer work with input2d and flatten2d (OneAdder, Feb 16, 2025)
- 39636f4 update cmake (OneAdder, Feb 16, 2025)
- 4cc7d1d linear2d_layer: remove flatten2d layer (OneAdder, Feb 16, 2025)
- d863ce7 linear2d_layer: remove public api (OneAdder, Feb 16, 2025)
- 78eb17a linear2d_layer: update cmakelists (OneAdder, Feb 16, 2025)
- 567abc4 Add linear2d example (milancurcic, Feb 17, 2025)
- 32ac10d linear2d_layer: remove redundant constructor args (OneAdder, Feb 17, 2025)
- edd169d linear2d_layer: make example converge (OneAdder, Feb 17, 2025)
- aa5b83f linear2d_layer: add loss stopping and more iterations (OneAdder, Feb 17, 2025)
- dd3ce33 start impementing MultiHeadAttention (OneAdder, Jan 31, 2025)
- 0ed77e8 scaled dot product attention (OneAdder, Jan 31, 2025)
- d6e6f3e combine attention heads (OneAdder, Jan 31, 2025)
- eb58006 forward (not working) (OneAdder, Jan 31, 2025)
- 452032e rearrange attention dimensions in more efficient way (OneAdder, Feb 5, 2025)
- e06d39b initial forward implementation for multi-head attention (OneAdder, Feb 5, 2025)
- 519a6c8 tests for multihead_attention%forward (OneAdder, Feb 6, 2025)
- 9fdc7ae multihead_attention: move most logic to subroutines (performance) (OneAdder, Feb 6, 2025)
- bc67331 multihead_attention: update tests (OneAdder, Feb 6, 2025)
- a0a6fc4 multihead_attention: concurrency (OneAdder, Feb 6, 2025)
- f8101af multihead_attention: proof of concept backward (works, but not mathem… (OneAdder, Feb 8, 2025)
- 63cce11 multihead_attention: fix minor scaling issue (OneAdder, Feb 9, 2025)
- dfb8842 multihead_attention: complete backward implementation (OneAdder, Feb 9, 2025)
- adcf5e6 multihead_attention: add comments for forward prop (OneAdder, Feb 9, 2025)
- 650e47c multihead_attention: add tests for backward (OneAdder, Feb 9, 2025)
- 3d16161 multihead_attention: adjust expected test values for updated scaling (OneAdder, Feb 9, 2025)
- dcae5d6 multihead_attention: calculate scaling factor only once (OneAdder, Feb 9, 2025)
- 9fceae7 multihead_attention: use heap-allocated arrays during back prop (OneAdder, Feb 9, 2025)
- 248e124 multihead_attention: use heap-allocated arrays in forward (OneAdder, Feb 9, 2025)
- 4693028 multihead_attention: set values from correct shape to tests (OneAdder, Feb 9, 2025)
- 32dd628 multihead_attention: fix issues with shapes (softmax prime became eve… (OneAdder, Feb 9, 2025)
- 33c33b9 multihead_attention: minor refactoring and optimization (OneAdder, Feb 9, 2025)
- 40c3f2b multihead_attention: fix comments (OneAdder, Feb 9, 2025)
- 6a607b0 multihead_attention: tests, add checks for attention weights (OneAdder, Feb 9, 2025)
- 5fc5a5b multihead_attention: remove some of the copypaste comments (OneAdder, Feb 9, 2025)
- 65fd88d multihead_attention: optimize shapes (OneAdder, Feb 12, 2025)
- fbc132d multihead_attention: params api (OneAdder, Feb 14, 2025)
- 5422e4c multihead_attention: fix incorrect dw bug (OneAdder, Feb 14, 2025)
- 39637e7 multihead_attention: tests for updated parameters (OneAdder, Feb 14, 2025)
- 60a49db multihead_attention: remove reshape crutches (OneAdder, Feb 16, 2025)
- 7ab7769 multihead_attention: rename common forward and backward calls (OneAdder, Feb 16, 2025)
- 20c5eb0 multihead_attention: tidy mha up (OneAdder, Feb 16, 2025)
- 6098533 multihead_attention: self attention (OneAdder, Feb 16, 2025)
- 66b5023 multihead_attention: add cross attention (OneAdder, Feb 16, 2025)
- ac813aa multihead_attention: add more comments (OneAdder, Feb 16, 2025)
- 6b70f6b multihead_attention: arrange attention into submodule (OneAdder, Feb 16, 2025)
- b622d55 multihead_attention: update cmakelists (OneAdder, Feb 16, 2025)
- ce03b39 multihead_attention: update attention in accordance with linear2d (OneAdder, Feb 17, 2025)
- 41a80cd multihead_attention: remove redundand constructor args for attention … (OneAdder, Feb 17, 2025)
- a84efd3 multihead_attention: use pure and elemental where necessary (OneAdder, Feb 17, 2025)
- 52c94c4 multihead_attention: plumbing (OneAdder, Feb 17, 2025)
- 66b539b multihead_attention: add reference (OneAdder, Feb 17, 2025)
- 992da67 multihead_attention: remove rebase artifact (OneAdder, Feb 17, 2025)
- d93be41 multihead_attention: remove redundant args (OneAdder, Feb 19, 2025)
- 70272cb multihead_attention: update tests (OneAdder, Feb 19, 2025)
- cb717f5 multihead_attention: add the most important lines to tests (OneAdder, Feb 19, 2025)
- b7a6d06 multihead_attention: simple MHA example (OneAdder, Feb 19, 2025)
- cb26afb multihead_attention: update cmake (OneAdder, Feb 19, 2025)
- 4c92e9c multihead_attention: remove debug line from tests (OneAdder, Feb 19, 2025)
- df5f4cf multihead_attention: set slightly higher margin for fp imprecision (d… (OneAdder, Feb 19, 2025)
- 46786d6 Merge upstream/main (milancurcic, Feb 21, 2025)
- 6162783 Rename mha_simple example (milancurcic, Feb 21, 2025)
- 89abf22 Update src/nf/nf_multihead_attention.f90 (milancurcic, Feb 21, 2025)
- 29b7d2e Update src/nf/nf_multihead_attention.f90 (milancurcic, Feb 21, 2025)
- e901479 Update src/nf/nf_multihead_attention.f90 (milancurcic, Feb 21, 2025)
- 1eaee95 Update src/nf/nf_multihead_attention.f90 (milancurcic, Feb 21, 2025)
- 588ecb1 Tidy up (milancurcic, Feb 21, 2025)
- 20ffe05 Add self_attention to the layers table (milancurcic, Feb 21, 2025)
- e4c6548 Merge branch 'multihead_attention' of github.com:OneAdder/neural-fort… (milancurcic, Feb 21, 2025)
CMakeLists.txt (4 changes: 4 additions & 0 deletions)
@@ -20,6 +20,7 @@ add_library(neural-fortran
src/nf/nf_base_layer.f90
src/nf/nf_conv2d_layer.f90
src/nf/nf_conv2d_layer_submodule.f90
src/nf/nf_cross_attention_layer.f90
src/nf/nf_datasets.f90
src/nf/nf_datasets_submodule.f90
src/nf/nf_datasets_mnist.f90
@@ -45,6 +46,8 @@ add_library(neural-fortran
src/nf/nf_maxpool2d_layer.f90
src/nf/nf_maxpool2d_layer_submodule.f90
src/nf/nf_metrics.f90
src/nf/nf_multihead_attention.f90
src/nf/nf_multihead_attention_submodule.f90
src/nf/nf_network.f90
src/nf/nf_network_submodule.f90
src/nf/nf_optimizers.f90
@@ -53,6 +56,7 @@ add_library(neural-fortran
src/nf/nf_random.f90
src/nf/nf_reshape_layer.f90
src/nf/nf_reshape_layer_submodule.f90
src/nf/nf_self_attention_layer.f90
src/nf/io/nf_io_binary.f90
src/nf/io/nf_io_binary_submodule.f90
)
example/CMakeLists.txt (1 change: 1 addition & 0 deletions)
@@ -6,6 +6,7 @@ foreach(execid
simple
sine
quadratic
mha_simple
)
add_executable(${execid} ${execid}.f90)
target_link_libraries(${execid} PRIVATE
example/mha_simple.f90 (37 changes: 37 additions & 0 deletions)
@@ -0,0 +1,37 @@
program simple
  use nf, only: dense, input, network, sgd, self_attention, flatten
  implicit none
  type(network) :: net
  real, allocatable :: x(:, :), y(:)
  integer, parameter :: num_iterations = 500
  integer :: n

  print '("Simple")'
  print '(60("="))'

  net = network([ &
    input(3, 8), &
    self_attention(4), &
    flatten(), &
    dense(2) &
  ])

  call net % print_info()

  allocate(x(3, 8))
  call random_number(x)

  y = [0.123456, 0.246802]

  do n = 0, num_iterations

    call net % forward(x)
    call net % backward(y)
    call net % update(optimizer=sgd(learning_rate=1.))

    if (mod(n, 50) == 0) &
      print '(i4,2(3x,f8.6))', n, net % predict(x)

  end do

end program simple
src/nf.f90 (4 changes: 3 additions & 1 deletion)
@@ -3,7 +3,7 @@ module nf
use nf_datasets_mnist, only: label_digits, load_mnist
use nf_layer, only: layer
use nf_layer_constructors, only: &
conv2d, dense, flatten, input, maxpool2d, reshape, linear2d
conv2d, dense, flatten, input, maxpool2d, reshape, linear2d, self_attention
use nf_loss, only: mse, quadratic
use nf_metrics, only: corr, maxabs
use nf_network, only: network
@@ -12,4 +12,6 @@ module nf
gaussian, linear, relu, leaky_relu, &
sigmoid, softmax, softplus, step, tanhf, &
celu
use nf_linear2d_layer, only: linear2d_layer
use nf_multihead_attention_layer, only: multihead_attention_layer
end module nf
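
The two use statements added above re-export the concrete linear2d_layer and multihead_attention_layer types through nf, so they can be imported and driven directly, outside of a network. A minimal, hedged sketch of what that enables (not part of the PR; it assumes the base type's constructor takes only n_heads, mirroring the self and cross attention constructors below, and that init_base and common_forward behave as the cross attention code in this diff uses them):

program mha_direct
  ! Hedged sketch (not part of this PR): drive the multihead attention base type directly.
  ! Assumptions: the constructor takes only n_heads, and init_base expects
  ! [sequence_length, model_dimension], as the cross attention code suggests.
  use nf, only: multihead_attention_layer
  implicit none
  type(multihead_attention_layer) :: mha
  real :: query(5, 8), key(5, 8), value(5, 8)

  mha = multihead_attention_layer(n_heads=4)
  call mha % init_base([5, 8])

  call random_number(query)
  call random_number(key)
  call random_number(value)

  call mha % common_forward(query, key, value)
end program mha_direct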
src/nf/nf_cross_attention_layer.f90 (66 changes: 66 additions & 0 deletions)
@@ -0,0 +1,66 @@
module nf_cross_attention_layer
  use iso_fortran_env, only: stderr => error_unit
  use nf_activation, only: softmax
  use nf_linear2d_layer, only: linear2d_layer
  use nf_multihead_attention_layer, only: multihead_attention_layer

  implicit none

  type, extends(multihead_attention_layer) :: cross_attention_layer
Review comment (OneAdder, collaborator, author): It is intentional that there is no plumbing for this one yet. I suggest that we add it at a later stage, when we have more components for seq2seq models. At this stage it can be added like this, without any public access.

    !! Cross Attention Layer
    !! Source:
    !! Bahdanau, D. (2014)
    !! Neural machine translation by jointly learning to align and translate.
    !! https://arxiv.org/pdf/1409.0473
    real, allocatable :: gradient(:, :, :)
  contains
    procedure :: forward
    procedure :: backward
    procedure :: init
  end type cross_attention_layer

  interface cross_attention_layer
    module function cross_attention_layer_cons(n_heads) result(res)
      !! This function returns the `cross_attention_layer` instance.
      integer, intent(in) :: n_heads
      type(cross_attention_layer) :: res
    end function cross_attention_layer_cons
  end interface cross_attention_layer

contains
  module function cross_attention_layer_cons(n_heads) result(res)
    !! This function returns the `cross_attention_layer` instance.
    integer, intent(in) :: n_heads
    type(cross_attention_layer) :: res
    res % n_heads = n_heads
  end function cross_attention_layer_cons

  pure module subroutine backward(self, input, gradient)
    !! Cross Attention Back propagation
    class(cross_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :, :)
    real, intent(in) :: gradient(:, :)

    call self % common_backward(input(1, :, :), gradient)
    self % gradient(1, :, :) = self % query_layer % gradient
    self % gradient(2, :, :) = self % key_layer % gradient + self % value_layer % gradient
  end subroutine backward

  pure module subroutine forward(self, input)
    !! Cross Attention Forward propagation
    !! Input Shape (kind, sequence_length, model_dimension)
    !! where kind is 1 for Query and 2 for Key-Value
    class(cross_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :, :)

    call self % common_forward(input(1, :, :), input(2, :, :), input(2, :, :))
  end subroutine forward

  module subroutine init(self, input_shape)
    class(cross_attention_layer), intent(in out) :: self
    integer, intent(in) :: input_shape(:)

    call self % init_base(input_shape)
    allocate(self % gradient(2, self % sequence_length, self % model_dimension))
  end subroutine init
end module nf_cross_attention_layer
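
Since cross_attention_layer has no network plumbing yet (see the review comment above), the only way to exercise it is to drive the type directly. A minimal, hedged sketch of that, not taken from the PR; it assumes init_base interprets the shape as [sequence_length, model_dimension] and sets up the internal query/key/value sub-layers:

program cross_attention_demo
  ! Hedged sketch (not part of this PR) of using cross_attention_layer directly.
  ! The packed input follows the convention documented in forward above:
  ! input(1,:,:) is the query sequence, input(2,:,:) the key-value sequence.
  use nf_cross_attention_layer, only: cross_attention_layer
  implicit none
  type(cross_attention_layer) :: attn
  real :: input(2, 3, 4)    ! (kind, sequence_length=3, model_dimension=4)
  real :: d_output(3, 4)    ! gradient w.r.t. the attention output

  attn = cross_attention_layer(n_heads=2)
  call attn % init([3, 4])  ! assumed [sequence_length, model_dimension]

  call random_number(input)
  call random_number(d_output)

  call attn % forward(input)
  call attn % backward(input, d_output)
  ! attn % gradient(1,:,:) now holds the query gradient,
  ! attn % gradient(2,:,:) the combined key-value gradient.
end program cross_attention_demo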
src/nf/nf_layer_constructors.f90 (12 changes: 11 additions & 1 deletion)
@@ -8,7 +8,7 @@ module nf_layer_constructors
implicit none

private
public :: conv2d, dense, flatten, input, maxpool2d, reshape, linear2d
public :: conv2d, dense, flatten, input, maxpool2d, reshape, linear2d, self_attention

interface input

@@ -195,6 +195,16 @@ module function linear2d(out_features) result(res)
!! Resulting layer instance
end function linear2d

module function self_attention(n_heads) result(res)
!! Rank-2 (sequence_length, model_dimension) self attention constructor.
!! sequence_length and model_dimension are determined at layer initialization, based on the
!! output shape of the previous layer.
integer, intent(in) :: n_heads
!! Number of attention heads
type(layer) :: res
!! Resulting layer instance
end function self_attention

end interface

end module nf_layer_constructors
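
The constructor above infers sequence_length and model_dimension from whatever 2-d layer precedes it, so self_attention composes naturally with input and linear2d. A hedged variation on example/mha_simple.f90, not taken from the PR, where linear2d first projects the input features to a model dimension that the heads then split (presumably the head count must divide the model dimension, as in the usual multi-head formulation):

program self_attention_stack
  ! Hedged sketch (not part of this PR): self_attention after a linear2d projection.
  ! Layer shapes are inferred at network construction time.
  use nf, only: dense, flatten, input, linear2d, network, self_attention
  implicit none
  type(network) :: net

  net = network([ &
    input(16, 12), &      ! sequence_length = 16, 12 features per position
    linear2d(8), &        ! project features to model_dimension = 8
    self_attention(4), &  ! 4 heads over the 8-dimensional model space; output stays (16, 8)
    flatten(), &          ! 16 * 8 = 128 features
    dense(4) &
  ])

  call net % print_info()
end program self_attention_stack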
src/nf/nf_layer_constructors_submodule.f90 (9 changes: 9 additions & 0 deletions)
@@ -10,6 +10,7 @@
use nf_maxpool2d_layer, only: maxpool2d_layer
use nf_reshape_layer, only: reshape3d_layer
use nf_linear2d_layer, only: linear2d_layer
use nf_self_attention_layer, only: self_attention_layer
use nf_activation, only: activation_function, relu, sigmoid

implicit none
@@ -160,4 +161,12 @@ module function linear2d(out_features) result(res)

end function linear2d

module function self_attention(n_heads) result(res)
integer, intent(in) :: n_heads
type(layer) :: res

res % name = 'self_attention'
allocate(res % p, source=self_attention_layer(n_heads))
end function self_attention

end submodule nf_layer_constructors_submodule
src/nf/nf_layer_submodule.f90 (45 changes: 44 additions & 1 deletion)
@@ -10,6 +10,7 @@
use nf_maxpool2d_layer, only: maxpool2d_layer
use nf_reshape_layer, only: reshape3d_layer
use nf_linear2d_layer, only: linear2d_layer
use nf_self_attention_layer, only: self_attention_layer
use nf_optimizers, only: optimizer_base_type

contains
@@ -50,6 +51,8 @@ pure module subroutine backward_1d(self, previous, gradient)
call this_layer % backward(prev_layer % output, gradient)
type is(linear2d_layer)
call this_layer % backward(prev_layer % output, gradient)
type is(self_attention_layer)
call this_layer % backward(prev_layer % output, gradient)
end select

end select
@@ -72,6 +75,19 @@ pure module subroutine backward_2d(self, previous, gradient)
call this_layer % backward(prev_layer % output, gradient)
type is(linear2d_layer)
call this_layer % backward(prev_layer % output, gradient)
type is(self_attention_layer)
call this_layer % backward(prev_layer % output, gradient)
end select

type is(self_attention_layer)

select type(prev_layer => previous % p)
type is(input2d_layer)
call this_layer % backward(prev_layer % output, gradient)
type is(linear2d_layer)
call this_layer % backward(prev_layer % output, gradient)
type is(self_attention_layer)
call this_layer % backward(prev_layer % output, gradient)
end select

end select
@@ -219,6 +235,20 @@ pure module subroutine forward(self, input)
call this_layer % forward(prev_layer % output)
type is(linear2d_layer)
call this_layer % forward(prev_layer % output)
type is(self_attention_layer)
call this_layer % forward(prev_layer % output)
end select

type is(self_attention_layer)

! Upstream layers permitted: input2d, linear2d, self_attention
select type(prev_layer => input % p)
type is(input2d_layer)
call this_layer % forward(prev_layer % output)
type is(linear2d_layer)
call this_layer % forward(prev_layer % output)
type is(self_attention_layer)
call this_layer % forward(prev_layer % output)
end select

end select
@@ -258,6 +288,8 @@ pure module subroutine get_output_2d(self, output)
allocate(output, source=this_layer % output)
type is(linear2d_layer)
allocate(output, source=this_layer % output)
type is(self_attention_layer)
allocate(output, source=this_layer % output)
class default
error stop '2-d output can only be read from an input2d, linear2d, or self_attention layer.'

@@ -301,7 +333,7 @@ impure elemental module subroutine init(self, input)
call this_layer % init(input % layer_shape)
end select

! The shape of linear2d, conv2d, maxpool2d, or flatten layers is not known
! The shape of self_attention, linear2d, conv2d, maxpool2d, or flatten layers is not known
! until we receive an input layer.
select type(this_layer => self % p)
type is(conv2d_layer)
Expand All @@ -312,6 +344,8 @@ impure elemental module subroutine init(self, input)
self % layer_shape = shape(this_layer % output)
type is(linear2d_layer)
self % layer_shape = shape(this_layer % output)
type is(self_attention_layer)
self % layer_shape = shape(this_layer % output)
end select

self % input_layer_shape = input % layer_shape
@@ -359,6 +393,8 @@ elemental module function get_num_params(self) result(num_params)
num_params = 0
type is (linear2d_layer)
num_params = this_layer % get_num_params()
type is (self_attention_layer)
num_params = this_layer % get_num_params()
class default
error stop 'Unknown layer type.'
end select
@@ -388,6 +424,8 @@ module function get_params(self) result(params)
! No parameters to get.
type is (linear2d_layer)
params = this_layer % get_params()
type is (self_attention_layer)
params = this_layer % get_params()
class default
error stop 'Unknown layer type.'
end select
@@ -417,6 +455,8 @@ module function get_gradients(self) result(gradients)
! No gradients to get.
type is (linear2d_layer)
gradients = this_layer % get_gradients()
type is (self_attention_layer)
gradients = this_layer % get_gradients()
class default
error stop 'Unknown layer type.'
end select
@@ -467,6 +507,9 @@ module subroutine set_params(self, params)
type is (linear2d_layer)
call this_layer % set_params(params)

type is (self_attention_layer)
call this_layer % set_params(params)

type is (maxpool2d_layer)
! No parameters to set.
write(stderr, '(a)') 'Warning: calling set_params() ' &