Commit 477b19a

Tried making more RAM friendly, is now broken.
YafahEdelman committed Jun 3, 2015
1 parent 716837a commit 477b19a
Showing 3 changed files with 73 additions and 5 deletions.
24 changes: 20 additions & 4 deletions train.lua
@@ -20,6 +20,7 @@ require 'optim'
require 'lfs'

require 'util.OneHot'
require 'util.Embedding'
require 'util.misc'

local model_utils = require 'util.model_utils'
@@ -87,9 +88,19 @@ if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end

-- define the model: prototypes for one timestep, then clone them in time
protos = {}
-protos.embed = OneHot(vocab_size)
-local embeded_size = 100
+local input_size, embeded_size
+if opt.words then
+print('using an embedding transform for input...')
+embeded_size = 100
+protos.embed = Embedding(vocab_size, embeded_size)
+else
+print('using one-hot for input...')
+embeded_size = vocab_size
+protos.embed = OneHot(vocab_size)
+end
print('creating an LSTM with ' .. opt.num_layers .. ' layers')
-protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
+protos.rnn = LSTM.lstm(embeded_size, opt.rnn_size, opt.num_layers, opt.dropout)
-- the initial state of the cell/hidden states
init_state = {}
for L=1,opt.num_layers do
@@ -100,7 +111,7 @@ for L=1,opt.num_layers do
end
state_predict_index = #init_state -- index of blob to make prediction from
-- classifier on top
-protos.softmax = nn.Sequential():add(nn.Linear(opt.rnn_size, vocab_size)):add(nn.LogSoftMax())
+protos.softmax = nn.Sequential():add(nn.Linear(opt.rnn_size, embeded_size)):add(nn.LogSoftMax())
-- training criterion (negative log likelihood)
protos.criterion = nn.ClassNLLCriterion()
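
With this change the classifier ends in nn.Linear(opt.rnn_size, embeded_size) instead of mapping back to vocab_size, while protos.criterion still receives vocabulary indices as targets. The following shape-trace is a minimal sketch, not part of the commit, assuming Torch7 with the nn package and the util/Embedding.lua module added further down; the batch, hidden, and vocabulary sizes are made up.

-- Shape-trace sketch (not part of this commit): assumes Torch7 with the nn
-- package and the Embedding module from util/Embedding.lua below; all sizes
-- here are illustrative.
require 'nn'
require 'util.Embedding'

local vocab_size, embeded_size, rnn_size, batch_size = 5000, 100, 128, 4

local embed = Embedding(vocab_size, embeded_size)
local x = torch.Tensor(batch_size):random(1, vocab_size)  -- one timestep of word ids
print(embed:forward(x):size())                             -- batch_size x embeded_size

-- classifier as defined in this commit: rnn_size -> embeded_size
local softmax = nn.Sequential():add(nn.Linear(rnn_size, embeded_size)):add(nn.LogSoftMax())
local h = torch.randn(batch_size, rnn_size)                -- stand-in for the top LSTM state
local predictions = softmax:forward(h)                     -- batch_size x embeded_size columns

-- ClassNLLCriterion gets word ids up to vocab_size as targets; in word mode a
-- target id larger than embeded_size has no matching prediction column, which
-- would be consistent with the commit message calling this change broken.
local criterion = nn.ClassNLLCriterion()
-- criterion:forward(predictions, targets)  -- errors once a target id exceeds embeded_size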

@@ -182,7 +193,10 @@ function feval(x)
rnn_state[t] = clones.rnn[t]:forward{embeddings[t], unpack(rnn_state[t-1])}
-- the following line is needed because nngraph tries to be clever
if type(rnn_state[t]) ~= 'table' then rnn_state[t] = {rnn_state[t]} end

predictions[t] = clones.softmax[t]:forward(rnn_state[t][state_predict_index])

-- predictions should be 200 me thinks
loss = loss + clones.criterion[t]:forward(predictions[t], y[{{}, t}])
end
loss = loss / opt.seq_length
@@ -227,16 +241,18 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
local iterations = opt.max_epochs * loader.ntrain
local iterations_per_epoch = loader.ntrain
local loss0 = nil

for i = 1, iterations do

local epoch = i / loader.ntrain

local timer = torch.Timer()

local _, loss = optim.rmsprop(feval, params, optim_state)
local time = timer:time().real

local train_loss = loss[1] -- the loss is inside a list, pop it
train_losses[i] = train_loss

-- every now and then or on last iteration
if i % opt.eval_val_every == 0 or i == iterations then
-- evaluate loss on validation data
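
The loop above drives training through optim.rmsprop, whose contract is a closure that takes the flat parameter vector and returns the loss together with the gradient; the loss comes back wrapped in a table, which is why train_loss is read as loss[1]. Below is a minimal, self-contained sketch of that contract with a made-up quadratic objective, assuming only the Torch7 optim package; it is not part of the commit.

-- Sketch of the feval/optim.rmsprop contract used above (not part of this
-- commit); the objective here is a toy quadratic.
require 'torch'
require 'optim'

local params = torch.randn(5)
local optim_state = {learningRate = 2e-3, alpha = 0.95}

local function feval(x)
  local loss = 0.5 * x:dot(x)  -- toy objective: 0.5 * ||x||^2
  local grad = x:clone()       -- its gradient with respect to x
  return loss, grad
end

for i = 1, 100 do
  local _, fs = optim.rmsprop(feval, params, optim_state)
  if i % 10 == 0 then print(i, fs[1]) end  -- fs[1] is the loss, matching train_loss = loss[1] above
end
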
53 changes: 53 additions & 0 deletions util/Embedding.lua
@@ -0,0 +1,53 @@
--[[
Copyright 2014 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--

local Embedding, parent = torch.class('Embedding', 'nn.Module')

function Embedding:__init(inputSize, outputSize)
  parent.__init(self)
  self.outputSize = outputSize
  self.weight = torch.Tensor(inputSize, outputSize)
  self.gradWeight = torch.Tensor(inputSize, outputSize)
end

function Embedding:updateOutput(input)
  self.output:resize(input:size(1), self.outputSize)
  for i = 1, input:size(1) do
    self.output[i]:copy(self.weight[input[i]])
  end
  return self.output
end

function Embedding:updateGradInput(input, gradOutput)
  if self.gradInput then
    self.gradInput:resize(input:size())
    return self.gradInput
  end
end

function Embedding:accGradParameters(input, gradOutput, scale)
  scale = scale or 1
  if scale == 0 then
    self.gradWeight:zero()
  end
  for i = 1, input:size(1) do
    local word = input[i]
    self.gradWeight[word]:add(gradOutput[i])
  end
end

-- we do not need to accumulate parameters when sharing
Embedding.sharedAccUpdateGradParameters = Embedding.accUpdateGradParameters
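
For orientation, a minimal usage sketch of the Embedding module defined above, not part of the commit: it assumes Torch7 with nn loaded and this file reachable as util/Embedding.lua, and the sizes and token ids are arbitrary. A 1-D tensor of token ids maps to a (sequence length x outputSize) matrix of rows of weight, and accGradParameters adds each gradOutput row into the corresponding row of gradWeight.

-- Usage sketch (not part of this commit): assumes Torch7 + nn; sizes are arbitrary.
require 'nn'
require 'util.Embedding'

local vocab_size, dim = 10, 4
local embed = Embedding(vocab_size, dim)
embed.weight:uniform(-0.1, 0.1)         -- the module leaves its weights uninitialized

local ids = torch.Tensor{3, 7, 3}       -- a sequence of three token ids
local out = embed:forward(ids)          -- 3 x dim; rows copied from weight[3], weight[7], weight[3]

embed.gradWeight:zero()
embed:accGradParameters(ids, torch.ones(3, dim))  -- row 3 accumulates twice, row 7 once
print(out:size(), embed.gradWeight[3])
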
1 change: 0 additions & 1 deletion util/model_utils.lua
@@ -120,7 +120,6 @@ function model_utils.clone_many_times(net, T)
params = {}
end
end

local paramsNoGrad
if net.parametersNoGrad then
paramsNoGrad = net:parametersNoGrad()
