Skip to content
This repository has been archived by the owner on Aug 15, 2024. It is now read-only.

Commit

Permalink
A new heartbeat function meant to create a file signaling that a main…
Browse files Browse the repository at this point in the history
… process is still alive, and also send a kill signal to the main process if a .kill file is detected. That mechanism has been added (with a flag_heartbeat, false by default) to PSOM_RUN_JOB. It still needs to be added to PSOM_RUN_PIPELINE.
  • Loading branch information
pbellec committed Mar 28, 2015
1 parent 2e94551 commit a2a13e1
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 24 deletions.
65 changes: 65 additions & 0 deletions psom_heartbeat.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
function [] = psom_heartbeat(file_heart,file_kill,pid)
% This function is internal to PSOM and not meant to be used directly.
%
% PSOM_HEARTBEAT(FILE_HEART,FILE_KILL,PID)
%
% FILE_HEART (string) the name of a .mat file. The following variables will be updated every 5s
%   inside the .mat file:
%   CURR_TIME (vector) the output of clock
%   TELAPSED (scalar) the time (s) elapsed since the heartbeat was started.
% FILE_KILL (string) the name of a file. If this file is detected at any point in time
%   the function will kill the process PID (signal 9) and exit the current
%   Octave/Matlab session.
% PID (scalar) a process ID.
%
% The function loops until the process PID cannot be probed anymore (the probe
% with signal 0 reports an error), and then returns. The probe itself never
% delivers a signal to PID.
%
% See licensing information in the code.
%
%system(['octave --eval "cd /home/pbellec/, build_path_std, cd /home/pbellec/tmp/tmp, psom_heartbeat(''toto.mat'',''tata.mat'',' num2str(getpid) '); exit"'],false,'async')

% Copyright (c) Pierre Bellec, Montreal Neurological Institute, 2008-2010.
% Departement d'informatique et de recherche operationnelle
% Centre de recherche de l'institut de Geriatrie de Montreal
% Universite de Montreal, 2011
% Maintainer : [email protected]
% See licensing information in the code.
% Keywords : pipeline

% Permission is hereby granted, free of charge, to any person obtaining a copy
% of this software and associated documentation files (the "Software"), to deal
% in the Software without restriction, including without limitation the rights
% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
% copies of the Software, and to permit persons to whom the Software is
% furnished to do so, subject to the following conditions:
%
% The above copyright notice and this permission notice shall be included in
% all copies or substantial portions of the Software.
%
% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
% THE SOFTWARE.

% The interpreter does not change while the heartbeat runs: test it only once.
flag_octave = exist('OCTAVE_VERSION','builtin');

tic;
flag_beat = true;
while flag_beat

    %% Probe the main process. Signal 0 only checks that the process exists,
    %% nothing is delivered to it.
    if flag_octave
        [err,msg] = kill(pid,0); % use the kill octave command
    else
        % kill is not available, try a system call. The signal has to be
        % explicitly set to 0: a bare "kill <pid>" sends SIGTERM, which
        % would terminate the process the heartbeat is supposed to monitor.
        [err,msg] = system(sprintf('kill -0 %i',pid));
    end
    flag_beat = err==0;

    %% Update the heartbeat file
    telapsed = toc;
    curr_time = clock;
    save(file_heart,'telapsed','curr_time');

    %% A kill file was found: terminate the main process, then this session
    if exist(file_kill,'file')
        if flag_octave
            % Capture the outputs so that nothing is echoed in the console
            [err_kill,msg_kill] = kill(pid,9);
        else
            system(sprintf('kill -9 %i',pid));
        end
        exit
    end

    %% Wait for the next beat
    pause(5)
end

58 changes: 34 additions & 24 deletions psom_run_job.m
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
function flag_failed = psom_run_job(file_job)
function flag_failed = psom_run_job(file_job,flag_heartbeat)
% Run a PSOM job.
%
% SYNTAX:
% FLAG_FAILED = PSOM_RUN_JOB(FILE_JOB)
% FLAG_FAILED = PSOM_RUN_JOB(FILE_JOB , FLAG_HEARTBEAT)
%_________________________________________________________________________
% INPUTS:
%
Expand All @@ -12,6 +12,14 @@
% also be specified through a mat file, where the job attributes are
% saved as variables.
%
% FLAG_HEARTBEAT
% (boolean, default false) if the flag is true, then a new subprocess
% will be started, using matlab or octave, that will generate
% a <NAME_JOB>.heartbeat.mat file updated every 5 seconds. This subprocess
% will also detect the presence of a <NAME_JOB>.kill file and, if detected,
% will kill the main process. This mechanism is only available when
% PSOM_RUN_JOB is called by PSOM_RUN_PIPELINE.
%
%_________________________________________________________________________
% OUTPUTS:
%
Expand Down Expand Up @@ -64,29 +72,38 @@
psom_gb_vars
seed = psom_set_rand_seed();

try
%% Generate file names
[path_f,name_job,ext_f] = fileparts(file_job);
%% Default options
if nargin < 2
flag_heartbeat = false;
end

if ~strcmp(ext_f,'.mat')
error('The job file %s should be a .mat file !',file_job);
%% name of the job
if ischar(file_job)
[path_f,name_job,ext_f] = fileparts(file_job);
if isempty(path_f)
path_f = '.';
end
flag_char = true;
else
name_job = 'manual';
flag_char = false;
end
gb_psom_name_job = name_job;

%% Generate file names
if flag_char && strcmp(ext_f,'.mat')
file_jobs = [path_f filesep 'PIPE_jobs.mat'];
file_running = [path_f filesep name_job '.running'];
file_failed = [path_f filesep name_job '.failed'];
file_finished = [path_f filesep name_job '.finished'];
file_profile = [path_f filesep name_job '.profile.mat'];
file_heartbeat = [path_f filesep name_job '.heartbeat.mat'];
file_kill = [path_f filesep name_job '.kill'];

catch
name_job = 'manual';
end
gb_psom_name_job = name_job;
end

try
job = sub_load_job(file_jobs,name_job); % This is launched through the pipeline manager
pipe = load(file_jobs,name_job); % This is launched through the pipeline manager
job = pipe.(name_job);
flag_psom = true;
catch
if ischar(file_job)
Expand All @@ -108,18 +125,15 @@
end

%% Start a heartbeat
pid = getpid;
[err,msg] = system(sprintf('kill -0 %i')); % check that the running status of the process can be checked using kill
flag_heartbeat = (err==0)||strcmp('gb_psom_language','octave')
if flag_heartbeat
cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_hearbeat,file_kill,pid);
if strcmp('gb_psom_language','octave')
if flag_psom && flag_heartbeat
main_pid = getpid;
cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_heartbeat,file_kill,main_pid);
if strcmp(gb_psom_language,'octave')
instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), %s,exit"',gb_psom_command_octave,gb_psom_opt_matlab,gb_psom_path_psom,cmd);
else
instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), %s,exit"',gb_psom_command_matlab,gb_psom_opt_matlab,gb_psom_path_psom,cmd);
end
system([instr_heartbeat '&']);
warning('PSOM was not able to implement a heartbeat on that system, and will not be able to detect a job crash, or to terminate the job if the pipeline is stopped');
end

%% Upload job info
Expand Down Expand Up @@ -231,7 +245,3 @@

eval(command)

function job = sub_load_job(file_jobs,name_job)

load(file_jobs,name_job);
eval(['job = ' name_job ';']);

0 comments on commit a2a13e1

Please sign in to comment.