diff --git a/psom_heartbeat.m b/psom_heartbeat.m
new file mode 100644
index 0000000..82ccfcb
--- /dev/null
+++ b/psom_heartbeat.m
@@ -0,0 +1,81 @@
+function [] = psom_heartbeat(file_heart,file_kill,pid)
+% This function is internal to PSOM and not meant to be used directly.
+%
+% PSOM_HEARTBEAT(FILE_HEART,FILE_KILL,PID)
+%
+% FILE_HEART (string) the name of a .mat file. The following variables will be updated every 5s
+%   inside the .mat file:
+%   CURR_TIME (vector) the output of clock
+%   TELAPSED (scalar) the time (s) elapsed since the heartbeat was started.
+% FILE_KILL (string) the name of a file. If this file is detected at any point in time
+%   the function will kill the process PID and exit.
+% PID (scalar) a process ID.
+%
+% The process PID is probed with signal 0 before each heartbeat. The function
+% returns, without updating FILE_HEART, as soon as this probe fails.
+%
+% See licensing information in the code.
+%
+% Commented-out example, starting a heartbeat on the current octave session:
+%system(['octave --eval "cd /home/pbellec/, build_path_std, cd /home/pbellec/tmp/tmp, psom_heartbeat(''toto.mat'',''tata.mat'',' num2str(getpid) '); exit"'],false,'async')
+
+% Copyright (c) Pierre Bellec, Montreal Neurological Institute, 2008-2010.
+% Departement d'informatique et de recherche operationnelle
+% Centre de recherche de l'institut de Geriatrie de Montreal
+% Universite de Montreal, 2011
+% Maintainer : pierre.bellec@criugm.qc.ca
+% See licensing information in the code.
+% Keywords : pipeline
+
+% Permission is hereby granted, free of charge, to any person obtaining a copy
+% of this software and associated documentation files (the "Software"), to deal
+% in the Software without restriction, including without limitation the rights
+% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+% copies of the Software, and to permit persons to whom the Software is
+% furnished to do so, subject to the following conditions:
+%
+% The above copyright notice and this permission notice shall be included in
+% all copies or substantial portions of the Software.
+%
+% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+% THE SOFTWARE.
+
+beat_period = 5; % time (s) between two heartbeats
+flag_octave = exist('OCTAVE_VERSION','builtin') ~= 0; % the interpreter does not change while running: test it once
+
+tic;
+flag_beat = true;
+while flag_beat
+
+    %% Probe the process. Signal 0 delivers nothing, it only reports if PID can be signalled.
+    if flag_octave
+        [err,msg] = kill(pid,0); % use the kill octave command
+    else
+        [err,msg] = system(sprintf('kill -0 %i',pid)); % kill is not available, try a system call (a plain "kill" would send SIGTERM)
+    end
+    flag_beat = err==0;
+    if ~flag_beat
+        break % the process is gone: do not write a heartbeat on its behalf
+    end
+
+    %% Update the heartbeat
+    telapsed = toc;
+    curr_time = clock;
+    save(file_heart,'telapsed','curr_time');
+
+    %% Kill the process if the kill file is present, then quit the heartbeat session
+    if exist(file_kill,'file')
+        if flag_octave
+            kill(pid,9);
+        else
+            system(sprintf('kill -9 %i',pid));
+        end
+        exit
+    end
+    pause(beat_period)
+end
\ No newline at end of file
diff --git a/psom_run_job.m b/psom_run_job.m index 9aece69..f1f8b18 100644 --- a/psom_run_job.m +++ b/psom_run_job.m @@ -1,8 +1,8 @@ -function flag_failed = psom_run_job(file_job) +function flag_failed = psom_run_job(file_job,flag_heartbeat) % Run a PSOM job. % % SYNTAX: -% FLAG_FAILED = PSOM_RUN_JOB(FILE_JOB) +% FLAG_FAILED = PSOM_RUN_JOB(FILE_JOB , FLAG_HEARTBEAT) %_________________________________________________________________________ % INPUTS: % @@ -12,6 +12,14 @@ % also be specified through a mat file, where the job attributes are % saved as variables. % +% FLAG_HEARTBEAT +% (boolean, default false) if the flag is true, then a new subprocess +% will be started, using matlab or octave, that will generate +% a .heartbeat.mat file updated every 5 seconds. This subprocess +% will also detect the presence of a .kill file and, if detected, +% will kill the main process. This mechanism is only available when +% PSOM_RUN_JOB is called by PSOM_RUN_PIPELINE. 
+% %_________________________________________________________________________ % OUTPUTS: % @@ -64,14 +72,26 @@ psom_gb_vars seed = psom_set_rand_seed(); -try - %% Generate file names - [path_f,name_job,ext_f] = fileparts(file_job); +%% Default options +if nargin < 2 + flag_heartbeat = false; +end - if ~strcmp(ext_f,'.mat') - error('The job file %s should be a .mat file !',file_job); +%% name of the job +if ischar(file_job) + [path_f,name_job,ext_f] = fileparts(file_job); + if isempty(path_f) + path_f = '.'; end + flag_char = true; +else + name_job = 'manual'; + flag_char = false; +end +gb_psom_name_job = name_job; +%% Generate file names +if flag_char && strcmp(ext_f,'.mat') file_jobs = [path_f filesep 'PIPE_jobs.mat']; file_running = [path_f filesep name_job '.running']; file_failed = [path_f filesep name_job '.failed']; @@ -79,14 +99,11 @@ file_profile = [path_f filesep name_job '.profile.mat']; file_heartbeat = [path_f filesep name_job '.heartbeat.mat']; file_kill = [path_f filesep name_job '.kill']; - -catch - name_job = 'manual'; -end -gb_psom_name_job = name_job; +end try - job = sub_load_job(file_jobs,name_job); % This is launched through the pipeline manager + pipe = load(file_jobs,name_job); % This is launched through the pipeline manager + job = pipe.(name_job); flag_psom = true; catch if ischar(file_job) @@ -108,18 +125,15 @@ end %% Start a heartbeat -pid = getpid; -[err,msg] = system(sprintf('kill -0 %i')); % check that the running status of the process can be checked using kill -flag_heartbeat = (err==0)||strcmp('gb_psom_language','octave') -if flag_heartbeat - cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_hearbeat,file_kill,pid); - if strcmp('gb_psom_language','octave') +if flag_psom && flag_heartbeat + main_pid = getpid; + cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_heartbeat,file_kill,main_pid); + if strcmp(gb_psom_language,'octave') instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), 
%s,exit"',gb_psom_command_octave,gb_psom_opt_matlab,gb_psom_path_psom,cmd); else instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), %s,exit"',gb_psom_command_matlab,gb_psom_opt_matlab,gb_psom_path_psom,cmd); end system([instr_heartbeat '&']); - warning('PSOM was not able to implement a heartbeat on that system, and will not be able to detect a job crash, or to terminate the job if the pipeline is stopped'); end %% Upload job info @@ -231,7 +245,3 @@ eval(command) -function job = sub_load_job(file_jobs,name_job) - -load(file_jobs,name_job); -eval(['job = ' name_job ';']);