From 32174c6fa0ca6ad15033a2c982038a1ef3b0cfe8 Mon Sep 17 00:00:00 2001 From: Pierre Bellec Date: Thu, 26 Mar 2015 00:02:48 -0400 Subject: [PATCH 1/3] Issue #2: an untested heartbeat function, a stub of code to start it as part of psom_run_job, and inclusion of matlab/octave invokation options in the gb_vars file. --- psom_gb_vars.m | 12 ++++++++++++ psom_run_job.m | 28 +++++++++++++++++++++++----- psom_run_script.m | 8 ++++---- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/psom_gb_vars.m b/psom_gb_vars.m index 0395c4d..cbb4b8c 100644 --- a/psom_gb_vars.m +++ b/psom_gb_vars.m @@ -79,6 +79,18 @@ gb_psom_language = 'matlab'; end +% Options to start matlab +switch gb_psom_language + case 'matlab' + if ispc + gb_psom_opt_matlab = '-automation -nodesktop -singleCompThread -r'; + else + gb_psom_opt_matlab = '-nosplash -nodesktop -singleCompThread -r'; + end + case 'octave' + gb_psom_opt_matlab = '--silent --eval'; +end + % Get langage version if strcmp(gb_psom_language,'octave'); gb_psom_language_version = OCTAVE_VERSION; diff --git a/psom_run_job.m b/psom_run_job.m index af0ee98..9aece69 100644 --- a/psom_run_job.m +++ b/psom_run_job.m @@ -72,11 +72,14 @@ error('The job file %s should be a .mat file !',file_job); end - file_jobs = [path_f filesep 'PIPE_jobs.mat']; - file_running = [path_f filesep name_job '.running']; - file_failed = [path_f filesep name_job '.failed']; - file_finished = [path_f filesep name_job '.finished']; - file_profile = [path_f filesep name_job '.profile.mat']; + file_jobs = [path_f filesep 'PIPE_jobs.mat']; + file_running = [path_f filesep name_job '.running']; + file_failed = [path_f filesep name_job '.failed']; + file_finished = [path_f filesep name_job '.finished']; + file_profile = [path_f filesep name_job '.profile.mat']; + file_heartbeat = [path_f filesep name_job '.heartbeat.mat']; + file_kill = [path_f filesep name_job '.kill']; + catch name_job = 'manual'; end @@ -104,6 +107,21 @@ save(file_running,'tmp') end +%% Start a 
heartbeat +pid = getpid; +[err,msg] = system(sprintf('kill -0 %i')); % check that the running status of the process can be checked using kill +flag_heartbeat = (err==0)||strcmp('gb_psom_language','octave') +if flag_heartbeat + cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_hearbeat,file_kill,pid); + if strcmp('gb_psom_language','octave') + instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), %s,exit"',gb_psom_command_octave,gb_psom_opt_matlab,gb_psom_path_psom,cmd); + else + instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), %s,exit"',gb_psom_command_matlab,gb_psom_opt_matlab,gb_psom_path_psom,cmd); + end + system([instr_heartbeat '&']); + warning('PSOM was not able to implement a heartbeat on that system, and will not be able to detect a job crash, or to terminate the job if the pipeline is stopped'); +end + %% Upload job info gb_name_structure = 'job'; gb_list_fields = { 'files_in' , 'files_out' , 'files_clean' , 'command','opt' , 'dep' , 'ispipeline' }; diff --git a/psom_run_script.m b/psom_run_script.m index d9306f8..8590cf5 100644 --- a/psom_run_script.m +++ b/psom_run_script.m @@ -226,12 +226,12 @@ switch gb_psom_language case 'matlab' if ispc - opt_matlab = '-automation -nodesktop -singleCompThread -r'; + gb_psom_opt_matlab = '-automation -nodesktop -singleCompThread -r'; else - opt_matlab = '-nosplash -nodesktop -singleCompThread -r'; + gb_psom_opt_matlab = '-nosplash -nodesktop -singleCompThread -r'; end case 'octave' - opt_matlab = '--silent --eval'; + gb_psom_opt_matlab = '--silent --eval'; end %% Set-up the search path for the job @@ -251,7 +251,7 @@ %% Add an appropriate call to Matlab/Octave if ~isempty(cmd) - instr_job = sprintf('"%s" %s "%s %s,exit"',opt.command_matlab,opt_matlab,opt.init_matlab,cmd); + instr_job = sprintf('"%s" %s "%s %s,exit"',opt.command_matlab,gb_psom_opt_matlab,opt.init_matlab,cmd); if ~isempty(logs) if opt.flag_debug instr_job = sprintf('%s >"%s" 2>&1\n',instr_job,logs.txt); From 
2e94551c3ef2d7ff629efb7fcae28e3615a7e449 Mon Sep 17 00:00:00 2001 From: Pierre Bellec Date: Thu, 26 Mar 2015 00:05:13 -0400 Subject: [PATCH 2/3] Got rid of code that is not useful anymore. --- psom_run_script.m | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/psom_run_script.m b/psom_run_script.m index 8590cf5..3adc0ff 100644 --- a/psom_run_script.m +++ b/psom_run_script.m @@ -221,19 +221,6 @@ %%%%%%%%%%%%%%%%%%%%%%%%% %% Generate the script %% %%%%%%%%%%%%%%%%%%%%%%%%% - -%% Generate some OS-appropriate options to start Matlab/Octave -switch gb_psom_language - case 'matlab' - if ispc - gb_psom_opt_matlab = '-automation -nodesktop -singleCompThread -r'; - else - gb_psom_opt_matlab = '-nosplash -nodesktop -singleCompThread -r'; - end - case 'octave' - gb_psom_opt_matlab = '--silent --eval'; -end - %% Set-up the search path for the job if ~strcmp(opt.mode,'session')&&~isempty(cmd) if (length(opt.path_search)>4)&&(strcmp(opt.path_search(end-3:end),'.mat')) From a2a13e1b17da610e5af0122ccf30f546160f81ad Mon Sep 17 00:00:00 2001 From: Pierre Bellec Date: Sat, 28 Mar 2015 03:44:58 -0400 Subject: [PATCH 3/3] A new heartbeat function meant to create a file signaling that a main process is still alive, and also send a kill signal to the main process if a .kill file is detected. That mechanism has been added (with a flag_heartbeat, false by default) to PSOM_RUN_JOB. It still needs to be added to PSOM_RUN_PIPELINE. --- psom_heartbeat.m | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ psom_run_job.m | 58 ++++++++++++++++++++++++------------------ 2 files changed, 99 insertions(+), 24 deletions(-) create mode 100644 psom_heartbeat.m diff --git a/psom_heartbeat.m b/psom_heartbeat.m new file mode 100644 index 0000000..82ccfcb --- /dev/null +++ b/psom_heartbeat.m @@ -0,0 +1,65 @@ +function [] = psom_heartbeat(file_heart,file_kill,pid) +% This function is internal to PSOM and not meant to be used directly. 
+% +% PSOM_HEARTBEAT(FILE_HEART,FILE_KILL,PID) +% +% FILE_HEART (string) the name of a .mat file. The following variables will be updated every 5s +% inside the .mat file: +% CURR_TIME (vector) the output of clock +% TELAPSED (scalar) the time (s) elapsed since the heartbeat was started. +% FILE_KILL (string) the name of a file. If this file is detected at one of the beats (every 5s) +% the function will send signal 9 to the process PID and exit. +% PID (scalar) the ID of the process to monitor. The heartbeat stops once a signal-0 probe of PID fails. +% +% See licensing information in the code. +% +%system(['octave --eval "cd /home/pbellec/, build_path_std, cd /home/pbellec/tmp/tmp, psom_heartbeat(''toto.mat'',''tata.mat'',' num2str(getpid) '); exit"'],false,'async') + +% Copyright (c) Pierre Bellec, Montreal Neurological Institute, 2008-2010. +% Departement d'informatique et de recherche operationnelle +% Centre de recherche de l'institut de Geriatrie de Montreal +% Universite de Montreal, 2011 +% Maintainer : pierre.bellec@criugm.qc.ca +% See licensing information in the code. +% Keywords : pipeline + +% Permission is hereby granted, free of charge, to any person obtaining a copy +% of this software and associated documentation files (the "Software"), to deal +% in the Software without restriction, including without limitation the rights +% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +% copies of the Software, and to permit persons to whom the Software is +% furnished to do so, subject to the following conditions: +% +% The above copyright notice and this permission notice shall be included in +% all copies or substantial portions of the Software. +% +% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +% THE SOFTWARE. +tic; +flag_beat = true; +while flag_beat + if exist('OCTAVE_VERSION','builtin') + [err,msg] = kill(pid,0); % octave builtin; signal 0 only probes the process, nothing is delivered + else + [err,msg] = system(sprintf('kill -0 %i',pid)); % no kill builtin outside octave: probe with signal 0 through the shell (a bare "kill PID" would terminate the main process) + end + flag_beat = err==0; % a failed probe means the main process is gone: this is the last beat + telapsed = toc; + curr_time = clock; + save(file_heart,'telapsed','curr_time'); + if exist(file_kill,'file') + if exist('OCTAVE_VERSION','builtin') + kill(pid,9); + else + system(sprintf('kill -9 %i',pid)); + end + exit + end + pause(5) +end + \ No newline at end of file diff --git a/psom_run_job.m b/psom_run_job.m index 9aece69..f1f8b18 100644 --- a/psom_run_job.m +++ b/psom_run_job.m @@ -1,8 +1,8 @@ -function flag_failed = psom_run_job(file_job) +function flag_failed = psom_run_job(file_job,flag_heartbeat) % Run a PSOM job. % % SYNTAX: -% FLAG_FAILED = PSOM_RUN_JOB(FILE_JOB) +% FLAG_FAILED = PSOM_RUN_JOB(FILE_JOB , FLAG_HEARTBEAT) %_________________________________________________________________________ % INPUTS: % @@ -12,6 +12,14 @@ % also be specified through a mat file, where the job attributes are % saved as variables. % +% FLAG_HEARTBEAT +% (boolean, default false) if the flag is true, then a new subprocess +% will be started, using matlab or octave, that will generate +% a .heartbeat.mat file updated every 5 seconds. This subprocess +% will also detect the presence of a .kill file and, if detected, +% will kill the main process. This mechanism is only available when +% PSOM_RUN_JOB is called by PSOM_RUN_PIPELINE. 
+% %_________________________________________________________________________ % OUTPUTS: % @@ -64,14 +72,26 @@ psom_gb_vars seed = psom_set_rand_seed(); -try - %% Generate file names - [path_f,name_job,ext_f] = fileparts(file_job); +%% Default options +if nargin < 2 + flag_heartbeat = false; +end - if ~strcmp(ext_f,'.mat') - error('The job file %s should be a .mat file !',file_job); +%% name of the job +if ischar(file_job) + [path_f,name_job,ext_f] = fileparts(file_job); + if isempty(path_f) + path_f = '.'; end + flag_char = true; +else + name_job = 'manual'; + flag_char = false; +end +gb_psom_name_job = name_job; +%% Generate file names +if flag_char && strcmp(ext_f,'.mat') file_jobs = [path_f filesep 'PIPE_jobs.mat']; file_running = [path_f filesep name_job '.running']; file_failed = [path_f filesep name_job '.failed']; @@ -79,14 +99,11 @@ file_profile = [path_f filesep name_job '.profile.mat']; file_heartbeat = [path_f filesep name_job '.heartbeat.mat']; file_kill = [path_f filesep name_job '.kill']; - -catch - name_job = 'manual'; -end -gb_psom_name_job = name_job; +end try - job = sub_load_job(file_jobs,name_job); % This is launched through the pipeline manager + pipe = load(file_jobs,name_job); % This is launched through the pipeline manager + job = pipe.(name_job); flag_psom = true; catch if ischar(file_job) @@ -108,18 +125,15 @@ end %% Start a heartbeat -pid = getpid; -[err,msg] = system(sprintf('kill -0 %i')); % check that the running status of the process can be checked using kill -flag_heartbeat = (err==0)||strcmp('gb_psom_language','octave') -if flag_heartbeat - cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_hearbeat,file_kill,pid); - if strcmp('gb_psom_language','octave') +if flag_psom && flag_heartbeat + main_pid = getpid; + cmd = sprintf('psom_heartbeat(''%s'',''%s'',%i)',file_heartbeat,file_kill,main_pid); + if strcmp(gb_psom_language,'octave') instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), 
%s,exit"',gb_psom_command_octave,gb_psom_opt_matlab,gb_psom_path_psom,cmd); else instr_heartbeat = sprintf('"%s" %s "addpath(''%s''), %s,exit"',gb_psom_command_matlab,gb_psom_opt_matlab,gb_psom_path_psom,cmd); end system([instr_heartbeat '&']); - warning('PSOM was not able to implement a heartbeat on that system, and will not be able to detect a job crash, or to terminate the job if the pipeline is stopped'); end %% Upload job info @@ -231,7 +245,3 @@ eval(command) -function job = sub_load_job(file_jobs,name_job) - -load(file_jobs,name_job); -eval(['job = ' name_job ';']);