diff --git a/bin/myhadoop-bootstrap.sh b/bin/myhadoop-bootstrap.sh
index fd50833..9f4c30b 100755
--- a/bin/myhadoop-bootstrap.sh
+++ b/bin/myhadoop-bootstrap.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 ################################################################################
 # myhadoop-bootstrap - call from within a job script to do the entire cluster
 # setup in a hands-off fashion. This script is admittedly much less useful
diff --git a/bin/myhadoop-cleanup.sh b/bin/myhadoop-cleanup.sh
index d48381c..53cac4e 100755
--- a/bin/myhadoop-cleanup.sh
+++ b/bin/myhadoop-cleanup.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 ################################################################################
 # myhadoop-cleanup.sh - clean up all of the directories created by running a
 # Hadoop cluster via myHadoop.
diff --git a/bin/myhadoop-configure.sh b/bin/myhadoop-configure.sh
index a4d7abf..29b11ee 100755
--- a/bin/myhadoop-configure.sh
+++ b/bin/myhadoop-configure.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 ################################################################################
 # myhadoop-configure.sh - establish a valid $HADOOP_CONF_DIR with all of the
 # configurations necessary to start a Hadoop cluster from within an HPC batch
@@ -304,6 +304,46 @@ if [ ! -e ${config_subs[DFS_NAME_DIR]}/current ]; then
   fi
 fi
 
+### Enable Spark support if SPARK_HOME is defined
+if [ "z$SPARK_HOME" != "z" ]; then
+  mh_print " "
+  mh_print "Enabling experimental Spark support"
+  if [ "z$SPARK_CONF_DIR" == "z" ]; then
+    SPARK_CONF_DIR=$HADOOP_CONF_DIR/spark
+  fi
+  mh_print "Using SPARK_CONF_DIR=$SPARK_CONF_DIR"
+  mh_print " "
+
+  mkdir -p $SPARK_CONF_DIR
+  cp $SPARK_HOME/conf/* $SPARK_CONF_DIR/
+  cp $HADOOP_CONF_DIR/slaves $SPARK_CONF_DIR/slaves
+
+  cat <<EOF >> $SPARK_CONF_DIR/spark-env.sh
+export SPARK_CONF_DIR=$SPARK_CONF_DIR
+export SPARK_MASTER_IP=$MASTER_NODE
+export SPARK_MASTER_PORT=7077
+export SPARK_WORKER_DIR=$MH_SCRATCH_DIR/work
+export SPARK_LOG_DIR=$MH_SCRATCH_DIR/logs
+
+### pyspark shell requires this environment variable be set to work
+export MASTER=spark://$MASTER_NODE:7077
+
+### push out the local environment to all slaves so that any loaded modules
+### from the user environment are honored by the execution environment
+export PATH=$PATH
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH
+
+### to prevent Spark from binding to the first address it can find
+export SPARK_LOCAL_IP=\$(sed -e '$MH_IPOIB_TRANSFORM' <<< \$HOSTNAME)
+EOF
+fi
+
-cat <<EOF > $HBASE_CONF_DIR/myhbase.conf
-
+  cat <<EOF > $HBASE_CONF_DIR/myhbase.conf
 declare -A config_hbase_subs
 config_hbase_subs[NAME_NODE]="$MASTER_NODE"
 config_hbase_subs[ZOOKEEPER_DATADIR]="$MH_SCRATCH_DIR/zookeeper"
@@ -340,29 +379,27 @@ config_hbase_subs[HBASE_LOG_DIR]="$MH_SCRATCH_DIR/logs"
 config_hbase_subs[HBASE_PID_DIR]="$MH_SCRATCH_DIR/pids"
 EOF
 
-source $HBASE_CONF_DIR/myhbase.conf
+  source $HBASE_CONF_DIR/myhbase.conf
 
-### And actually apply those substitutions:
-for key in "${!config_hbase_subs[@]}"; do
-  for xml in hbase-site.xml
-  do
-    if [ -f $HBASE_CONF_DIR/$xml ]; then
-      sed -i 's#'$key'#'${config_hbase_subs[$key]}'#g' $HBASE_CONF_DIR/$xml
-    fi
+  ### And actually apply those substitutions:
+  for key in "${!config_hbase_subs[@]}"; do
+    for xml in hbase-site.xml
+    do
+      if [ -f $HBASE_CONF_DIR/$xml ]; then
+        sed -i 's#'$key'#'${config_hbase_subs[$key]}'#g' $HBASE_CONF_DIR/$xml
+      fi
+    done
   done
-done
-
-cat << EOF >> $HBASE_CONF_DIR/hbase-env.sh
+  cat << EOF >> $HBASE_CONF_DIR/hbase-env.sh
 export JAVA_HOME=$JAVA_HOME
 export HBASE_LOG_DIR=${config_hbase_subs[HBASE_LOG_DIR]}
 export HBASE_PID_DIR=${config_hbase_subs[HBASE_PID_DIR]}
 EOF
diff --git a/bin/myspark b/bin/myspark
new file mode 100755
--- /dev/null
+++ b/bin/myspark
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+if [ "z$1" == "zstart" ]; then
+  action="Starting"
+elif [ "z$1" == "zstop" ]; then
+  action="Stopping"
+else
+  echo "Syntax: $0 <start|stop>" >&2
+  exit 1
+fi
+
+### Establish the slaves file containing our worker nodes
+if [ "z$SPARK_SLAVES" == "z" ]; then
+  SPARK_SLAVES=$SPARK_CONF_DIR/slaves
+fi
+mh_print "Using $SPARK_SLAVES as our slaves file"
+
+### Read in our cluster's unique configuration variables
+source $SPARK_CONF_DIR/spark-env.sh
+mh_print "Reading in $SPARK_CONF_DIR/spark-env.sh for our cluster's configuration"
+
+### mySpark does not currently support multiple worker instances per node
+if [ "$action" == "Starting" ]; then
+  cmd_ma="$SPARK_HOME/sbin/start-master.sh"
+  cmd_sl="$SPARK_HOME/sbin/spark-daemon.sh --config $SPARK_CONF_DIR start org.apache.spark.deploy.worker.Worker 1 spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT"
+elif [ "$action" == "Stopping" ]; then
+  cmd_ma="$SPARK_HOME/sbin/stop-master.sh"
+  cmd_sl="$SPARK_HOME/sbin/spark-daemon.sh --config $SPARK_CONF_DIR stop org.apache.spark.deploy.worker.Worker 1"
+else
+  exit 1
+fi
+
+### Apply action across master + slave nodes
+$cmd_ma
+for slave in $(sort -u $SPARK_SLAVES)
+do
+  mh_print "$action worker on $slave:\n  $cmd_sl"
+  ssh $slave "$cmd_sl"
+done
diff --git a/etc/myhadoop.conf-gordon b/etc/myhadoop.conf-gordon
index df92989..3ececb7 100644
--- a/etc/myhadoop.conf-gordon
+++ b/etc/myhadoop.conf-gordon
@@ -32,7 +32,7 @@ HADOOP_HOME=/home/glock/hadoop/hadoop-1.2.1
 # "node-0-1" can be accessed via IP over InfiniBand by "node-0-1.ibnet0",
 # the transform would be "s/$/.ibnet0/"
 #
-MH_IPOIB_TRANSFORM='s/$/.ibnet0/'
+MH_IPOIB_TRANSFORM='s/\([^.]*\).*$/\1.ibnet0/'
 
 ################################################################################
 # Variable: MH_SCRATCH_DIR
@@ -63,4 +63,4 @@ MH_SCRATCH_DIR=/scratch/$USER/$PBS_JOBID
 #
 # This is the location of the user's per-job Hadoop configuration directory.
 #
-HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
+#HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
diff --git a/etc/myhadoop.conf-sierra b/etc/myhadoop.conf-sierra
new file mode 100644
index 0000000..738689b
--- /dev/null
+++ b/etc/myhadoop.conf-sierra
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+################################################################################
+# etc/myhadoop.conf - a list of environment variables that can be set system-
+# wide to alter the default behavior of a myHadoop installation. These will
+# be overridden by anything provided by the user as a myHadoop option when
+# myhadoop-configure.sh is executed. However, these will NOT be overridden
+# by anything in the user's environment.
+#
+# This particular configuration is designed for FutureGrid's Sierra resource
+#
+# Glenn K. Lockwood, San Diego Supercomputer Center                 April 2014
+################################################################################
+
+################################################################################
+# Variable: HADOOP_HOME
+# Command-line override: -h
+#
+# This is the base installation of Hadoop on the system. Note that if
+# this is defined here, it will override any HADOOP_HOME that may exist in
+# the user's environment, effectively railroading the user into using a
+# specific Hadoop version when using this installation of myHadoop unless
+# they override on the command line.
+#
+HADOOP_HOME=/N/u/glock/apps/hadoop-2.2.0
+
+################################################################################
+# Variable: MH_IPOIB_TRANSFORM
+# Command-line override: -i
+#
+# This is the regex substitution to be applied to all of the hosts in the node
+# list before constructing the masters/slaves list. For example, if
+# "node-0-1" can be accessed via IP over InfiniBand by "node-0-1.ibnet0",
+# the transform would be "s/$/.ibnet0/"
+#
+MH_IPOIB_TRANSFORM='s/\([^.]*\).*$/\1ib/'
+
+################################################################################
+# Variable: MH_SCRATCH_DIR
+# Command-line override: -s
+#
+# This is the location of the node-local scratch space for a system. You
+# may include variables such as $USER and $SLURM_JOBID which will be evaluated
+# within the context of the user's myHadoop execution environment. This is
+# normally defined using the "-s" option when calling myhadoop-configure.
+#
+MH_SCRATCH_DIR=/scratch/$USER/$PBS_JOBID
+
+################################################################################
+# Variable: MH_PERSIST_DIR
+# Command-line override: -p
+#
+# This specifies the location of a shared filesystem on which persistent
+# myHadoop instantiations should be stored when myhadoop-configure is called
+# in persistent mode. This is normally specified with the "-p" option when
+# running myhadoop-configure. NOTE THAT IF YOU SET THIS, ALL JOBS WILL BE
+# RUN IN PERSISTENT MODE unless the user explicitly requests -p ''
+#
+#MH_PERSIST_DIR=
+
+################################################################################
+# Variable: HADOOP_CONF_DIR
+# Command-line override: -c
+#
+# This is the location of the user's per-job Hadoop configuration directory.
+#
+#HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
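
Note on the MH_IPOIB_TRANSFORM change: the old transform 's/$/.ibnet0/' blindly
appended the suffix, so a fully-qualified hostname such as "node-0-1.sdsc.edu"
would become "node-0-1.sdsc.edu.ibnet0". The new expression captures everything
up to the first dot and discards the rest of the name before appending the
suffix, so it now handles both short and fully-qualified hostnames. A quick
sanity check from any shell (the hostnames here are made up for illustration):

    $ sed -e 's/\([^.]*\).*$/\1.ibnet0/' <<< node-0-1.sdsc.edu
    node-0-1.ibnet0
    $ sed -e 's/\([^.]*\).*$/\1.ibnet0/' <<< node-0-1
    node-0-1.ibnet0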
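
To exercise the new Spark path end to end, a job script would look roughly like
the sketch below. This is a hedged illustration, not part of the patch: the
resource requests, install paths, and job script name are hypothetical, and
only SPARK_HOME (whose presence triggers the Spark setup in
myhadoop-configure.sh), the generated spark-env.sh, and the new myspark
wrapper come from this change.

    #!/usr/bin/env bash
    #PBS -l nodes=2:ppn=16
    #PBS -l walltime=00:30:00

    # Hypothetical paths; adjust per site. Setting SPARK_HOME is what enables
    # the experimental Spark support in myhadoop-configure.sh.
    export SPARK_HOME=$HOME/apps/spark-0.9.0
    export HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)

    # Stand up the per-job configuration (assumes myHadoop's bin/ is in $PATH)
    myhadoop-configure.sh -s /scratch/$USER/$PBS_JOBID

    # Pick up SPARK_CONF_DIR, SPARK_MASTER_IP, and MASTER as written above
    source $HADOOP_CONF_DIR/spark/spark-env.sh

    myspark start                       # master on this node, one worker per slave
    $SPARK_HOME/bin/pyspark my_job.py   # MASTER is already set by spark-env.sh
    myspark stop
    myhadoop-cleanup.sh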