Merge branch 'spark'

glennklockwood committed May 24, 2016
2 parents 917d789 + 624c105 commit 6a581f6
Showing 6 changed files with 173 additions and 20 deletions.
2 changes: 1 addition & 1 deletion bin/myhadoop-bootstrap.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
################################################################################
# myhadoop-bootstrap - call from within a job script to do the entire cluster
# setup in a hands-off fashion. This script is admittedly much less useful
2 changes: 1 addition & 1 deletion bin/myhadoop-cleanup.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
################################################################################
# myhadoop-cleanup.sh - clean up all of the directories created by running a
# Hadoop cluster via myHadoop.
69 changes: 53 additions & 16 deletions bin/myhadoop-configure.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
################################################################################
# myhadoop-configure.sh - establish a valid $HADOOP_CONF_DIR with all of the
# configurations necessary to start a Hadoop cluster from within a HPC batch
@@ -304,6 +304,46 @@ if [ ! -e ${config_subs[DFS_NAME_DIR]}/current ]; then
fi
fi

### Enable Spark support if SPARK_HOME is defined
if [ "z$SPARK_HOME" != "z" ]; then
    mh_print " "
    mh_print "Enabling experimental Spark support"
    if [ "z$SPARK_CONF_DIR" == "z" ]; then
        SPARK_CONF_DIR=$HADOOP_CONF_DIR/spark
    fi
    mh_print "Using SPARK_CONF_DIR=$SPARK_CONF_DIR"
    mh_print " "

    mkdir -p $SPARK_CONF_DIR
    cp $SPARK_HOME/conf/* $SPARK_CONF_DIR/
    cp $HADOOP_CONF_DIR/slaves $SPARK_CONF_DIR/slaves

    cat <<EOF >> $SPARK_CONF_DIR/spark-env.sh
export SPARK_CONF_DIR=$SPARK_CONF_DIR
export SPARK_MASTER_IP=$MASTER_NODE
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_DIR=$MH_SCRATCH_DIR/work
export SPARK_LOG_DIR=$MH_SCRATCH_DIR/logs
### the pyspark shell requires this environment variable to be set
export MASTER=spark://$MASTER_NODE:7077
### push out the local environment to all slaves so that any loaded modules
### from the user environment are honored by the execution environment
export PATH=$PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH
### to prevent Spark from binding to the first address it can find
export SPARK_LOCAL_IP=\$(sed -e '$MH_IPOIB_TRANSFORM' <<< \$HOSTNAME)
EOF

    cat <<EOF
To use Spark, you will want to type the following commands:
source $SPARK_CONF_DIR/spark-env.sh
myspark start
EOF
fi
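
For context, a minimal job script exercising this new Spark path might look like the sketch below. The scheduler directives, SPARK_HOME path, and script name are illustrative assumptions, not part of this commit; only myhadoop-configure.sh, the generated spark-env.sh, myspark, and myhadoop-cleanup.sh come from myHadoop itself.

#!/usr/bin/env bash
#PBS -l nodes=2:ppn=16                                # assumed allocation
export SPARK_HOME=$HOME/apps/spark                    # assumption: site-specific install
export HADOOP_CONF_DIR=$HOME/hadoop-conf.$PBS_JOBID   # per-job config dir

### generate the per-job configuration on node-local scratch
myhadoop-configure.sh -s /scratch/$USER/$PBS_JOBID

### pick up the generated Spark environment (SPARK_CONF_DIR defaults to
### $HADOOP_CONF_DIR/spark) and start the standalone cluster
source $HADOOP_CONF_DIR/spark/spark-env.sh
myspark start

### the exported MASTER variable points pyspark at spark://<master>:7077
$SPARK_HOME/bin/pyspark my-analysis.py                # hypothetical script

myspark stop
myhadoop-cleanup.sh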

### Enable HBase support if HBASE_HOME is defined
if [ "z$HBASE_HOME" != "z" ]; then

@@ -330,8 +370,7 @@ if [ "z$HBASE_HOME" != "z" ]; then
mh_print "ZOOKEEPER_QUORUM= $FIRST_NODE,$SECOND_NODE,$THIRD_NODE"
mh_print " "

cat <<EOF > $HBASE_CONF_DIR/myhbase.conf
cat <<EOF > $HBASE_CONF_DIR/myhbase.conf
declare -A config_hbase_subs
config_hbase_subs[NAME_NODE]="$MASTER_NODE"
config_hbase_subs[ZOOKEEPER_DATADIR]="$MH_SCRATCH_DIR/zookeeper"
@@ -340,29 +379,27 @@ config_hbase_subs[HBASE_LOG_DIR]="$MH_SCRATCH_DIR/logs"
config_hbase_subs[HBASE_PID_DIR]="$MH_SCRATCH_DIR/pids"
EOF

    source $HBASE_CONF_DIR/myhbase.conf

    ### And actually apply those substitutions:
    for key in "${!config_hbase_subs[@]}"; do
        for xml in hbase-site.xml
        do
            if [ -f $HBASE_CONF_DIR/$xml ]; then
                sed -i 's#'$key'#'${config_hbase_subs[$key]}'#g' $HBASE_CONF_DIR/$xml
            fi
        done
    done

    cat << EOF >> $HBASE_CONF_DIR/hbase-env.sh
export JAVA_HOME=$JAVA_HOME
export HBASE_LOG_DIR=${config_hbase_subs[HBASE_LOG_DIR]}
export HBASE_PID_DIR=${config_hbase_subs[HBASE_PID_DIR]}
EOF

    cat <<EOF
To use HBase, you will want to type the following commands:
export HBASE_CONF_DIR=$HBASE_CONF_DIR
$HBASE_HOME/bin/start-hbase.sh
EOF

fi
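
To make the substitution mechanism above concrete: each key of config_hbase_subs is a literal placeholder token expected in the hbase-site.xml template, and the sed pass rewrites it in place. A sketch with an assumed template line (not taken from this commit):

### hypothetical placeholder in $HBASE_CONF_DIR/hbase-site.xml:
###     <value>hdfs://NAME_NODE:54310/hbase</value>
### with config_hbase_subs[NAME_NODE]="node-0-1", the loop runs:
sed -i 's#NAME_NODE#node-0-1#g' $HBASE_CONF_DIR/hbase-site.xml
### leaving behind:
###     <value>hdfs://node-0-1:54310/hbase</value>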
50 changes: 50 additions & 0 deletions bin/myspark
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
################################################################################
# myspark - start or stop a standalone Spark cluster. Cannot use the control
# scripts provided with Spark 0.9.0 due to various bugs and lack of
# functionality necessary to create transient Spark clusters.
#
# Glenn K. Lockwood, San Diego Supercomputer Center April 2014
################################################################################

function mh_print {
    echo -e "mySpark: $@"
}

if [ "z$1" == "zstart" ]; then
action="Starting"
elif [ "z$1" == "zstop" ]; then
action="Stopping"
else
echo "Syntax: $0 <start|stop>" >&2
exit 1
fi

### Establish the slaves file containing our worker nodes
if [ "z$SPARK_SLAVES" == "z" ]; then
SPARK_SLAVES=$SPARK_CONF_DIR/slaves
fi
mh_print "Using $SPARK_SLAVES as our slaves file"

### Read in our cluster's unique configuration variables
source $SPARK_CONF_DIR/spark-env.sh
mh_print "Reading in $SPARK_CONF_DIR/spark-env.sh as our slaves file"

### mySpark does not currently support multiple worker instances per node
if [ "$action" == "Starting" ]; then
cmd_ma="$SPARK_HOME/sbin/start-master.sh"
cmd_sl="$SPARK_HOME/sbin/spark-daemon.sh --config $SPARK_CONF_DIR start org.apache.spark.deploy.worker.Worker 1 spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT"
elif [ "$action" == "Stopping" ]; then
cmd_ma="$SPARK_HOME/sbin/stop-master.sh"
cmd_sl="$SPARK_HOME/sbin/spark-daemon.sh --config $SPARK_CONF_DIR stop org.apache.spark.deploy.worker.Worker 1"
else
exit 1
fi

### Apply action across master + slave nodes
$cmd_ma
for slave in $(sort -u $SPARK_SLAVES)
do
mh_print "$action worker on $slave:\n $cmd_sl"
ssh $slave "$cmd_sl"
done
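
One design note on the loop above: sort -u deduplicates the slaves file, so a scheduler that lists a host once per allocated core still yields exactly one worker per node, consistent with the "no multiple worker instances" limitation noted in the comment. A sketch with an assumed slaves file:

### hypothetical slaves file from a 2-node, 2-cores-per-node allocation
printf "node-0-1\nnode-0-1\nnode-0-2\nnode-0-2\n" > /tmp/slaves
sort -u /tmp/slaves    # -> node-0-1 and node-0-2: one ssh/worker per host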
4 changes: 2 additions & 2 deletions etc/myhadoop.conf-gordon
@@ -32,7 +32,7 @@ HADOOP_HOME=/home/glock/hadoop/hadoop-1.2.1
# "node-0-1" can be accessed via IP over InfiniBand by "node-0-1.ibnet0",
# the transform would be "s/$/.ibnet0/"
#
MH_IPOIB_TRANSFORM='s/$/.ibnet0/'
MH_IPOIB_TRANSFORM='s/\([^.]*\).*$/\1.ibnet0/'
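
The effect of this change, with an assumed fully qualified hostname (node-0-1.sdsc.edu is illustrative): the old transform blindly appended the suffix, while the new one strips the domain first.

echo "node-0-1.sdsc.edu" | sed -e 's/$/.ibnet0/'               # old: node-0-1.sdsc.edu.ibnet0
echo "node-0-1.sdsc.edu" | sed -e 's/\([^.]*\).*$/\1.ibnet0/'  # new: node-0-1.ibnet0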

################################################################################
# Variable: MH_SCRATCH_DIR
@@ -63,4 +63,4 @@ MH_SCRATCH_DIR=/scratch/$USER/$PBS_JOBID
#
# This is the location of the user's per-job Hadoop configuration directory.
#
HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
#HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
66 changes: 66 additions & 0 deletions etc/myhadoop.conf-sierra
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
################################################################################
# etc/myhadoop.conf - a list of environment variables that can be set system-
# wide to alter the default behavior of a myHadoop installation. These will
# be overridden by anything provided by the user as a myHadoop option when
# myhadoop-configure.sh is executed. However these will NOT be overridden
# by anything in the user's environment.
#
# This particular configuration is designed for FutureGrid's Sierra resource
#
# Glenn K. Lockwood, San Diego Supercomputer Center April 2014
################################################################################

################################################################################
# Variable: HADOOP_HOME
# Command-line override: -h
#
# This is the base installation of Hadoop on the system. Note that if
# this is defined here, it will override any HADOOP_HOME that may exist in
# the user's environment, effectively railroading the user into using a
# specific Hadoop version when using this installation of myHadoop unless
# they override on the command line.
#
HADOOP_HOME=/N/u/glock/apps/hadoop-2.2.0

################################################################################
# Variable: MH_IPOIB_TRANSFORM
# Command-line override: -i
#
# This is the regex substitution to be applied to all of the hosts in the node
# list before constructing the masters/slaves list. For example, if
# "node-0-1" can be accessed via IP over InfiniBand by "node-0-1.ibnet0",
# the transform would be "s/$/.ibnet0/"
#
MH_IPOIB_TRANSFORM='s/\([^.]*\).*$/\1ib/'
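
Here the shortened hostname gets a bare "ib" suffix rather than ".ibnet0"; assuming a node named s77.example.org (illustrative):

echo "s77.example.org" | sed -e 's/\([^.]*\).*$/\1ib/'   # -> s77ib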

################################################################################
# Variable: MH_SCRATCH_DIR
# Command-line override: -s
#
# This is the location of the node-local scratch space for a system.  You
# may include variables such as $USER and $SLURM_JOBID which will be evaluated
# within the context of the user's myHadoop execution environment. This is
# normally defined using the "-s" option when calling myhadoop-configure.
#
MH_SCRATCH_DIR=/scratch/$USER/$PBS_JOBID

################################################################################
# Variable: MH_PERSIST_DIR
# Command-line override: -p
#
# This specifies the location of a shared filesystem on which persistent
# myHadoop instantiations should be stored when myhadoop-configure is called
# in persistent mode. This is normally specified with the "-p" option when
# running myhadoop-configure. NOTE THAT IF YOU SET THIS, ALL JOBS WILL BE
# RUN IN PERSISTENT MODE unless the user explicitly requests -p ''
#
#MH_PERSIST_DIR=

################################################################################
# Variable: HADOOP_CONF_DIR
# Command-line override: -c
#
# This is the location of the user's per-job Hadoop configuration directory.
#
#HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
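
For reference, the commented-out default derives the directory suffix from the numeric part of the job ID; assuming a job ID of 12345.s77.example.org (illustrative):

cut -d. -f1 <<< "12345.s77.example.org"   # -> 12345, i.e. $HOME/hadoop-conf.12345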
