Merge branch 'spark'

glennklockwood committed May 24, 2016
2 parents 917d789 + 624c105 commit 6a581f6
Showing 6 changed files with 173 additions and 20 deletions.
2 changes: 1 addition & 1 deletion bin/myhadoop-bootstrap.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
################################################################################
# myhadoop-bootstrap - call from within a job script to do the entire cluster
# setup in a hands-off fashion. This script is admittedly much less useful
2 changes: 1 addition & 1 deletion bin/myhadoop-cleanup.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
################################################################################
# myhadoop-cleanup.sh - clean up all of the directories created by running a
# Hadoop cluster via myHadoop.
69 changes: 53 additions & 16 deletions bin/myhadoop-configure.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
################################################################################
# myhadoop-configure.sh - establish a valid $HADOOP_CONF_DIR with all of the
# configurations necessary to start a Hadoop cluster from within a HPC batch
@@ -304,6 +304,46 @@ if [ ! -e ${config_subs[DFS_NAME_DIR]}/current ]; then
fi
fi

### Enable Spark support if SPARK_HOME is defined
if [ "z$SPARK_HOME" != "z" ]; then
    mh_print " "
    mh_print "Enabling experimental Spark support"
    if [ "z$SPARK_CONF_DIR" == "z" ]; then
        SPARK_CONF_DIR=$HADOOP_CONF_DIR/spark
    fi
    mh_print "Using SPARK_CONF_DIR=$SPARK_CONF_DIR"
    mh_print " "

    mkdir -p $SPARK_CONF_DIR
    cp $SPARK_HOME/conf/* $SPARK_CONF_DIR/
    cp $HADOOP_CONF_DIR/slaves $SPARK_CONF_DIR/slaves

    cat <<EOF >> $SPARK_CONF_DIR/spark-env.sh
export SPARK_CONF_DIR=$SPARK_CONF_DIR
export SPARK_MASTER_IP=$MASTER_NODE
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_DIR=$MH_SCRATCH_DIR/work
export SPARK_LOG_DIR=$MH_SCRATCH_DIR/logs
### the pyspark shell requires this environment variable to be set
export MASTER=spark://$MASTER_NODE:7077
### push out the local environment to all slaves so that any loaded modules
### from the user environment are honored by the execution environment
export PATH=$PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH
### to prevent Spark from binding to the first address it can find
export SPARK_LOCAL_IP=\$(sed -e '$MH_IPOIB_TRANSFORM' <<< \$HOSTNAME)
EOF

    cat <<EOF
To use Spark, you will want to type the following commands:
source $SPARK_CONF_DIR/spark-env.sh
myspark start
EOF
fi
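
For context, a minimal job script exercising this new Spark path might look like the sketch below. The scheduler directives, SPARK_HOME path, and script name are illustrative assumptions, not part of this commit; only myhadoop-configure.sh, the generated spark-env.sh, myspark, and myhadoop-cleanup.sh come from myHadoop itself.

#!/usr/bin/env bash
#PBS -l nodes=2:ppn=16                                # assumed allocation
export SPARK_HOME=$HOME/apps/spark                    # assumption: site-specific install
export HADOOP_CONF_DIR=$HOME/hadoop-conf.$PBS_JOBID   # per-job config dir

### generate the per-job configuration on node-local scratch
myhadoop-configure.sh -s /scratch/$USER/$PBS_JOBID

### pick up the generated Spark environment (SPARK_CONF_DIR defaults to
### $HADOOP_CONF_DIR/spark) and start the standalone cluster
source $HADOOP_CONF_DIR/spark/spark-env.sh
myspark start

### the exported MASTER variable points pyspark at spark://<master>:7077
$SPARK_HOME/bin/pyspark my-analysis.py                # hypothetical script

myspark stop
myhadoop-cleanup.sh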

### Enable HBase support if HBASE_HOME is defined
if [ "z$HBASE_HOME" != "z" ]; then

@@ -330,8 +370,7 @@ if [ "z$HBASE_HOME" != "z" ]; then
mh_print "ZOOKEEPER_QUORUM= $FIRST_NODE,$SECOND_NODE,$THIRD_NODE"
mh_print " "

cat <<EOF > $HBASE_CONF_DIR/myhbase.conf
cat <<EOF > $HBASE_CONF_DIR/myhbase.conf
declare -A config_hbase_subs
config_hbase_subs[NAME_NODE]="$MASTER_NODE"
config_hbase_subs[ZOOKEEPER_DATADIR]="$MH_SCRATCH_DIR/zookeeper"
@@ -340,29 +379,27 @@ config_hbase_subs[HBASE_LOG_DIR]="$MH_SCRATCH_DIR/logs"
config_hbase_subs[HBASE_PID_DIR]="$MH_SCRATCH_DIR/pids"
EOF

    source $HBASE_CONF_DIR/myhbase.conf

    ### And actually apply those substitutions:
    for key in "${!config_hbase_subs[@]}"; do
        for xml in hbase-site.xml
        do
            if [ -f $HBASE_CONF_DIR/$xml ]; then
                sed -i 's#'$key'#'${config_hbase_subs[$key]}'#g' $HBASE_CONF_DIR/$xml
            fi
        done
    done

    cat << EOF >> $HBASE_CONF_DIR/hbase-env.sh
export JAVA_HOME=$JAVA_HOME
export HBASE_LOG_DIR=${config_hbase_subs[HBASE_LOG_DIR]}
export HBASE_PID_DIR=${config_hbase_subs[HBASE_PID_DIR]}
EOF

    cat <<EOF
To use HBase, you will want to type the following commands:
export HBASE_CONF_DIR=$HBASE_CONF_DIR
$HBASE_HOME/bin/start-hbase.sh
EOF

fi
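
To make the substitution mechanism above concrete: each key of config_hbase_subs is a literal placeholder token expected in the hbase-site.xml template, and the sed pass rewrites it in place. A sketch with an assumed template line (not taken from this commit):

### hypothetical placeholder in $HBASE_CONF_DIR/hbase-site.xml:
###     <value>hdfs://NAME_NODE:54310/hbase</value>
### with config_hbase_subs[NAME_NODE]="node-0-1", the loop runs:
sed -i 's#NAME_NODE#node-0-1#g' $HBASE_CONF_DIR/hbase-site.xml
### leaving behind:
###     <value>hdfs://node-0-1:54310/hbase</value>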
50 changes: 50 additions & 0 deletions bin/myspark
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
################################################################################
# myspark - start or stop a standalone Spark cluster. Cannot use the control
# scripts provided with Spark 0.9.0 due to various bugs and lack of
# functionality necessary to create transient Spark clusters.
#
# Glenn K. Lockwood, San Diego Supercomputer Center April 2014
################################################################################

function mh_print {
    echo -e "mySpark: $@"
}

if [ "z$1" == "zstart" ]; then
action="Starting"
elif [ "z$1" == "zstop" ]; then
action="Stopping"
else
echo "Syntax: $0 <start|stop>" >&2
exit 1
fi

### Establish the slaves file containing our worker nodes
if [ "z$SPARK_SLAVES" == "z" ]; then
SPARK_SLAVES=$SPARK_CONF_DIR/slaves
fi
mh_print "Using $SPARK_SLAVES as our slaves file"

### Read in our cluster's unique configuration variables
source $SPARK_CONF_DIR/spark-env.sh
mh_print "Reading in $SPARK_CONF_DIR/spark-env.sh as our slaves file"

### mySpark does not currently support multiple worker instances per node
if [ "$action" == "Starting" ]; then
cmd_ma="$SPARK_HOME/sbin/start-master.sh"
cmd_sl="$SPARK_HOME/sbin/spark-daemon.sh --config $SPARK_CONF_DIR start org.apache.spark.deploy.worker.Worker 1 spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT"
elif [ "$action" == "Stopping" ]; then
cmd_ma="$SPARK_HOME/sbin/stop-master.sh"
cmd_sl="$SPARK_HOME/sbin/spark-daemon.sh --config $SPARK_CONF_DIR stop org.apache.spark.deploy.worker.Worker 1"
else
exit 1
fi

### Apply action across master + slave nodes
$cmd_ma
for slave in $(sort -u $SPARK_SLAVES)
do
mh_print "$action worker on $slave:\n $cmd_sl"
ssh $slave "$cmd_sl"
done
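
One design note on the loop above: sort -u deduplicates the slaves file, so a scheduler that lists a host once per allocated core still yields exactly one worker per node, consistent with the "no multiple worker instances" limitation noted in the comment. A sketch with an assumed slaves file:

### hypothetical slaves file from a 2-node, 2-cores-per-node allocation
printf "node-0-1\nnode-0-1\nnode-0-2\nnode-0-2\n" > /tmp/slaves
sort -u /tmp/slaves    # -> node-0-1 and node-0-2: one ssh/worker per host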
4 changes: 2 additions & 2 deletions etc/myhadoop.conf-gordon
@@ -32,7 +32,7 @@ HADOOP_HOME=/home/glock/hadoop/hadoop-1.2.1
# "node-0-1" can be accessed via IP over InfiniBand by "node-0-1.ibnet0",
# the transform would be "s/$/.ibnet0/"
#
MH_IPOIB_TRANSFORM='s/$/.ibnet0/'
MH_IPOIB_TRANSFORM='s/\([^.]*\).*$/\1.ibnet0/'
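
The effect of this change, with an assumed fully qualified hostname (node-0-1.sdsc.edu is illustrative): the old transform blindly appended the suffix, while the new one strips the domain first.

echo "node-0-1.sdsc.edu" | sed -e 's/$/.ibnet0/'               # old: node-0-1.sdsc.edu.ibnet0
echo "node-0-1.sdsc.edu" | sed -e 's/\([^.]*\).*$/\1.ibnet0/'  # new: node-0-1.ibnet0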

################################################################################
# Variable: MH_SCRATCH_DIR
@@ -63,4 +63,4 @@ MH_SCRATCH_DIR=/scratch/$USER/$PBS_JOBID
#
# This is the location of the user's per-job Hadoop configuration directory.
#
HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
#HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
66 changes: 66 additions & 0 deletions etc/myhadoop.conf-sierra
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
################################################################################
# etc/myhadoop.conf - a list of environment variables that can be set system-
# wide to alter the default behavior of a myHadoop installation. These will
# be overridden by anything provided by the user as a myHadoop option when
# myhadoop-configure.sh is executed. However these will NOT be overridden
# by anything in the user's environment.
#
# This particular configuration is designed for FutureGrid's Sierra resource
#
# Glenn K. Lockwood, San Diego Supercomputer Center April 2014
################################################################################

################################################################################
# Variable: HADOOP_HOME
# Command-line override: -h
#
# This is the base installation of Hadoop on the system. Note that if
# this is defined here, it will override any HADOOP_HOME that may exist in
# the user's environment, effectively railroading the user into using a
# specific Hadoop version when using this installation of myHadoop unless
# they override on the command line.
#
HADOOP_HOME=/N/u/glock/apps/hadoop-2.2.0

################################################################################
# Variable: MH_IPOIB_TRANSFORM
# Command-line override: -i
#
# This is the regex substitution to be applied to all of the hosts in the node
# list before constructing the masters/slaves list. For example, if
# "node-0-1" can be accessed via IP over InfiniBand by "node-0-1.ibnet0",
# the transform would be "s/$/.ibnet0/"
#
MH_IPOIB_TRANSFORM='s/\([^.]*\).*$/\1ib/'
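
Here the shortened hostname gets a bare "ib" suffix rather than ".ibnet0"; assuming a node named s77.example.org (illustrative):

echo "s77.example.org" | sed -e 's/\([^.]*\).*$/\1ib/'   # -> s77ib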

################################################################################
# Variable: MH_SCRATCH_DIR
# Command-line override: -s
#
# This is the location of the node-local scratch space for a system.  You
# may include variables such as $USER and $SLURM_JOBID which will be evaluated
# within the context of the user's myHadoop execution environment. This is
# normally defined using the "-s" option when calling myhadoop-configure.
#
MH_SCRATCH_DIR=/scratch/$USER/$PBS_JOBID

################################################################################
# Variable: MH_PERSIST_DIR
# Command-line override: -p
#
# This specifies the location of a shared filesystem on which persistent
# myHadoop instantiations should be stored when myhadoop-configure is called
# in persistent mode. This is normally specified with the "-p" option when
# running myhadoop-configure. NOTE THAT IF YOU SET THIS, ALL JOBS WILL BE
# RUN IN PERSISTENT MODE unless the user explicitly requests -p ''
#
#MH_PERSIST_DIR=

################################################################################
# Variable: HADOOP_CONF_DIR
# Command-line override: -c
#
# This is the location of the user's per-job Hadoop configuration directory.
#
#HADOOP_CONF_DIR=$HOME/hadoop-conf.$(cut -d. -f1 <<< $PBS_JOBID)
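
For reference, the commented-out default derives the directory suffix from the numeric part of the job ID; assuming a job ID of 12345.s77.example.org (illustrative):

cut -d. -f1 <<< "12345.s77.example.org"   # -> 12345, i.e. $HOME/hadoop-conf.12345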
