Skip to content

Commit

Permalink
added support for Hadoop 2.2 and YARN
Browse files Browse the repository at this point in the history
  • Loading branch information
glennklockwood committed Mar 31, 2014
1 parent c578c48 commit 58fd302
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 5 deletions.
22 changes: 22 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Known Issues
------------
No known issues at this time.

Functionality
-------------
* At present, all Hadoop daemons are not secured and assume the underlying
cluster environment will provide the necessary access controls to the
  compute nodes.  The current configuration allows any user and any datanode
  to join the existing cluster.  This should be remedied by specifying
* cluster admin
* namenode/datanode includes
* integrate spark support directly into myhadoop-configure.sh by checking
for SPARK_HOME instead of using the separate myspark-configure.sh script

Framework
---------
* separate out the redundant functionality into libexec/myhadoop-driver.sh
* put a "myhadoop" front-end application interface to the backend
configure/cleanup scripts
* begin re-implementing backends in perl
* add unit tests
27 changes: 22 additions & 5 deletions bin/myhadoop-configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,13 @@ fi
mkdir -p "$HADOOP_CONF_DIR"

### First copy over all default Hadoop configs.  The location of the stock
### config directory also tells us which major Hadoop version we are driving:
### 1.x ships configs in $HADOOP_HOME/conf, 2.x in $HADOOP_HOME/etc/hadoop.
if [ -d "$HADOOP_HOME/conf" ]; then             # Hadoop 1.x layout
    cp "$HADOOP_HOME"/conf/* "$HADOOP_CONF_DIR"
    MH_HADOOP_VERS=1
elif [ -d "$HADOOP_HOME/etc/hadoop" ]; then     # Hadoop 2.x layout
    cp "$HADOOP_HOME"/etc/hadoop/* "$HADOOP_CONF_DIR"
    MH_HADOOP_VERS=2
else
    ### Neither layout found: record "unknown" (0) so later version checks
    ### like [ "$MH_HADOOP_VERS" -eq 1 ] do not fail on an empty variable.
    MH_HADOOP_VERS=0
fi

### Pick the master node as the first node in the nodefile
MASTER_NODE=$(print_nodelist | /usr/bin/head -n1)
Expand Down Expand Up @@ -228,18 +234,23 @@ source $HADOOP_CONF_DIR/myhadoop.conf

### And actually apply those substitutions: for every placeholder key in the
### config_subs associative array (populated from myhadoop.conf), rewrite it
### to its concrete value in each site config that exists.  yarn-site.xml is
### only present for Hadoop 2.x, hence the existence check before sed.
for key in "${!config_subs[@]}"; do
    for xml in mapred-site.xml core-site.xml hdfs-site.xml yarn-site.xml
    do
        if [ -f "$HADOOP_CONF_DIR/$xml" ]; then
            ### Quote the whole sed expression so substitution values that
            ### contain whitespace do not word-split into extra arguments.
            sed -i "s#${key}#${config_subs[$key]}#g" "$HADOOP_CONF_DIR/$xml"
        fi
    done
done

### A few Hadoop file locations are set via environment variables:
cat <<EOF >> $HADOOP_CONF_DIR/hadoop-env.sh
cat << EOF >> $HADOOP_CONF_DIR/hadoop-env.sh
# myHadoop alterations for this job:
export HADOOP_LOG_DIR=${config_subs[HADOOP_LOG_DIR]}
export HADOOP_PID_DIR=${config_subs[HADOOP_PID_DIR]}
export YARN_LOG_DIR=${config_subs[HADOOP_LOG_DIR]} # no effect if using Hadoop 1
export YARN_PID_DIR=${config_subs[HADOOP_PID_DIR]} # no effect if using Hadoop 1
export HADOOP_SECURE_DN_PID_DIR=${config_subs[HADOOP_PID_DIR]}
export HADOOP_HOME_WARN_SUPPRESS=TRUE
export JAVA_HOME=$JAVA_HOME
### Jetty leaves garbage in /tmp no matter what \$TMPDIR is; this is an extreme
Expand Down Expand Up @@ -271,5 +282,11 @@ fi

### Format HDFS if it does not already exist from persistent mode.  The
### presence of the "current" directory under the namenode's name dir means
### a formatted filesystem already exists and must not be clobbered.
if [ ! -e "${config_subs[DFS_NAME_DIR]}/current" ]; then
    if [ "$MH_HADOOP_VERS" -eq 1 ]; then
        ### Hadoop 1.x: format via the monolithic 'hadoop' front-end
        HADOOP_CONF_DIR=$HADOOP_CONF_DIR "$HADOOP_HOME/bin/hadoop" namenode -format -nonInteractive -force
    elif [ "$MH_HADOOP_VERS" -eq 2 ]; then
        ### Hadoop 2.x: HDFS commands moved to the dedicated 'hdfs' front-end
        HADOOP_CONF_DIR=$HADOOP_CONF_DIR "$HADOOP_HOME/bin/hdfs" namenode -format
    else
        mh_print "Unknown Hadoop version. You must format namenode manually."
    fi
fi
134 changes: 134 additions & 0 deletions myhadoop-2.2.0.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
diff -ruN a/core-site.xml b/core-site.xml
--- a/core-site.xml 2014-03-30 10:25:24.420801062 -0700
+++ b/core-site.xml 2014-03-30 11:47:32.415743269 -0700
@@ -17,4 +17,15 @@
<!-- Put site-specific property overrides in this file. -->

<configuration>
+<property>
+ <name>hadoop.tmp.dir</name>
+ <value>HADOOP_TMP_DIR</value>
+ <description>A base for other temporary directories.</description>
+</property>
+
+<property>
+ <name>fs.defaultFS</name>
+ <value>hdfs://MASTER_NODE:54310</value>
+</property>
+
</configuration>
diff -ruN a/hdfs-site.xml b/hdfs-site.xml
--- a/hdfs-site.xml 2014-03-30 10:25:24.420801062 -0700
+++ b/hdfs-site.xml 2014-03-30 11:42:31.974815902 -0700
@@ -18,4 +18,34 @@

<configuration>

+ <property>
+ <name>dfs.namenode.name.dir</name>
+ <value>DFS_NAME_DIR</value>
+ <description>Determines where on the local filesystem the DFS name node
+ should store the name table. If this is a comma-delimited list
+ of directories then the name table is replicated in all of the
+ directories, for redundancy. </description>
+ <final>true</final>
+ </property>
+
+ <property>
+ <name>dfs.datanode.data.dir</name>
+ <value>DFS_DATA_DIR</value>
+ <description>Determines where on the local filesystem an DFS data node
+ should store its blocks. If this is a comma-delimited
+ list of directories, then data will be stored in all named
+ directories, typically on different devices.
+ Directories that do not exist are ignored.
+ </description>
+ <final>true</final>
+ </property>
+
+ <property>
+ <name>dfs.namenode.secondary.http-address</name>
+ <value>MASTER_NODE:50090</value>
+ <description>The secondary namenode http server address and
+ port.</description>
+ <final>true</final>
+ </property>
+
</configuration>
diff -ruN a/mapred-site.xml b/mapred-site.xml
--- a/mapred-site.xml 1969-12-31 16:00:00.000000000 -0800
+++ b/mapred-site.xml 2014-03-30 13:52:18.101773941 -0700
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+<property>
+ <name>mapreduce.framework.name</name>
+ <value>yarn</value>
+ <description>The runtime framework for executing MapReduce jobs. Can be one of
+ local, classic or yarn.</description>
+</property>
+
+</configuration>
diff -ruN a/yarn-site.xml b/yarn-site.xml
--- a/yarn-site.xml 2014-03-30 10:25:24.420801062 -0700
+++ b/yarn-site.xml 2014-03-30 13:52:09.947889010 -0700
@@ -15,5 +15,41 @@
<configuration>

<!-- Site specific YARN configuration properties -->
+ <property>
+ <name>yarn.resourcemanager.hostname</name>
+ <value>MASTER_NODE</value>
+ <description>The hostname of the RM.</description>
+ <final>true</final>
+ </property>
+
+ <property>
+ <name>yarn.nodemanager.local-dirs</name>
+ <value>MAPRED_LOCAL_DIR</value>
+ <description>List of directories where the NodeManager stores localized
+ files.  Default: ${hadoop.tmp.dir}/nm-local-dir</description>
+ </property>
+
+<!-- yarn.nodemanager.log-dirs defaults to ${yarn.log.dir}/userlogs, where
+ yarn.log.dir is set by yarn-env.sh via the YARN_LOG_DIR environment
+ variable -->
+
+<!-- these are necessary for mapreduce to work with YARN -->
+ <property>
+ <name>yarn.nodemanager.aux-services</name>
+ <value>mapreduce_shuffle</value>
+ <description>The valid service name should only contain a-zA-Z0-9_ and can
+ not start with numbers. Default: none</description>
+ </property>
+
+<!--
+ <property>
+ <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
+ <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+ <description>Java class to handle the shuffle stage of
+ mapreduce.
+ Default: org.apache.hadoop.mapred.ShuffleHandler</description>
+ </property>
+-->
+

</configuration>

0 comments on commit 58fd302

Please sign in to comment.