Skip to content

Commit

Permalink
added support for Hadoop 2.2 and YARN
Browse files Browse the repository at this point in the history
  • Loading branch information
glennklockwood committed Mar 31, 2014
1 parent c578c48 commit 58fd302
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 5 deletions.
22 changes: 22 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Known Issues
------------
No known issues at this time.

Functionality
-------------
* At present, all Hadoop daemons are not secured and assume the underlying
cluster environment will provide the necessary access controls to the
  compute nodes.  The current configuration allows any user and any datanode
  to join the existing cluster.  This should be remedied by specifying
* cluster admin
* namenode/datanode includes
* integrate spark support directly into myhadoop-configure.sh by checking
for SPARK_HOME instead of using the separate myspark-configure.sh script

Framework
---------
* separate out the redundant functionality into libexec/myhadoop-driver.sh
* put a "myhadoop" front-end application interface to the backend
configure/cleanup scripts
* begin re-implementing backends in perl
* add unit tests
27 changes: 22 additions & 5 deletions bin/myhadoop-configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,13 @@ fi
mkdir -p "$HADOOP_CONF_DIR"

### First copy over all default Hadoop configs.  The location of the stock
### config directory also tells us which major Hadoop version we are driving:
### 1.x ships configs in $HADOOP_HOME/conf, 2.x in $HADOOP_HOME/etc/hadoop.
if [ -d "$HADOOP_HOME/conf" ]; then             # Hadoop 1.x layout
    cp "$HADOOP_HOME"/conf/* "$HADOOP_CONF_DIR"
    MH_HADOOP_VERS=1
elif [ -d "$HADOOP_HOME/etc/hadoop" ]; then     # Hadoop 2.x layout
    cp "$HADOOP_HOME"/etc/hadoop/* "$HADOOP_CONF_DIR"
    MH_HADOOP_VERS=2
else
    ### Neither layout found: record "unknown" (0) so later version checks
    ### like [ "$MH_HADOOP_VERS" -eq 1 ] do not fail on an empty variable.
    MH_HADOOP_VERS=0
fi

### Pick the master node as the first node in the nodefile
MASTER_NODE=$(print_nodelist | /usr/bin/head -n1)
Expand Down Expand Up @@ -228,18 +234,23 @@ source $HADOOP_CONF_DIR/myhadoop.conf

### And actually apply those substitutions: for every placeholder key in the
### config_subs associative array (populated from myhadoop.conf), rewrite it
### to its concrete value in each site config that exists.  yarn-site.xml is
### only present for Hadoop 2.x, hence the existence check before sed.
for key in "${!config_subs[@]}"; do
    for xml in mapred-site.xml core-site.xml hdfs-site.xml yarn-site.xml
    do
        if [ -f "$HADOOP_CONF_DIR/$xml" ]; then
            ### Quote the whole sed expression so substitution values that
            ### contain whitespace do not word-split into extra arguments.
            sed -i "s#${key}#${config_subs[$key]}#g" "$HADOOP_CONF_DIR/$xml"
        fi
    done
done

### A few Hadoop file locations are set via environment variables:
cat <<EOF >> $HADOOP_CONF_DIR/hadoop-env.sh
cat << EOF >> $HADOOP_CONF_DIR/hadoop-env.sh
# myHadoop alterations for this job:
export HADOOP_LOG_DIR=${config_subs[HADOOP_LOG_DIR]}
export HADOOP_PID_DIR=${config_subs[HADOOP_PID_DIR]}
export YARN_LOG_DIR=${config_subs[HADOOP_LOG_DIR]} # no effect if using Hadoop 1
export YARN_PID_DIR=${config_subs[HADOOP_PID_DIR]} # no effect if using Hadoop 1
export HADOOP_SECURE_DN_PID_DIR=${config_subs[HADOOP_PID_DIR]}
export HADOOP_HOME_WARN_SUPPRESS=TRUE
export JAVA_HOME=$JAVA_HOME
### Jetty leaves garbage in /tmp no matter what \$TMPDIR is; this is an extreme
Expand Down Expand Up @@ -271,5 +282,11 @@ fi

### Format HDFS if it does not already exist from persistent mode.  The
### presence of the "current" directory under the namenode's name dir means
### a formatted filesystem already exists and must not be clobbered.
if [ ! -e "${config_subs[DFS_NAME_DIR]}/current" ]; then
    if [ "$MH_HADOOP_VERS" -eq 1 ]; then
        ### Hadoop 1.x: format via the monolithic 'hadoop' front-end
        HADOOP_CONF_DIR=$HADOOP_CONF_DIR "$HADOOP_HOME/bin/hadoop" namenode -format -nonInteractive -force
    elif [ "$MH_HADOOP_VERS" -eq 2 ]; then
        ### Hadoop 2.x: HDFS commands moved to the dedicated 'hdfs' front-end
        HADOOP_CONF_DIR=$HADOOP_CONF_DIR "$HADOOP_HOME/bin/hdfs" namenode -format
    else
        mh_print "Unknown Hadoop version. You must format namenode manually."
    fi
fi
134 changes: 134 additions & 0 deletions myhadoop-2.2.0.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
diff -ruN a/core-site.xml b/core-site.xml
--- a/core-site.xml 2014-03-30 10:25:24.420801062 -0700
+++ b/core-site.xml 2014-03-30 11:47:32.415743269 -0700
@@ -17,4 +17,15 @@
<!-- Put site-specific property overrides in this file. -->

<configuration>
+<property>
+ <name>hadoop.tmp.dir</name>
+ <value>HADOOP_TMP_DIR</value>
+ <description>A base for other temporary directories.</description>
+</property>
+
+<property>
+ <name>fs.defaultFS</name>
+ <value>hdfs://MASTER_NODE:54310</value>
+</property>
+
</configuration>
diff -ruN a/hdfs-site.xml b/hdfs-site.xml
--- a/hdfs-site.xml 2014-03-30 10:25:24.420801062 -0700
+++ b/hdfs-site.xml 2014-03-30 11:42:31.974815902 -0700
@@ -18,4 +18,34 @@

<configuration>

+ <property>
+ <name>dfs.namenode.name.dir</name>
+ <value>DFS_NAME_DIR</value>
+ <description>Determines where on the local filesystem the DFS name node
+ should store the name table. If this is a comma-delimited list
+ of directories then the name table is replicated in all of the
+ directories, for redundancy. </description>
+ <final>true</final>
+ </property>
+
+ <property>
+ <name>dfs.datanode.data.dir</name>
+ <value>DFS_DATA_DIR</value>
+ <description>Determines where on the local filesystem an DFS data node
+ should store its blocks. If this is a comma-delimited
+ list of directories, then data will be stored in all named
+ directories, typically on different devices.
+ Directories that do not exist are ignored.
+ </description>
+ <final>true</final>
+ </property>
+
+ <property>
+ <name>dfs.namenode.secondary.http-address</name>
+ <value>MASTER_NODE:50090</value>
+ <description>The secondary namenode http server address and
+ port.</description>
+ <final>true</final>
+ </property>
+
</configuration>
diff -ruN a/mapred-site.xml b/mapred-site.xml
--- a/mapred-site.xml 1969-12-31 16:00:00.000000000 -0800
+++ b/mapred-site.xml 2014-03-30 13:52:18.101773941 -0700
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+<property>
+ <name>mapreduce.framework.name</name>
+ <value>yarn</value>
+ <description>The runtime framework for executing MapReduce jobs. Can be one of
+ local, classic or yarn.</description>
+</property>
+
+</configuration>
diff -ruN a/yarn-site.xml b/yarn-site.xml
--- a/yarn-site.xml 2014-03-30 10:25:24.420801062 -0700
+++ b/yarn-site.xml 2014-03-30 13:52:09.947889010 -0700
@@ -15,5 +15,41 @@
<configuration>

<!-- Site specific YARN configuration properties -->
+ <property>
+ <name>yarn.resourcemanager.hostname</name>
+ <value>MASTER_NODE</value>
+ <description>The hostname of the RM.</description>
+ <final>true</final>
+ </property>
+
+ <property>
+ <name>yarn.nodemanager.local-dirs</name>
+ <value>MAPRED_LOCAL_DIR</value>
+ <description>List of directories where the NodeManager stores localized
+ files.  Default: ${hadoop.tmp.dir}/nm-local-dir</description>
+ </property>
+
+<!-- yarn.nodemanager.log-dirs defaults to ${yarn.log.dir}/userlogs, where
+ yarn.log.dir is set by yarn-env.sh via the YARN_LOG_DIR environment
+ variable -->
+
+<!-- these are necessary for mapreduce to work with YARN -->
+ <property>
+ <name>yarn.nodemanager.aux-services</name>
+ <value>mapreduce_shuffle</value>
+ <description>The valid service name should only contain a-zA-Z0-9_ and can
+ not start with numbers. Default: none</description>
+ </property>
+
+<!--
+ <property>
+ <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
+ <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+ <description>Java class to handle the shuffle stage of
+ mapreduce.
+ Default: org.apache.hadoop.mapred.ShuffleHandler</description>
+ </property>
+-->
+

</configuration>

0 comments on commit 58fd302

Please sign in to comment.