From e836b625853a78da07fda970aa19fd36d3af8fe8 Mon Sep 17 00:00:00 2001 From: tcs Date: Fri, 25 Oct 2019 15:22:47 +0530 Subject: [PATCH] replacing hiplaunchkernel to hiplaunchkernelGGL and removing hiplaunchparm --- FAQ/FAQ_HIP.rst | 2 +- Inputs/hip_runtime_api.h | 2 +- Installation_Guide/HIP.rst | 6 +++--- Programming_Guides/CUDAAPIHIPTEXTURE.rst | 2 +- Programming_Guides/HIP-FAQ.rst | 2 +- Programming_Guides/HIP-GUIDE.rst | 19 +++++++++---------- Programming_Guides/HIP-Terms.rst | 2 +- Programming_Guides/HIP-porting-guide.rst | 12 ++++++------ Programming_Guides/HIP-terminology.rst | 2 +- Programming_Guides/Kernel_language.rst | 19 +++++++++---------- Programming_Guides/LanguageInto.rst | 4 ++-- Programming_Guides/Programming-Guides.rst | 8 ++++---- Programming_Guides/hip_port.rst | 4 ++-- Programming_Guides/hip_profiling.rst | 4 ++-- Programming_Guides/hipporting-driver-api.rst | 4 ++-- 15 files changed, 45 insertions(+), 47 deletions(-) diff --git a/FAQ/FAQ_HIP.rst b/FAQ/FAQ_HIP.rst index 008aadde..15bfceb6 100644 --- a/FAQ/FAQ_HIP.rst +++ b/FAQ/FAQ_HIP.rst @@ -14,7 +14,7 @@ HIP provides the following: * Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.) * Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.) * Events (hipEventRecord(), hipEventElapsedTime(), etc.) -* Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>) +* Kernel launching (hipLaunchKernelGGL is a standard C/C++ function that replaces <<< >>>) * HIP Module API to control when adn how code is loaded. 
* CUDA*style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim) * Cross*lane instructions including shfl, ballot, any, all diff --git a/Inputs/hip_runtime_api.h b/Inputs/hip_runtime_api.h index b6ae8872..d6a59667 100644 --- a/Inputs/hip_runtime_api.h +++ b/Inputs/hip_runtime_api.h @@ -328,7 +328,7 @@ hipError_t hipDeviceReset(void); * - Any device memory subsequently allocated from this host thread (using hipMalloc) will be * allocated on device. * - Any streams or events created from this host thread will be associated with device. - * - Any kernels launched from this host thread (using hipLaunchKernel) will be executed on device + * - Any kernels launched from this host thread (using hipLaunchKernelGGL) will be executed on device * (unless a specific stream is specified, in which case the device associated with that stream will * be used). * diff --git a/Installation_Guide/HIP.rst b/Installation_Guide/HIP.rst index 0baa8c99..8b7affc9 100644 --- a/Installation_Guide/HIP.rst +++ b/Installation_Guide/HIP.rst @@ -56,7 +56,7 @@ Simple Example ################ The HIP API includes functions such as hipMalloc, hipMemcpy, and hipFree. -Programmers familiar with CUDA will also be able to quickly learn and start coding with the HIP API. Compute kernels are launched with the "hipLaunchKernel" macro call. Here is simple example showing a snippet of HIP API code: +Programmers familiar with CUDA will also be able to quickly learn and start coding with the HIP API. Compute kernels are launched with the "hipLaunchKernelGGL" macro call. 
Here is simple example showing a snippet of HIP API code: :: hipMalloc(&A_d, Nbytes)); hipMalloc(&C_d, Nbytes)); @@ -65,7 +65,7 @@ Programmers familiar with CUDA will also be able to quickly learn and start codi const unsigned blocks = 512; const unsigned threadsPerBlock = 256; - hipLaunchKernel(vector_square, /* compute kernel*/ + hipLaunchKernelGGL(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ C_d, A_d, N); /* arguments to the compute kernel */ @@ -128,7 +128,7 @@ Tour of the HIP Directories * **Include:** * **hip_runtime_api.h** : Defines HIP runtime APIs and can be compiled with many standard Linux compilers (hcc, GCC, ICC, CLANG, etc), in either C or C++ mode. - * **hip_runtime.h** : Includes everything in hip_runtime_api.h PLUS hipLaunchKernel and syntax for writing device kernels and device functions. hip_runtime.h can only be compiled with hcc. + * **hip_runtime.h** : Includes everything in hip_runtime_api.h PLUS hipLaunchKernelGGL and syntax for writing device kernels and device functions. hip_runtime.h can only be compiled with hcc. * **hcc_detail/**** , ***nvcc_detail/**** : Implementation details for specific platforms. HIP applications should not include these files directly. 
* **hcc.h** : Includes interop APIs for HIP and HCC * **bin**: Tools and scripts to help with hip porting diff --git a/Programming_Guides/CUDAAPIHIPTEXTURE.rst b/Programming_Guides/CUDAAPIHIPTEXTURE.rst index 818afb44..7b13131a 100644 --- a/Programming_Guides/CUDAAPIHIPTEXTURE.rst +++ b/Programming_Guides/CUDAAPIHIPTEXTURE.rst @@ -160,7 +160,7 @@ CUDA Runtime API functions supported by HIP +--------------------------------+---------------------------+ | cudaGetParameterBufferV2 | | +--------------------------------+---------------------------+ -| cudaLaunchKernel | hipLaunchKernel | +| cudaLaunchKernel | hipLaunchKernelGGL | +--------------------------------+---------------------------+ | cudaSetDoubleForDevice | | +--------------------------------+---------------------------+ diff --git a/Programming_Guides/HIP-FAQ.rst b/Programming_Guides/HIP-FAQ.rst index 1e15d69e..0d58bc13 100644 --- a/Programming_Guides/HIP-FAQ.rst +++ b/Programming_Guides/HIP-FAQ.rst @@ -13,7 +13,7 @@ HIP provides the following: * Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.) * Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.) * Events (hipEventRecord(), hipEventElapsedTime(), etc.) - * Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>) + * Kernel launching (hipLaunchKernelGGL is a standard C/C++ function that replaces <<< >>>) * HIP Module API to control when adn how code is loaded. 
* CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim) * Cross-lane instructions including shfl, ballot, any, all diff --git a/Programming_Guides/HIP-GUIDE.rst b/Programming_Guides/HIP-GUIDE.rst index de551f93..f032101a 100644 --- a/Programming_Guides/HIP-GUIDE.rst +++ b/Programming_Guides/HIP-GUIDE.rst @@ -39,7 +39,7 @@ Supported __global__ functions are * Executed on the device * Called ("launched") from the host -HIP __global__ functions must have a void return type, and the first parameter to a HIP __global__ function must have the type hipLaunchParm.see :ref:`Kernel` +HIP __global__ functions must have a void return type. HIP lacks dynamic-parallelism support, so __global__ functions cannot be called from the device. @@ -61,8 +61,8 @@ Calling __global__ Functions __global__ functions are often referred to as kernels, and calling one is termed launching the kernel. These functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute. HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the Cuda <<< >>> syntax). In HIP, - * Kernels launch with the "hipLaunchKernel" function - * The first five parameters to hipLaunchKernel are the following: + * Kernels launch with the "hipLaunchKernelGGL" function + * The first five parameters to hipLaunchKernelGGL are the following: * symbol kernelName: the name of the kernel to launch. To support template kernels which contains "," use the HIP_KERNEL_NAME macro. The hipify tools insert this automatically. * dim3 gridDim: 3D-grid dimensions specifying the number of blocks to launch. * dim3 blockDim: 3D-block dimensions specifying the number of threads in each block. 
@@ -71,16 +71,16 @@ __global__ functions are often referred to as kernels, and calling one is termed :ref:`Synchronization-Functions`). * Kernel arguments follow these first five parameters :: - //Example pseudo code introducing hipLaunchKernel - __global__ MyKernel(hipLaunchParm lp, float *A, float *B, float *C, size_t N) + //Example pseudo code introducing hipLaunchKernelGGL + __global__ MyKernel(float *A, float *B, float *C, size_t N) { ... } //Replace MyKernel<<>> (a,b,c,n); - hipLaunchKernel(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream), a, b, c, n) + hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream), a, b, c, n) -The hipLaunchKernel macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernel, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernel parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. +The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. .. 
_Kernel: @@ -98,8 +98,7 @@ Kernel-Launch Example __global__ void - MyKernel (hipLaunchParm lp, /*lp parm for execution configuration */ - const float *a, const float *b, float *c, unsigned N) + MyKernel (const float *a, const float *b, float *c, unsigned N) { unsigned gid = hipThreadIdx_x; // <- coordinate index function if (gid < N) { @@ -111,7 +110,7 @@ Kernel-Launch Example float *a, *b, *c; // initialization not shown... unsigned N = 1000000; const unsigned blockSize = 256; - hipLaunchKernel(MyKernel, + hipLaunchKernelGGL(MyKernel, (N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); } diff --git a/Programming_Guides/HIP-Terms.rst b/Programming_Guides/HIP-Terms.rst index f091c82c..36a7bcf6 100644 --- a/Programming_Guides/HIP-Terms.rst +++ b/Programming_Guides/HIP-Terms.rst @@ -40,7 +40,7 @@ Table Comparing Syntax for Different Compute APIs | Host + Device Function | __host__ | __host__ | [[hc]] [[cpu]] | restrict(amp,cpu) | No equivalent | | | __device__ | __device__ | | | | +------------------------+-----------------------+-----------------------+---------------------------------------------------+----------------------------------------------------+---------------------------------+ -| Kernel Launch | <<< >>> | hipLaunchKernel | hc::parallel_for_each | concurrency::parallel_for_each | clEnqueueNDRangeKernel | +| Kernel Launch | <<< >>> |hipLaunchKernelGGL | hc::parallel_for_each | concurrency::parallel_for_each | clEnqueueNDRangeKernel | +------------------------+-----------------------+-----------------------+---------------------------------------------------+----------------------------------------------------+---------------------------------+ | Global Memory | __global__ | __global__ | Unnecessary / Implied | Unnecessary / Implied | __global__ | +------------------------+-----------------------+-----------------------+---------------------------------------------------+----------------------------------------------------+---------------------------------+ 
diff --git a/Programming_Guides/HIP-porting-guide.rst b/Programming_Guides/HIP-porting-guide.rst index 6b3feaf7..a6315f25 100644 --- a/Programming_Guides/HIP-porting-guide.rst +++ b/Programming_Guides/HIP-porting-guide.rst @@ -295,10 +295,10 @@ Makefiles can use the following syntax to conditionally provide a default HIP_PA HIP_PATH ?= $(shell hipconfig --path) -hipLaunchKernel +hipLaunchKernelGGL ~~~~~~~~~~~~~~~~ -hipLaunchKernel is a variadic macro which accepts as parameters the launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments. This sequence is then expanded into the appropriate kernel launch syntax depending on the platform. +hipLaunchKernelGGL is a variadic macro which accepts as parameters the launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments. This sequence is then expanded into the appropriate kernel launch syntax depending on the platform. While this can be a convenient single-line kernel launch syntax, the macro implementation can cause issues when nested inside other macros. 
For example, consider the following: :: @@ -310,7 +310,7 @@ While this can be a convenient single-line kernel launch syntax, the macro imple (command); /* The nested ( ) will cause compile error */\ } - MY_LAUNCH (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); + MY_LAUNCH (hipLaunchKernelGGL(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); Avoid nesting macro parameters inside parenthesis - here's an alternative that will work: @@ -322,7 +322,7 @@ Avoid nesting macro parameters inside parenthesis - here's an alternative that w command;\ } - MY_LAUNCH (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); + MY_LAUNCH (hipLaunchKernelGGL(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); Compiler Options ~~~~~~~~~~~~~~~~ @@ -441,7 +441,7 @@ Device Code: __constant__ int Value[LEN]; - __global__ void Get(hipLaunchParm lp, int *Ad) + __global__ void Get(int *Ad) { int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Ad[tid] = Value[tid]; @@ -461,7 +461,7 @@ Device Code: HIP_ASSERT(hipMalloc((void**)&Ad, SIZE)); HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice)); - hipLaunchKernel(Get, dim3(1,1,1), dim3(LEN,1,1), 0, 0, Ad); + hipLaunchKernelGGL(Get, dim3(1,1,1), dim3(LEN,1,1), 0, 0, Ad); HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost)); for(unsigned i=0;i>> | hipLaunchKernel |hc:: | parallel_for_each |clEnqueueND- | -| | | |parallel_for_each | |RangeKernel | +| | | GGL |parallel_for_each | |RangeKernel | +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ | | | | | | | diff --git a/Programming_Guides/Kernel_language.rst b/Programming_Guides/Kernel_language.rst index a9e45cfb..faf5b330 100644 --- a/Programming_Guides/Kernel_language.rst +++ 
b/Programming_Guides/Kernel_language.rst @@ -93,7 +93,7 @@ Supported ``__global__`` functions are * Executed on the device * Called ("launched") from the host -HIP ``__global__`` functions must have a void return type, and the first parameter to a HIP ``__global__`` function must have the type hipLaunchParm. See `Kernel-Launch Example `_ . +HIP ``__global__`` functions must have a void return type. See `Kernel-Launch Example `_ . HIP lacks dynamic-parallelism support, so ``__global__`` functions cannot be called from the device. @@ -118,8 +118,8 @@ Calling __global__ Functions ``__global__`` functions are often referred to as kernels, and calling one is termed launching the kernel. These functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute. HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the Cuda <<< >>> syntax). In HIP, -* Kernels launch with the "hipLaunchKernel" function -* The first five parameters to hipLaunchKernel are the following: +* Kernels launch with the "hipLaunchKernelGGL" function +* The first five parameters to hipLaunchKernelGGL are the following: * **symbol kernelName:** the name of the kernel to launch. To support template kernels which contains "," use the HIP_KERNEL_NAME macro. The hipify tools insert this automatically. * **dim3 gridDim:** 3D-grid dimensions specifying the number of blocks to launch. * **dim3 blockDim:** 3D-block dimensions specifying the number of threads in each block. 
@@ -130,18 +130,18 @@ Calling __global__ Functions :: - // Example pseudo code introducing hipLaunchKernel: - __global__ MyKernel(hipLaunchParm lp, float *A, float *B, float *C, size_t N) + // Example pseudo code introducing hipLaunchKernelGGL: + __global__ MyKernel(float *A, float *B, float *C, size_t N) { ... } // Replace MyKernel<<>> (a,b,c,n); - hipLaunchKernel(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream), a, b, c, n); + hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream), a, b, c, n); -The hipLaunchKernel macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernel, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernel parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See `Coordinate Built-Ins `_. +The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See `Coordinate Built-Ins `_. .. 
_Kernel-Launch-Example: @@ -159,8 +159,7 @@ Kernel-Launch Example __global__ void - MyKernel (hipLaunchParm lp, /*lp parm for execution configuration */ - const float *a, const float *b, float *c, unsigned N) + MyKernel (const float *a, const float *b, float *c, unsigned N) { unsigned gid = hipThreadIdx_x; // <- coordinate index function if (gid < N) { @@ -173,7 +172,7 @@ Kernel-Launch Example unsigned N = 1000000; const unsigned blockSize = 256; - hipLaunchKernel(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); + hipLaunchKernelGGL(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); } .. _Variable-Type-Qualifiers: diff --git a/Programming_Guides/LanguageInto.rst b/Programming_Guides/LanguageInto.rst index faf794d1..a3020cb2 100644 --- a/Programming_Guides/LanguageInto.rst +++ b/Programming_Guides/LanguageInto.rst @@ -152,7 +152,7 @@ Table Comparing Syntax for Different Compute APIs +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ |Kernel Launch | | | | concurrency:: | | | | <<< >>> | hipLaunchKernel |hc:: | parallel_for_each |clEnqueueND- | -| | | |parallel_for_each | |RangeKernel | +| | | GGL |parallel_for_each | |RangeKernel | +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ | | | | | | | @@ -193,4 +193,4 @@ Notes 1. For HC and C++AMP, assume a captured _tiled_ext_ named "t_ext" and captured _extent_ named "ext". These languages use captured variables to pass information to the kernel rather than using special built-in functions so the exact variable name may vary. 2. The indexing functions (starting with ``thread-index``) show the terminology for a 1D grid. Some APIs use reverse order of xyz / 012 indexing for 3D grids. 3. 
HC allows tile dimensions to be specified at runtime while C++AMP requires that tile dimensions be specified at compile-time. Thus hc syntax for tile dims is ``t_ext.tile_dim[0]`` while C++AMP is t_ext.tile_dim0. -4. **From ROCm version 2.0 onwards C++AMP is no longer available in HCC.** \ No newline at end of file +4. **From ROCm version 2.0 onwards C++AMP is no longer available in HCC.** diff --git a/Programming_Guides/Programming-Guides.rst b/Programming_Guides/Programming-Guides.rst index ce44a522..b6ec3100 100644 --- a/Programming_Guides/Programming-Guides.rst +++ b/Programming_Guides/Programming-Guides.rst @@ -140,7 +140,7 @@ Table Comparing Syntax for Different Compute APIs +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ |Kernel Launch | | | | concurrency:: | | | | <<< >>> | hipLaunchKernel |hc:: | parallel_for_each |clEnqueueND- | -| | | |parallel_for_each | |RangeKernel | +| | | GGL |parallel_for_each | |RangeKernel | +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ +-----------------------+---------------+-----------------+---------------------+------------------------+---------------------------+ | | | | | | | @@ -393,7 +393,7 @@ Supported CUDA APIs: **Simple Example** The HIP API includes functions such as hipMalloc, hipMemcpy, and hipFree. -Programmers familiar with CUDA will also be able to quickly learn and start coding with the HIP API. Compute kernels are launched with the "hipLaunchKernel" macro call. Here is simple example showing a snippet of HIP API code: +Programmers familiar with CUDA will also be able to quickly learn and start coding with the HIP API. Compute kernels are launched with the "hipLaunchKernelGGL" macro call. 
Here is simple example showing a snippet of HIP API code: hipMalloc(&A_d, Nbytes)); hipMalloc(&C_d, Nbytes)); @@ -402,7 +402,7 @@ hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice); const unsigned blocks = 512; const unsigned threadsPerBlock = 256; -hipLaunchKernel(vector_square, /* compute kernel*/ +hipLaunchKernelGGL(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ C_d, A_d, N); /* arguments to the compute kernel */ @@ -460,7 +460,7 @@ The GitHub repository `HIP-Examples >> syntax, hipLaunchKernel, and hipLaunchKernelGGL. The latter two are macros which expand to CUDA <<<>>> syntax. +hip-clang supports kernel launching by CUDA <<<>>> syntax, hipLaunchKernelGGL. The latter is a macro which expands to CUDA <<<>>> syntax. In host code, hip-clang emits a stub function with the same name and arguments as the kernel. In the body of this function, hipSetupArgument is called for each kernel argument, then hipLaunchByPtr is called with a function pointer to the stub function. 
@@ -261,7 +261,7 @@ HIP supports texture driver APIs however texture reference should be declared in #include "hip/hip_runtime.h" extern texture tex; - __global__ void tex2dKernel(hipLaunchParm lp, float* outputData, + __global__ void tex2dKernel(float* outputData, int width, int height) { diff --git a/Programming_Guides/hip_profiling.rst b/Programming_Guides/hip_profiling.rst index 1c00c075..8c167d3e 100644 --- a/Programming_Guides/hip_profiling.rst +++ b/Programming_Guides/hip_profiling.rst @@ -292,7 +292,7 @@ Here's a specific example showing the output of the square program running on HIP <> info: launch 'vector_square' kernel - 1.5 hipLaunchKernel 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0 + 1.5 hipLaunchKernelGGL 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0 info: copy Device2Host <> @@ -302,7 +302,7 @@ Here's a specific example showing the output of the square program running on HIP HIP_TRACE_API supports multiple levels of debug information: * 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. - * 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernel, hipLaunchModuleKernel + * 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernelGGL, hipModuleLaunchKernel * 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. * 0x8 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. 
diff --git a/Programming_Guides/hipporting-driver-api.rst b/Programming_Guides/hipporting-driver-api.rst index c6b104da..9de8c622 100644 --- a/Programming_Guides/hipporting-driver-api.rst +++ b/Programming_Guides/hipporting-driver-api.rst @@ -101,7 +101,7 @@ hip-clang generates initialization and termination functions for each translati Kernel Launching ***************** -hip-clang supports kernel launching by CUDA <<<>>> syntax, hipLaunchKernel, and hipLaunchKernelGGL. The latter two are macros which expand to CUDA <<<>>> syntax. +hip-clang supports kernel launching by CUDA <<<>>> syntax, hipLaunchKernelGGL. The latter is a macro which expands to CUDA <<<>>> syntax. In host code, hip-clang emits a stub function with the same name and arguments as the kernel. In the body of this function, hipSetupArgument is called for each kernel argument, then hipLaunchByPtr is called with a function pointer to the stub function. @@ -271,7 +271,7 @@ HIP supports texture driver APIs however texture reference should be declared in #include "hip/hip_runtime.h" extern texture tex; - __global__ void tex2dKernel(hipLaunchParm lp, float* outputData, + __global__ void tex2dKernel(float* outputData, int width, int height) {