From 611e249fd7565b4a25be38a922953cfbbe0f291a Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Wed, 8 Apr 2020 10:24:35 -0700 Subject: [PATCH] Revert "Add cleanup_text.sh script, and remove non-ASCII characters from RST code blocks" --- Deep_learning/GCN-asm-tutorial.rst | 6 +- GCN_ISA_Manuals/GCN-ISA-Manuals.rst | 6 +- Installation_Guide/FAQ-on-Installation.rst | 6 +- Installation_Guide/HCC-Compiler.rst | 2 +- Installation_Guide/Installation-Guide.rst | 4 +- .../Quick Start Installation Guide.rst | 4 +- Programming_Guides/HIP-GUIDE.rst | 2 +- Programming_Guides/Kernel_language.rst | 2 +- Programming_Guides/Opencl-optimization.rst | 6 +- .../Opencl-programming-guide.rst | 14 +- ROCm_API_References/HCC-API.rst | 4 +- ROCm_Compiler_SDK/ROCm-Native-ISA.rst | 18 +- ROCm_Tools/ROCm-Tools.rst | 154 +++++++------- ROCm_Tools/tutorial.rst | 2 +- .../ROCm-Virtualization-&-Containers.rst | 2 +- Tutorial/GCN-asm-tutorial.rst | 6 +- cleanup_text.sh | 196 ------------------ 17 files changed, 119 insertions(+), 315 deletions(-) delete mode 100755 cleanup_text.sh diff --git a/Deep_learning/GCN-asm-tutorial.rst b/Deep_learning/GCN-asm-tutorial.rst index bd2e325a..22b43deb 100644 --- a/Deep_learning/GCN-asm-tutorial.rst +++ b/Deep_learning/GCN-asm-tutorial.rst @@ -71,9 +71,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to "in" buffer - Kernarg(index); // Append base pointer to "index" buffer - Kernarg(out); // Append base pointer to "out" buffer + Kernarg(in); // Add base pointer to “in” buffer + Kernarg(index); // Append base pointer to “index” buffer + Kernarg(out); // Append base pointer to “out” buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as diff --git a/GCN_ISA_Manuals/GCN-ISA-Manuals.rst b/GCN_ISA_Manuals/GCN-ISA-Manuals.rst index 22a41026..55aedd3d 100644 --- a/GCN_ISA_Manuals/GCN-ISA-Manuals.rst +++ b/GCN_ISA_Manuals/GCN-ISA-Manuals.rst @@ -90,9 +90,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to "in" buffer - Kernarg(index); // Append base pointer to "index" buffer - Kernarg(out); // Append base pointer to "out" buffer + Kernarg(in); // Add base pointer to “in” buffer + Kernarg(index); // Append base pointer to “index” buffer + Kernarg(out); // Append base pointer to “out” buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as diff --git a/Installation_Guide/FAQ-on-Installation.rst b/Installation_Guide/FAQ-on-Installation.rst index 0f68dc18..c9055017 100644 --- a/Installation_Guide/FAQ-on-Installation.rst +++ b/Installation_Guide/FAQ-on-Installation.rst @@ -89,7 +89,7 @@ This problem can occur on Fedora installation if several previous kernels are cu This is not an issue with the YUM repository; it is caused by the size of the /boot filesystem and the size of the kernels already installed on it. This issue can be fixed by uninstalling previous versions of the rocm Linux kernel: :: sudo dnf remove rocm - rpm -qa | grep kfd | xargs sudo rpm -e + rpm -qa | grep kfd | xargs sudo rpm –e sudo dnf install rocm Installing from an archived repository @@ -104,7 +104,7 @@ Here is an Example: cd /temp && wget http://repo.radeon.com/rocm/archive/apt_1.6.3.tar.bz2 tar -xvf apt_1.6.3.tar.bz2 - sudo echo "deb [amd64] file://temp/apt_1.6.3 xenial main" > /etc/apt/sources.lists.d/rocm.local.list + sudo echo “deb [amd64] file://temp/apt_1.6.3 xenial main” > /etc/apt/sources.lists.d/rocm.local.list sudo apt-get update && sudo apt-get install rocm Users should make sure that no other list files contain another rocm repo configuration. @@ -119,7 +119,7 @@ Add a /etc/yum.d/rocm.local.repo file with the following contents: :: enabled=1 gpgcheck=0 cd /temp && wget http://repo.radeon.com/rocm/archive/yum_1.6.3.tar.bz2 - tar -xvf yum_1.6.3.tar.bz2 + tar –xvf yum_1.6.3.tar.bz2 Then execute: :: diff --git a/Installation_Guide/HCC-Compiler.rst b/Installation_Guide/HCC-Compiler.rst index 70336a10..8a350a6c 100644 --- a/Installation_Guide/HCC-Compiler.rst +++ b/Installation_Guide/HCC-Compiler.rst @@ -173,4 +173,4 @@ For applications compiled using hcc, ThinLTO could significantly improve link-ti ThinLTO Phase 2 - Under development ************************************** -This ThinLTO implementation which will use llvm-lto LLVM tool to replace clamp-device bash script. It adds an optllc option into ThinLTOGenerator, which will perform in-program opt and codegen in parallel. +This ThinLTO implementation which will use llvm-lto LLVM tool to replace clamp-device bash script. It adds an optllc option into ThinLTOGenerator, which will perform in-program opt and codegen in parallel. \ No newline at end of file diff --git a/Installation_Guide/Installation-Guide.rst b/Installation_Guide/Installation-Guide.rst index 33162b06..94dd0bc8 100644 --- a/Installation_Guide/Installation-Guide.rst +++ b/Installation_Guide/Installation-Guide.rst @@ -227,7 +227,7 @@ To install ROCm on your system, follow the instructions below: enabled=1 gpgcheck=0 -Note: The URL of the repository must point to the location of the repositories' repodata database. +Note: The URL of the repository must point to the location of the repositories’ repodata database. 3. Install ROCm components using the following command: @@ -352,7 +352,7 @@ The following section tells you how to perform an install and uninstall ROCm on :: - sudo zypper clean -all + sudo zypper clean –all sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm sudo zypper ref zypper install rocm-dkms diff --git a/Installation_Guide/Quick Start Installation Guide.rst b/Installation_Guide/Quick Start Installation Guide.rst index 70b9d8f5..de5109eb 100644 --- a/Installation_Guide/Quick Start Installation Guide.rst +++ b/Installation_Guide/Quick Start Installation Guide.rst @@ -236,7 +236,7 @@ To install ROCm on your system, follow the instructions below: enabled=1 gpgcheck=0 -Note: The URL of the repository must point to the location of the repositories' repodata database. +Note: The URL of the repository must point to the location of the repositories’ repodata database. 3. Install ROCm components using the following command: @@ -363,7 +363,7 @@ The following section tells you how to perform an install and uninstall ROCm on :: - sudo zypper clean -all + sudo zypper clean –all sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm sudo zypper ref zypper install rocm-dkms diff --git a/Programming_Guides/HIP-GUIDE.rst b/Programming_Guides/HIP-GUIDE.rst index 50c06f11..f032101a 100644 --- a/Programming_Guides/HIP-GUIDE.rst +++ b/Programming_Guides/HIP-GUIDE.rst @@ -80,7 +80,7 @@ __global__ functions are often referred to as kernels, and calling one is termed hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream), a, b, c, n) -The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that's currently executing. +The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. .. _Kernel: diff --git a/Programming_Guides/Kernel_language.rst b/Programming_Guides/Kernel_language.rst index cbc0ee69..faf5b330 100644 --- a/Programming_Guides/Kernel_language.rst +++ b/Programming_Guides/Kernel_language.rst @@ -141,7 +141,7 @@ Calling __global__ Functions hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream), a, b, c, n); -The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that's currently executing. See `Coordinate Built-Ins `_. +The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See `Coordinate Built-Ins `_. .. _Kernel-Launch-Example: diff --git a/Programming_Guides/Opencl-optimization.rst b/Programming_Guides/Opencl-optimization.rst index 43a69307..1fa69e53 100644 --- a/Programming_Guides/Opencl-optimization.rst +++ b/Programming_Guides/Opencl-optimization.rst @@ -1543,7 +1543,7 @@ In the second block of code, the ``?:`` operator executes in the vector units, s a[idx] = d[idx]; } -This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what aEUR~d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. +This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what ‘d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. *If the algorithm allows changing the work-group size, it is possible to get better performance by using larger work-groups (more work-items in each work-group) because the workgroup creation overhead is reduced. On the other hand, the OpenCL CPU runtime uses a task-stealing algorithm at the work-group level, so when the kernel execution time differs because it contains conditions and/or loops of varying number of iterations, it might be better to increase the number of work-groups. This gives the runtime more flexibility in scheduling work-groups to idle CPU cores. Experimentation might be needed to reach optimal work-group size. *Since the AMD OpenCL runtime supports only in-order queuing, using clFinish() on a queue and queuing a blocking command gives the same result. The latter saves the overhead of another API command. @@ -2230,7 +2230,7 @@ The following are sample kernels with different coalescing patterns. { int gid = get_global_id(0); if((gid & 0x1) == 0) { - gid = (gid & (Eoe63)) +62 - get_local_id(0); + gid = (gid & (Ëœ63)) +62 - get_local_id(0); } output[gid] = input[gid]; return; @@ -3242,7 +3242,7 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n } - This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what aEUR~d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. + This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what ‘d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. * If the algorithm allows changing the work-group size, it is possible to get better performance by using larger work-groups (more work-items in each work-group) because the workgroup creation overhead is reduced. On the other hand, the OpenCL CPU runtime uses a task-stealing algorithm at the work-group level, so when the kernel execution time differs because it contains conditions and/or loops of varying number of iterations, it might be better to increase the number of work-groups. This gives the runtime more flexibility in scheduling work-groups to idle CPU cores. Experimentation might be needed to reach optimal work-group size. * Since the AMD OpenCL runtime supports only in-order queuing, using ``clFinish`` () on a queue and queuing a blocking command gives the same result. The latter saves the overhead of another API command. diff --git a/Programming_Guides/Opencl-programming-guide.rst b/Programming_Guides/Opencl-programming-guide.rst index fc721632..63b27374 100644 --- a/Programming_Guides/Opencl-programming-guide.rst +++ b/Programming_Guides/Opencl-programming-guide.rst @@ -1469,7 +1469,7 @@ Now, these headers can be passed as embedded headers along with the program obje :: cl_program input_headers[2] = { foo_pg, myinc_pg }; - char * input_header_names[2] = { "foo.h", "mydir/myinc.h" }; + char * input_header_names[2] = { “foo.h”, “mydir/myinc.h” }; clCompileProgram(program_A, 0, NULL, // num_devices & device_list NULL, // compile_options @@ -1660,7 +1660,7 @@ A sample kernel definition is shown below. kernel void sample_kernel( global const uchar *normalPtr, global uchar *svmPtr) { - ... + … } To create a kernel object for the above kernel, you must pass the program object corresponding to the kernel to the clCreateKernel function. Assuming that the program object containing the above kernel function has been created and built as program, a kernel object for the above kernel would be created as follows: @@ -2139,7 +2139,7 @@ OpenCL Language types. MyFunc () { tempClass = new(Test); - ... // Some OpenCL startup code - create context, queue, etc. + ... // Some OpenCL startup code – create context, queue, etc. cl_mem classObj = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Test), &tempClass, event); clEnqueueMapBuffer(...,classObj,...); tempClass.setX(10); @@ -2393,9 +2393,9 @@ Generic example In OpenCL 1.2, the developer needed to write three functions for a pointer p that can reference the local, private, or global address space:: - void fooL (local int *p) { ... } - void fooP (private int *p) { ... } - void fooG (global int *p) { ... } + void fooL (local int *p) { … } + void fooP (private int *p) { … } + void fooG (global int *p) { … } @@ -2967,7 +2967,7 @@ There are special directives for the OpenCL compiler to enable or disable availa #pragma OPENCL EXTENSION all: -The is described in Section A.1, "Extension Name +The is described in Section A.1, “Extension Name Convention.”. The second form allows to address all extensions at once. The token can be either: * **enable** - the extension is enabled if it is supported, or the error is reported if the specified extension is not supported or token “all” is used. diff --git a/ROCm_API_References/HCC-API.rst b/ROCm_API_References/HCC-API.rst index e9024358..13abbcab 100644 --- a/ROCm_API_References/HCC-API.rst +++ b/ROCm_API_References/HCC-API.rst @@ -46,7 +46,7 @@ For example: :: - `` hcchcc-config -cxxflags -ldflagsfoo.cpp -o foo `` + `` hcchcc-config –cxxflags –ldflagsfoo.cpp -o foo `` HCC built-in macros ******************** @@ -143,4 +143,4 @@ HC supports capturing memory pointer by a GPU kernel. ``` // allocate GPU memory through the HSA API int* gpu_pointer; hsa_memory_allocate(..., &gpu_pointer); ... parallel_for_each(ext, [=](index i) [[hc]] { gpu_pointer[i[0]]++; } -``` For HSA APUs that supports system wide shared virtual memory, a GPU kernel can directly access system memory allocated by the host: ``` int* cpu_memory = (int*) malloc(...); ... parallel_for_each(ext, [=](index i) [[hc]] { cpu_memory[i[0]]++; }); ``` +``` For HSA APUs that supports system wide shared virtual memory, a GPU kernel can directly access system memory allocated by the host: ``` int* cpu_memory = (int*) malloc(...); ... parallel_for_each(ext, [=](index i) [[hc]] { cpu_memory[i[0]]++; }); ``` \ No newline at end of file diff --git a/ROCm_Compiler_SDK/ROCm-Native-ISA.rst b/ROCm_Compiler_SDK/ROCm-Native-ISA.rst index f33adcc6..e224d4d3 100644 --- a/ROCm_Compiler_SDK/ROCm-Native-ISA.rst +++ b/ROCm_Compiler_SDK/ROCm-Native-ISA.rst @@ -2265,7 +2265,7 @@ DS ds_min_rtn_f64 v[8:9], v2, v[4:5] -For full list of supported instructions, refer to "LDS/GDS instructions" in ISA Manual. +For full list of supported instructions, refer to “LDS/GDS instructions” in ISA Manual. .. _FLAT: @@ -2280,7 +2280,7 @@ FLAT flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc -For full list of supported instructions, refer to "FLAT instructions" in ISA Manual. +For full list of supported instructions, refer to “FLAT instructions” in ISA Manual. .. _MUBUF: @@ -2295,7 +2295,7 @@ MUBUF buffer_wbinvl1 buffer_atomic_inc v1, v2, s[8:11], s4 idxen offset:4 slc -For full list of supported instructions, refer to "MUBUF Instructions" in ISA Manual. +For full list of supported instructions, refer to “MUBUF Instructions” in ISA Manual. .. _SMRD/SMEM: @@ -2309,7 +2309,7 @@ SMRD/SMEM s_dcache_inv_vol s_memtime s[4:5] -For full list of supported instructions, refer to "Scalar Memory Operations" in ISA Manual. +For full list of supported instructions, refer to “Scalar Memory Operations” in ISA Manual. .. _SOP1: @@ -2325,7 +2325,7 @@ SOP1 s_swappc_b64 s[2:3], s[4:5] s_cbranch_join s[4:5] -For full list of supported instructions, refer to "SOP1 Instructions" in ISA Manual. +For full list of supported instructions, refer to “SOP1 Instructions” in ISA Manual. .. _SOP2: @@ -2343,7 +2343,7 @@ SOP2 s_bfe_i64 s[2:3], s[4:5], s6 s_cbranch_g_fork s[4:5], s[6:7] -For full list of supported instructions, refer to "SOP2 Instructions" in ISA Manual. +For full list of supported instructions, refer to “SOP2 Instructions” in ISA Manual. .. _SOPC: @@ -2356,7 +2356,7 @@ SOPC s_bitcmp0_b64 s[2:3], s4 s_setvskip s3, s5 -For full list of supported instructions, refer to "SOPC Instructions" in ISA Manual. +For full list of supported instructions, refer to “SOPC Instructions” in ISA Manual. .. _SOPP: @@ -2376,7 +2376,7 @@ SOPP s_sendmsg sendmsg(MSG_INTERRUPT) s_trap 1 -For full list of supported instructions, refer to "SOPP Instructions" in ISA Manual. +For full list of supported instructions, refer to “SOPP Instructions” in ISA Manual. Unless otherwise mentioned, little verification is performed on the operands of SOPP Instructions, so it is up to the programmer to be familiar with the range or acceptable values. @@ -2434,7 +2434,7 @@ VOP_SDWA examples v_fract_f32 v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 v_cmpx_le_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 -For full list of supported instructions, refer to "Vector ALU instructions". +For full list of supported instructions, refer to “Vector ALU instructions”. .. _Code Object V2 Predefined Symbols (-mattr=-code-object-v3): diff --git a/ROCm_Tools/ROCm-Tools.rst b/ROCm_Tools/ROCm-Tools.rst index 58651749..22f06d3f 100644 --- a/ROCm_Tools/ROCm-Tools.rst +++ b/ROCm_Tools/ROCm-Tools.rst @@ -414,17 +414,17 @@ submitted kernels. An example of input file: gpu: 0 1 2 3 kernel: simple Pass1 simpleConvolutionPass2 -An example of profiling command line for 'MatrixTranspose' application +An example of profiling command line for ‘MatrixTranspose’ application :: $ rocprof -i input.txt MatrixTranspose - RPL: on '191018_011134' from '/..../rocprofiler_pkg' in '/..../MatrixTranspose' + RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' RPL: profiling '"./MatrixTranspose"' RPL: input file 'input.txt' RPL: output dir '/tmp/rpl_data_191018_011134_9695' RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' - ROCProfiler: rc-file '/..../rpl_rc.xml' + ROCProfiler: rc-file '/…./rpl_rc.xml' ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" gpu_index = kernel = @@ -436,7 +436,7 @@ An example of profiling command line for 'MatrixTranspose' application PASSED! ROCProfiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 - RPL: '/..../MatrixTranspose/input.csv' is generated + RPL: '/…./MatrixTranspose/input.csv' is generated **2.1.1. Counters and metrics** @@ -456,8 +456,8 @@ Metrics XML File Example: :: - - + + . . . @@ -469,7 +469,7 @@ Metrics XML File Example: @@ -484,8 +484,8 @@ expressions. Examples: :: $ rocprof --list-basic - RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/..../MatrixTranspose' - ROCProfiler: rc-file '/..../rpl_rc.xml' + RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' + ROCProfiler: rc-file '/…./rpl_rc.xml' Basic HW counters: gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks block GRBM has 2 counters @@ -541,12 +541,12 @@ metric groups: :: $ rocprof -i input.txt ./MatrixTranspose - RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/..../MatrixTranspose' + RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' RPL: profiling './MatrixTranspose' RPL: input file 'input.txt' RPL: output dir '/tmp/rpl_data_191018_032645_12106' RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' - ROCProfiler: rc-file '/..../rpl_rc.xml' + ROCProfiler: rc-file '/…./rpl_rc.xml' ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" gpu_index = kernel = @@ -845,34 +845,34 @@ Counters: :: - o GRBM_COUNT : Tie High - Count Number of Clocks - o GRBM_GUI_ACTIVE : The GUI is Active - o SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) - o SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) - o SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) - o SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) - o SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) - o SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) - o SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) - o SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) - o SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) - o SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) - o SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) - o SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) - o SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) - o SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) - o SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) - o TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. - o TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. - o TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. - o TCC_HIT[0-15] : Number of cache hits. - o TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. - o TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. - o TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. - o TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. - o TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) - o TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests - o TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. + • GRBM_COUNT : Tie High - Count Number of Clocks + • GRBM_GUI_ACTIVE : The GUI is Active + • SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) + • SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) + • SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) + • SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) + • SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) + • SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) + • SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) + • SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) + • SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) + • SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) + • SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) + • SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) + • SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) + • SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) + • SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) + • TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. + • TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. + • TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. + • TCC_HIT[0-15] : Number of cache hits. + • TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + • TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. + • TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + • TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. + • TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) + • TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests + • TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. The following derived metrics have been defined and the profiler metrics XML specification can be found at: @@ -882,44 +882,44 @@ Metrics: :: - o TA_BUSY_avr : TA block is busy. Average over TA instances. - o TA_BUSY_max : TA block is busy. Max over TA instances. - o TA_BUSY_min : TA block is busy. Min over TA instances. - o TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. - o TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. - o TCC_HIT_sum : Number of cache hits. Sum over TCC instances. - o TCC_MISS_sum : Number of cache misses. Sum over TCC instances. - o TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. - o TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. - o TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. - o TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. - o TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. - o TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. - o FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - o WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - o GPUBusy : The percentage of time GPU was busy. - o Wavefronts : Total wavefronts. - o VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). - o SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). - o VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. - o SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). - o VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. - o FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. - o LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. - o FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). - o GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). - o VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). - o VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). - o SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). - o Mem32Bwrites : - o FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - o WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - o L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). - o MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). - o MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). - o WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). - o ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). - o LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + • TA_BUSY_avr : TA block is busy. Average over TA instances. + • TA_BUSY_max : TA block is busy. Max over TA instances. + • TA_BUSY_min : TA block is busy. Min over TA instances. + • TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. + • TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. + • TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + • TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + • TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. + • TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. + • TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. + • TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. + • TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. + • TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. + • FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + • WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + • GPUBusy : The percentage of time GPU was busy. + • Wavefronts : Total wavefronts. + • VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). + • SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). + • VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. + • SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + • VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. + • FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. + • LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. + • FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). + • GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + • VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). + • VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + • SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + • Mem32Bwrites : + • FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + • WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + • L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). + • MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + • MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). + • WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). + • ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + • LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). ROC Profiler diff --git a/ROCm_Tools/tutorial.rst b/ROCm_Tools/tutorial.rst index 2de1ca66..15c2e053 100644 --- a/ROCm_Tools/tutorial.rst +++ b/ROCm_Tools/tutorial.rst @@ -337,7 +337,7 @@ ROCm-gdb helps developers to view information about kernels that have been launc (ROCm-gdb) set rocm trace mytrace.csv (ROCm-gdb) set rocm trace on -You can now execute and debug the application within ROCm-gdb. Anytime during the application's execution you can view my_trace.csv to see the kernels have been dispatched. A sample trace for an application that dispatches a vector add kernel followed by a matrix multiplication kernel in a loop is shown below. +You can now execute and debug the application within ROCm-gdb. Anytime during the application’s execution you can view my_trace.csv to see the kernels have been dispatched. A sample trace for an application that dispatches a vector add kernel followed by a matrix multiplication kernel in a loop is shown below. &__OpenCL_matrixMul_kernel ====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== index queue_id packet_id kernel_name header setup workgroup_size reserved0 grid_size private_segment_size diff --git a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst index f5211c1e..282c37ea 100644 --- a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst +++ b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst @@ -11,7 +11,7 @@ The following KVM-based instructions assume a headless host with an input/output :: - cat /proc/cpuinfo | grep -E "svm|vxm" + cat /proc/cpuinfo | grep -E “svm|vxm” Ubuntu 16.04 **************************** diff --git a/Tutorial/GCN-asm-tutorial.rst b/Tutorial/GCN-asm-tutorial.rst index 5307ccff..75ff5d2b 100644 --- a/Tutorial/GCN-asm-tutorial.rst +++ b/Tutorial/GCN-asm-tutorial.rst @@ -71,9 +71,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to "in" buffer - Kernarg(index); // Append base pointer to "index" buffer - Kernarg(out); // Append base pointer to "out" buffer + Kernarg(in); // Add base pointer to “in” buffer + Kernarg(index); // Append base pointer to “index” buffer + Kernarg(out); // Append base pointer to “out” buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as diff --git a/cleanup_text.sh b/cleanup_text.sh deleted file mode 100755 index 7b65dc9a..00000000 --- a/cleanup_text.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/bin/bash - -# Script to clean up RST and other text files -# By Lee Killough - -export PATH=/usr/bin:/bin - -files= -rstcode=0 -ascii=0 -trailing=0 -trailing_after=0 - -main() -{ - parse_args "$@" - check_git - cleanup_text -} - -cleanup_text() -{ - # iconv command to translate UTF8 to ASCII - iconv="/usr/bin/iconv -s -f utf-8 -t ascii//TRANSLIT" - - set -ex - - git ls-files -z --exclude-standard "$files" | while read -rd '' file; do - # Operate only on regular files of MIME type text/* - if [[ -f $file && "$(file -b --mime-type "$file")" == text/* ]]; then - # Add missing newline to end of file - sed -i -e '$a\' "$file" - - # Remove trailing whitespace at end of lines - if [[ $trailing -ne 0 ]]; then - sed -i -e 's/[[:space:]]*$//' "$file" - elif [[ $trailing_after -ne 0 ]]; then - perl -pi -e 's/\S\K\s+$/\n/' "$file" - fi - - # Temporary file - temp=$(mktemp) - - # Replace non-ASCII text and/or RST code line with ASCII equivalents - if [[ $ascii -ne 0 ]]; then - $iconv "$file" > "$temp" - elif [[ $rstcode -ne 0 && $file == *.rst ]]; then - { set +x; } 2>/dev/null - echo perl -e '$(rstcode_perl)' "\"$iconv\" \"$file\" > \"$temp\"" >&2 - perl -e "$(rstcode_perl)" "$iconv" "$file" > "$temp" - set -x - fi - - # Preserve permissions and add file to Git if updated - chmod --reference="$file" "$temp" - mv -f "$temp" "$file" - git add -u "$file" - fi - echo "" >&2 - done - - { set +x; } 2>/dev/null - git status - echo " All of the selected files in the repository have been cleaned up." -} - -check_git() -{ - if ! git diff-index --quiet HEAD -- ; then - cat >&2 <) -{ - my ($indent) = /^(\s*)/; - if($code) { - $code = 0 if /\S/ && length($indent) <= length($code_indent); - open ICONV, "|-", $iconv or die "$!"; - print ICONV; - close ICONV; - } else { - ($code, $code_indent) = (1, $indent) if /::(\s+\S+)?\s*$/; - print; - } -} -EOF -} - -# Help message -usage() -{ - cat< ] - [ --rstcode ] - [ --ascii ] - [ --trailing | --trailing-after ] - -Description: - - Replaces non-ASCII Unicode characters with their ASCII equivalents in - selected text files, or in the code sections of reStructuredText (RST) - files. - - Adds missing newlines at the ends of selected text files. - - Optionally removes trailing whitespace at the ends of lines in selected - text files. - - Code sections of RST files are critically important, because they are - often copied-and-pasted to a user's terminal, and if they contain - non-ASCII characters, then they will not work. - -Options: - - --files - - Clean up all text files matching wildcard or path, - e.g.: - - --files "*.md" - --files "*.rst" - --files "*" - --files README.md - - (Wildcard may need to be quoted, to prevent shell - wildcard expansion.) - - --rstcode Clean up only the code sections of selected RST - files, or all RST files if --files is not specified. - - --ascii Replace non-ASCII UTF-8 characters in selected text - files with their ASCII equivalents. - - --trailing Remove trailing whitespace at the ends of lines in - selected files. This includes converting CR-LF to LF. - - --trailing-after Remove trailing whitespace at the ends of lines in - selected files, but only after non-space characters. - This prevents removing indentation from otherwise - blank lines. - -EOF - exit 1 -} - -main "$@"