[SYSTEMDS-3567] Update allocation logic in GPU
This patch updates the order of the steps for GPU allocation. We now
recycle cached pointers before freeing any pointers (inside or outside
of the cache). This patch also provides a method to clear the full GPU
cache, fixes bugs, and extends lineage tracing for missing DNN operators.
phaniarnab committed Dec 1, 2023
1 parent 7b53ca2 commit 8a38477
Showing 6 changed files with 77 additions and 44 deletions.
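
For orientation before the hunks, the following is a condensed sketch of the reordered allocation path in the GPU memory manager's malloc method (the third file in this diff). It is a simplified reading of the changes, not the actual code: statistics, logging, and locking are omitted, step 1 (outside the shown hunks) is untouched by this patch, and recycleFromLineageCache is a placeholder name for the lineage-cache recycling block.

    Pointer A = null;                  // step 1 (not shown) may already have set this
    Pointer tmpA = new Pointer();
    // Step 2: plain allocation while free GPU memory is still available
    if (A == null && allocator.canAllocate(size))
        A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
    // Step 3 (now before any free): recycle a cached pointer from the lineage cache
    if (A == null && !LineageCacheConfig.ReuseCacheType.isNone())
        A = recycleFromLineageCache(size); // placeholder for the recycling hunk below
    // Step 4: free one non-exact rmvar pointer, then retry the allocation
    if (A == null) {
        Pointer r = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
        if (r != null) {
            guardedCudaFree(r);
            A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
        }
    }
    // Step 5: eagerly clear all rmvar pointers, then retry the allocation
    if (A == null) {
        lazyCudaFreeMemoryManager.clearAll();
        if (allocator.canAllocate(size))
            A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
    }
    // Steps 6-9: free lineage-cache intermediates, evict GPU objects (first with,
    // then without a size restriction), and finally clear everything on fragmentation.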
@@ -23,6 +23,7 @@
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLScriptException;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;

@@ -44,7 +45,7 @@ public void processInstruction(ExecutionContext ec) {
//core execution
if ( opcode.equalsIgnoreCase("print") ) {
String outString = so.getLanguageSpecificStringValue();

// print to stdout only when suppress flag in DMLScript is not set.
// The flag will be set, for example, when SystemDS is invoked in fenced mode from Jaql.
if (!DMLScript.suppressPrint2Stdout())
@@ -23,6 +23,7 @@

import org.apache.commons.lang3.tuple.Pair;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -323,7 +324,7 @@ else if (opcode.equalsIgnoreCase("batch_norm2d") || opcode.equalsIgnoreCase("lst
CPOperand in6 = new CPOperand(parts[6]); // mode
CPOperand in7 = new CPOperand(parts[7]); // epsilon
CPOperand in8 = new CPOperand(parts[8]); // exponentialAverageFactor
CPOperand out = new CPOperand(parts[9]); // ret
CPOperand out = new CPOperand(parts[9], Types.ValueType.FP64, Types.DataType.MATRIX); // ret
CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
@@ -902,10 +903,14 @@ public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
inputs.add(_input6);
inputs.add(_input7);
inputs.add(_input8);
inputs.addAll(_input_shape);
inputs.addAll(_filter_shape);
inputs.addAll(_stride);
inputs.addAll(_padding);
if (_input_shape != null && !_input_shape.isEmpty())
inputs.addAll(_input_shape);
if (_filter_shape != null && !_filter_shape.isEmpty())
inputs.addAll(_filter_shape);
if (_stride != null && !_stride.isEmpty())
inputs.addAll(_stride);
if (_padding != null && !_padding.isEmpty())
inputs.addAll(_padding);
return Pair.of(_output.getName(),
new LineageItem(getOpcode(), LineageItemUtils.getLineage(ec, inputs.toArray(new CPOperand[0]))));
}
@@ -261,35 +261,13 @@ public Pointer malloc(String opcode, long size, boolean initialize) {

Pointer tmpA = (A == null) ? new Pointer() : null;
// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
// Step 3 has potential to create holes as well as limit future reuse, hence perform this step before step 3.
// Step 4 has potential to create holes as well as limit future reuse, hence perform this step before step 4.
if(A == null && allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
}

// Step 3: Try reusing non-exact match entry of rmvarGPUPointers
if(A == null) {
A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
if(A != null) {
guardedCudaFree(A);
A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
if(A == null)
LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
}
}

// Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
// Evictions of matrix blocks are expensive (as they might lead them to be written to disk in case of smaller CPU budget)
// than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred over eviction (step 6, 7, 8).
if(A == null) {
lazyCudaFreeMemoryManager.clearAll();
if(allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
}
}

// Step 5.1: Recycle, delete or evict gpu intermediates from lineage cache

// Step 3: Recycle gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
// Recycle a cached pointer if exactly matches the required size
@@ -316,8 +294,30 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
}

// Step 5.2: Use a non-exact sized pointer

// Step 4: Try reusing non-exact match entry of rmvarGPUPointers
if(A == null) {
A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
if(A != null) {
guardedCudaFree(A);
A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
if(A == null)
LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
}
}

// Step 5: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
// Evictions of matrix blocks are more expensive than cuda free/malloc/memset (as the blocks might
// have to be written to disk when the CPU memory budget is small). So, rmvar-ing all blocks (step 5)
// is preferred over eviction (steps 7, 8, 9).
if(A == null) {
lazyCudaFreeMemoryManager.clearAll();
if(allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
}
}

// Step 6: Free gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long freedSize = 0;
@@ -340,7 +340,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}
if (freedSize > size)
if (freedSize >= size)
A = cudaMallocNoWarn(tmpA, size, "recycle non-exact match of lineage cache");
// Else, deallocate another free pointer. We are calling pollFistFreeNotExact with
// the same size (not with freedSize-size) to reduce the potential for creating holes
@@ -353,7 +353,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
LOG.warn("cudaMalloc failed after Lineage GPU cache eviction.");
}

// Step 6: Try eviction/clearing exactly one with size restriction
// Step 7: Try eviction/clearing exactly one with size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
synchronized (matrixMemoryManager.gpuObjects) {
@@ -377,7 +377,7 @@ }
}
}

// Step 7: Try eviction/clearing one-by-one based on the given policy without size restriction
// Step 8: Try eviction/clearing one-by-one based on the given policy without size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long currentAvailableMemory = allocator.getAvailableMemory();
@@ -411,7 +411,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
}
}

// Step 8: Handle defragmentation
// Step 9: Handle defragmentation
if(A == null) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
@@ -477,7 +477,8 @@ private void printPointers(Set<Pointer> pointers, StringBuilder sb) {
*
* @param toFree pointer to call cudaFree method on
* @param noStats if true, skip recording cuda deallocation statistics
*/
public void guardedCudaFree(Pointer toFree) {
public void guardedCudaFree(Pointer toFree, boolean noStats) {
long t0 = (!noStats && DMLScript.STATISTICS) ? System.nanoTime() : 0;
synchronized(allPointers) {
if(allPointers.containsKey(toFree)) {
long size = allPointers.get(toFree).getSizeInBytes();
@@ -495,9 +496,16 @@ public void guardedCudaFree(Pointer toFree) {
throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
}
}
if(DMLScript.STATISTICS && !noStats) {
GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
GPUStatistics.cudaDeAllocCount.add(1);
}
}

public void guardedCudaFree(Pointer toFree) {
guardedCudaFree(toFree, false);
}

/**
* Deallocate the pointer
*
@@ -517,9 +525,9 @@ public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntime
if(LOG.isTraceEnabled())
LOG.trace("Free-ing the pointer with eager=" + eager);
if (eager) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
//long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
guardedCudaFree(toFree);
addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
//addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
}
else {
long size = 0;
@@ -602,7 +610,7 @@ public void clearTemporaryMemory() {
Set<Pointer> unlockedDirtyOrCachedPointers = matrixMemoryManager.getPointers(false, true);
Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyOrCachedPointers);
for(Pointer tmpPtr : temporaryPointers) {
guardedCudaFree(tmpPtr);
guardedCudaFree(tmpPtr, true);
}
}

@@ -189,6 +189,7 @@ else if (e.isRDDPersist()) {
return false; //the executing thread removed this entry from cache
if (e.getCacheStatus() == LineageCacheStatus.TOCACHEGPU) { //second hit
//Cannot reuse as already garbage collected
if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
}
@@ -322,6 +323,7 @@ else if (e.isScalarValue()) {
case TOCACHEGPU:
//Cannot reuse as already garbage collected. putValue method
// will save the pointer while caching the original instruction
if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
return false;
case GPUCACHED:
//Increment the live count for this pointer
@@ -56,7 +56,8 @@ public class LineageCacheConfig
"^2", "*2", "uack+", "tak+*", "uacsqk+", "uark+", "n+", "uarimax", "qsort",
"qpick", "transformapply", "uarmax", "n+", "-*", "castdtm", "lowertri", "1-*",
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*", "==", "rmempty",
"conv2d_bias_add", "relu_maxpooling", "maxpooling", "softmax"
"conv2d_bias_add", "relu_maxpooling", "maxpooling", "batch_norm2d", "avgpooling",
"softmax"
//TODO: Reuse everything.
};

@@ -76,7 +77,7 @@ public class LineageCacheConfig
};

private static final String[] GPU_OPCODE_HEAVY = new String[] {
"conv2d_bias_add", "relu_maxpooling", "maxpooling" //DNN OPs
"conv2d_bias_add", "relu_maxpooling", "maxpooling", "batch_norm2d", "avgpooling" //DNN OPs
};

private static String[] REUSE_OPCODES = new String[] {};
@@ -31,6 +31,7 @@
import java.util.stream.Collectors;

import jcuda.Pointer;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
@@ -130,6 +131,21 @@ private static void removeEntry(LineageCacheEntry e) {
}
}

public static void removeAllEntries() {
List<Long> sizes = new ArrayList<>(freeQueues.keySet());
for (Long size : sizes) {
TreeSet<LineageCacheEntry> freeList = freeQueues.get(size);
LineageCacheEntry le = pollFirstFreeEntry(size);
while (le != null) {
// Free the pointer
_gpuContext.getMemoryManager().guardedCudaFree(le.getGPUPointer());
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuDel();
le = pollFirstFreeEntry(size);
}
}
}

public static void setGPUContext(GPUContext gpuCtx) {
_gpuContext = gpuCtx;
}
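
The new removeAllEntries method above implements the full GPU cache clearing mentioned in the commit message. A minimal usage sketch, assuming gCtx is the active GPUContext and a hypothetical tear-down call site (only setGPUContext and removeAllEntries are taken from the diff):

    // Hypothetical call site: drop every free cached GPU pointer held by the lineage cache,
    // e.g., before shutting down a GPU session. gCtx is assumed to be the active GPUContext.
    LineageGPUCacheEviction.setGPUContext(gCtx);
    LineageGPUCacheEviction.removeAllEntries(); // frees each cached pointer via guardedCudaFree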
