[SYSTEMDS-3567] Update allocation logic in GPU
This patch updates the order of the steps for GPU allocation. We now
recycle cached pointers before freeing any pointers (inside or outside
of the cache). This patch also provides a method to clear the full GPU
cache, fixes bugs, and extends lineage tracing for missing DNN operators.
phaniarnab committed Dec 1, 2023
1 parent 7b53ca2 commit 8a38477
Showing 6 changed files with 77 additions and 44 deletions.
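
For orientation before the hunks, the following is a condensed sketch of the reordered allocation path in the GPU memory manager's malloc method (the third file in this diff). It is a simplified reading of the changes, not the actual code: statistics, logging, and locking are omitted, step 1 (outside the shown hunks) is untouched by this patch, and recycleFromLineageCache is a placeholder name for the lineage-cache recycling block.

    Pointer A = null;                  // step 1 (not shown) may already have set this
    Pointer tmpA = new Pointer();
    // Step 2: plain allocation while free GPU memory is still available
    if (A == null && allocator.canAllocate(size))
        A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
    // Step 3 (now before any free): recycle a cached pointer from the lineage cache
    if (A == null && !LineageCacheConfig.ReuseCacheType.isNone())
        A = recycleFromLineageCache(size); // placeholder for the recycling hunk below
    // Step 4: free one non-exact rmvar pointer, then retry the allocation
    if (A == null) {
        Pointer r = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
        if (r != null) {
            guardedCudaFree(r);
            A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
        }
    }
    // Step 5: eagerly clear all rmvar pointers, then retry the allocation
    if (A == null) {
        lazyCudaFreeMemoryManager.clearAll();
        if (allocator.canAllocate(size))
            A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
    }
    // Steps 6-9: free lineage-cache intermediates, evict GPU objects (first with,
    // then without a size restriction), and finally clear everything on fragmentation.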
@@ -23,6 +23,7 @@
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLScriptException;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;

@@ -44,7 +45,7 @@ public void processInstruction(ExecutionContext ec) {
//core execution
if ( opcode.equalsIgnoreCase("print") ) {
String outString = so.getLanguageSpecificStringValue();

// print to stdout only when suppress flag in DMLScript is not set.
// The flag will be set, for example, when SystemDS is invoked in fenced mode from Jaql.
if (!DMLScript.suppressPrint2Stdout())
@@ -23,6 +23,7 @@

import org.apache.commons.lang3.tuple.Pair;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -323,7 +324,7 @@ else if (opcode.equalsIgnoreCase("batch_norm2d") || opcode.equalsIgnoreCase("lst
CPOperand in6 = new CPOperand(parts[6]); // mode
CPOperand in7 = new CPOperand(parts[7]); // epsilon
CPOperand in8 = new CPOperand(parts[8]); // exponentialAverageFactor
CPOperand out = new CPOperand(parts[9]); // ret
CPOperand out = new CPOperand(parts[9], Types.ValueType.FP64, Types.DataType.MATRIX); // ret
CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
@@ -902,10 +903,14 @@ public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
inputs.add(_input6);
inputs.add(_input7);
inputs.add(_input8);
inputs.addAll(_input_shape);
inputs.addAll(_filter_shape);
inputs.addAll(_stride);
inputs.addAll(_padding);
if (_input_shape != null && !_input_shape.isEmpty())
inputs.addAll(_input_shape);
if (_filter_shape != null && !_filter_shape.isEmpty())
inputs.addAll(_filter_shape);
if (_stride != null && !_stride.isEmpty())
inputs.addAll(_stride);
if (_padding != null && !_padding.isEmpty())
inputs.addAll(_padding);
return Pair.of(_output.getName(),
new LineageItem(getOpcode(), LineageItemUtils.getLineage(ec, inputs.toArray(new CPOperand[0]))));
}
@@ -261,35 +261,13 @@ public Pointer malloc(String opcode, long size, boolean initialize) {

Pointer tmpA = (A == null) ? new Pointer() : null;
// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
// Step 3 has potential to create holes as well as limit future reuse, hence perform this step before step 3.
// Step 4 has potential to create holes as well as limit future reuse, hence perform this step before step 4.
if(A == null && allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
}

// Step 3: Try reusing non-exact match entry of rmvarGPUPointers
if(A == null) {
A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
if(A != null) {
guardedCudaFree(A);
A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
if(A == null)
LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
}
}

// Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
// Evictions of matrix blocks are expensive (as they might lead them to be written to disk in case of smaller CPU budget)
// than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred over eviction (step 6, 7, 8).
if(A == null) {
lazyCudaFreeMemoryManager.clearAll();
if(allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
}
}

// Step 5.1: Recycle, delete or evict gpu intermediates from lineage cache

// Step 3: Recycle gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
// Recycle a cached pointer if exactly matches the required size
@@ -316,8 +294,30 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
}

// Step 5.2: Use a non-exact sized pointer

// Step 4: Try reusing non-exact match entry of rmvarGPUPointers
if(A == null) {
A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
if(A != null) {
guardedCudaFree(A);
A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
if(A == null)
LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
}
}

// Step 5: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
// Evictions of matrix blocks are more expensive than cuda free/malloc/memset (as the blocks might
// have to be written to disk when the CPU memory budget is small). So, rmvar-ing all blocks (step 5)
// is preferred over eviction (steps 7, 8, 9).
if(A == null) {
lazyCudaFreeMemoryManager.clearAll();
if(allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
}
}

// Step 6: Free gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long freedSize = 0;
@@ -340,7 +340,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}
if (freedSize > size)
if (freedSize >= size)
A = cudaMallocNoWarn(tmpA, size, "recycle non-exact match of lineage cache");
// Else, deallocate another free pointer. We are calling pollFistFreeNotExact with
// the same size (not with freedSize-size) to reduce the potential for creating holes
@@ -353,7 +353,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
LOG.warn("cudaMalloc failed after Lineage GPU cache eviction.");
}

// Step 6: Try eviction/clearing exactly one with size restriction
// Step 7: Try eviction/clearing exactly one with size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
synchronized (matrixMemoryManager.gpuObjects) {
@@ -377,7 +377,7 @@ }
}
}

// Step 7: Try eviction/clearing one-by-one based on the given policy without size restriction
// Step 8: Try eviction/clearing one-by-one based on the given policy without size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long currentAvailableMemory = allocator.getAvailableMemory();
@@ -411,7 +411,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
}
}

// Step 8: Handle defragmentation
// Step 9: Handle defragmentation
if(A == null) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
@@ -477,7 +477,8 @@ private void printPointers(Set<Pointer> pointers, StringBuilder sb) {
*
* @param toFree pointer to call cudaFree method on
* @param noStats if true, skip recording cuda deallocation statistics
*/
public void guardedCudaFree(Pointer toFree) {
public void guardedCudaFree(Pointer toFree, boolean noStats) {
long t0 = (!noStats && DMLScript.STATISTICS) ? System.nanoTime() : 0;
synchronized(allPointers) {
if(allPointers.containsKey(toFree)) {
long size = allPointers.get(toFree).getSizeInBytes();
@@ -495,9 +496,16 @@ public void guardedCudaFree(Pointer toFree) {
throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
}
}
if(DMLScript.STATISTICS && !noStats) {
GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
GPUStatistics.cudaDeAllocCount.add(1);
}
}

public void guardedCudaFree(Pointer toFree) {
guardedCudaFree(toFree, false);
}

/**
* Deallocate the pointer
*
@@ -517,9 +525,9 @@ public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntime
if(LOG.isTraceEnabled())
LOG.trace("Free-ing the pointer with eager=" + eager);
if (eager) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
//long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
guardedCudaFree(toFree);
addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
//addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
}
else {
long size = 0;
@@ -602,7 +610,7 @@ public void clearTemporaryMemory() {
Set<Pointer> unlockedDirtyOrCachedPointers = matrixMemoryManager.getPointers(false, true);
Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyOrCachedPointers);
for(Pointer tmpPtr : temporaryPointers) {
guardedCudaFree(tmpPtr);
guardedCudaFree(tmpPtr, true);
}
}

@@ -189,6 +189,7 @@ else if (e.isRDDPersist()) {
return false; //the executing thread removed this entry from cache
if (e.getCacheStatus() == LineageCacheStatus.TOCACHEGPU) { //second hit
//Cannot reuse as already garbage collected
if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
}
@@ -322,6 +323,7 @@ else if (e.isScalarValue()) {
case TOCACHEGPU:
//Cannot reuse as already garbage collected. putValue method
// will save the pointer while caching the original instruction
if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
return false;
case GPUCACHED:
//Increment the live count for this pointer
@@ -56,7 +56,8 @@ public class LineageCacheConfig
"^2", "*2", "uack+", "tak+*", "uacsqk+", "uark+", "n+", "uarimax", "qsort",
"qpick", "transformapply", "uarmax", "n+", "-*", "castdtm", "lowertri", "1-*",
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*", "==", "rmempty",
"conv2d_bias_add", "relu_maxpooling", "maxpooling", "softmax"
"conv2d_bias_add", "relu_maxpooling", "maxpooling", "batch_norm2d", "avgpooling",
"softmax"
//TODO: Reuse everything.
};

@@ -76,7 +77,7 @@ public class LineageCacheConfig
};

private static final String[] GPU_OPCODE_HEAVY = new String[] {
"conv2d_bias_add", "relu_maxpooling", "maxpooling" //DNN OPs
"conv2d_bias_add", "relu_maxpooling", "maxpooling", "batch_norm2d", "avgpooling" //DNN OPs
};

private static String[] REUSE_OPCODES = new String[] {};
@@ -31,6 +31,7 @@
import java.util.stream.Collectors;

import jcuda.Pointer;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
@@ -130,6 +131,21 @@ private static void removeEntry(LineageCacheEntry e) {
}
}

public static void removeAllEntries() {
List<Long> sizes = new ArrayList<>(freeQueues.keySet());
for (Long size : sizes) {
TreeSet<LineageCacheEntry> freeList = freeQueues.get(size);
LineageCacheEntry le = pollFirstFreeEntry(size);
while (le != null) {
// Free the pointer
_gpuContext.getMemoryManager().guardedCudaFree(le.getGPUPointer());
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuDel();
le = pollFirstFreeEntry(size);
}
}
}

public static void setGPUContext(GPUContext gpuCtx) {
_gpuContext = gpuCtx;
}
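
The new removeAllEntries method above implements the full GPU cache clearing mentioned in the commit message. A minimal usage sketch, assuming gCtx is the active GPUContext and a hypothetical tear-down call site (only setGPUContext and removeAllEntries are taken from the diff):

    // Hypothetical call site: drop every free cached GPU pointer held by the lineage cache,
    // e.g., before shutting down a GPU session. gCtx is assumed to be the active GPUContext.
    LineageGPUCacheEviction.setGPUContext(gCtx);
    LineageGPUCacheEviction.removeAllEntries(); // frees each cached pointer via guardedCudaFree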
