diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index 486ec40694f..009be82de3a 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
@@ -92,6 +92,9 @@ public class LibMatrixReorg {
 	
 	//use csr instead of mcsr sparse block for rexpand columns / diag v2m
 	public static final boolean SPARSE_OUTPUTS_IN_CSR = true;
+
+	//use legacy in-place transpose dense instead of brenner in-place transpose dense
+	public static final boolean TRANSPOSE_IN_PLACE_DENSE_LEGACY = true;
 	
 	private enum ReorgType {
 		TRANSPOSE,
@@ -1789,19 +1792,23 @@ else if(cols == rows){
 			transposeInPlaceTrivial(in.getDenseBlockValues(), cols, k);
 		}
 		else {
-			if(cols<rows){
-				// important to set dims after
-				c2r(in, k);
-				values.setDims(new int[]{rows,cols});
-				in.setNumColumns(cols);
-				in.setNumRows(rows);
-			}
-			else{
-				// important to set dims before
-				values.setDims(new int[]{rows,cols});
-				in.setNumColumns(cols);
-				in.setNumRows(rows);
-				r2c(in, k);
+			if(TRANSPOSE_IN_PLACE_DENSE_LEGACY) {
+				if(cols < rows) {
+					// important to set dims after
+					c2r(in, k);
+					values.setDims(new int[] {rows, cols});
+					in.setNumColumns(cols);
+					in.setNumRows(rows);
+				}
+				else {
+					// important to set dims before
+					values.setDims(new int[] {rows, cols});
+					in.setNumColumns(cols);
+					in.setNumRows(rows);
+					r2c(in, k);
+				}
+			} else {
+				transposeInPlaceDenseBrenner(in, k);
 			}
 		}
 
@@ -4227,4 +4234,234 @@ public Object call() {
 			return null;
 		}
 	}
+
+	/**
+	 * Transposes a dense matrix in-place using following cycles based on Brenner's method. This
+	 * method shifts cycles with a focus on less storage by using cycle leaders based on prime factorization. The used
+	 * storage is in O(n+m). Quadratic matrices should be handled outside this method (using the trivial method) for a
+	 * speedup. This method is based on: Algorithm 467, Brenner, https://dl.acm.org/doi/pdf/10.1145/355611.362542.
+	 *
+	 * @param in The input matrix to be transposed.
+	 * @param k  The number of threads.
+	 */
+	public static void transposeInPlaceDenseBrenner(MatrixBlock in, int k) {
+
+		DenseBlock denseBlock = in.getDenseBlock();
+		double[] matrix = in.getDenseBlockValues();
+
+		final int rows = in.getNumRows();
+		final int cols = in.getNumColumns();
+
+		// Brenner: rows + cols / 2 is sufficient for most cases.
+		int workSize = rows + cols;
+		int maxIndex = rows * cols - 1;
+
+		// prime factorization of the maximum index to identify cycle structures
+		// Brenner: length 8 is sufficient up to maxIndex 2*3*5*...*19 = 9,767,520.
+		int[] primes = new int[8];
+		int[] exponents = new int[8];
+		int[] powers = new int[8];
+		int numPrimes = primeFactorization(maxIndex, primes, exponents, powers);
+
+		int[] iExponents = new int[numPrimes];
+		int div = 1;
+
+		div:
+		while(div < maxIndex / 2) {
+
+			// number of indices divisible by div and no other divisor of maxIndex
+			int count = eulerTotient(primes, exponents, iExponents, numPrimes, maxIndex / div);
+			// all false
+			boolean[] moved = new boolean[workSize];
+			// starting point cycle
+			int start = div;
+
+			count:
+			do {
+				// companion of start
+				int comp = maxIndex - start;
+
+				if(start == div) {
+					// shift cycles
+					count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
+					start += div;
+				}
+				else if(start < workSize && moved[start]) {
+					// already moved
+					start += div;
+				}
+				else {
+					// handle other cycle starts
+					int cycleLeader = start / div;
+					for(int ip = 0; ip < numPrimes; ip++) {
+						if(iExponents[ip] != exponents[ip] && cycleLeader % primes[ip] == 0) {
+							start += div;
+							continue count;
+						}
+					}
+
+					if(start < workSize) {
+						count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
+						start += div;
+						continue;
+					}
+
+					int test = start;
+					do {
+						test = prevIndexCycle(test, rows, cols);
+						if(test < start || test > comp) {
+							start += div;
+							continue count;
+						}
+					}
+					while(test > start && test < comp);
+
+					count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
+					start += div;
+				}
+			}
+			while(count > 0);
+
+			// update cycle divisor for the next set of cycles based on prime factors
+			for(int ip = 0; ip < numPrimes; ip++) {
+				if(iExponents[ip] != exponents[ip]) {
+					iExponents[ip]++;
+					div *= primes[ip];
+					continue div;
+				}
+				iExponents[ip] = 0;
+				div /= powers[ip];
+			}
+		}
+
+		denseBlock.setDims(new int[] {cols, rows});
+		in.setNumColumns(rows);
+		in.setNumRows(cols);
+	}
+
+	/**
+	 * Performs a simultaneous cycle shift for a cycle and its companion cycle. This method ensures that distinct cycles
+	 * or self-dual cycles are handled correctly. This method is based on: Algorithm 2, Karlsson,
+	 * https://webapps.cs.umu.se/uminf/reports/2009/011/part1.pdf and Algorithm 467, Brenner,
+	 * https://dl.acm.org/doi/pdf/10.1145/355611.362542.
+	 *
+	 * @param matrix   The matrix whose elements are being shifted.
+	 * @param moved    Boolean array tracking whether an element has already been moved.
+	 * @param rows     The number of rows in the matrix.
+	 * @param maxIndex The maximum valid index in the matrix.
+	 * @param count    The number of elements left to process.
+	 * @param workSize The length of moved.
+	 * @param start    The starting index for the cycle shift.
+	 * @param comp     The corresponding companion index.
+	 * @return The updated count of elements remaining to shift.
+	 */
+	private static int simultaneousCycleShift(double[] matrix, boolean[] moved, int rows, int maxIndex, int count,
+		int workSize, int start, int comp) {
+
+		int orig = start;
+		double val = matrix[orig];
+		double cval = matrix[comp];
+
+		while(orig >= 0) {
+			// decrease the remaining shift count by orig and comp
+			count -= 2;
+			orig = simultaneousCycleShiftStep(matrix, moved, rows, maxIndex, workSize, start, orig, val, cval);
+		}
+		return count;
+	}
+
+	private static int simultaneousCycleShiftStep(double[] matrix, boolean[] moved, int rows, int maxIndex,
+		int workSize, int start, int orig, double val, double cval) {
+
+		int comp = maxIndex - orig;
+		int prevOrig = prevIndexCycle(orig, rows, (maxIndex + 1) / rows);
+		int prevComp = maxIndex - prevOrig;
+
+		if(orig < workSize)
+			moved[orig] = true;
+		if(comp < workSize)
+			moved[comp] = true;
+
+		if(prevOrig == start) {
+			// cycle and comp are distinct
+			matrix[orig] = val;
+			matrix[comp] = cval;
+			return -1;
+		}
+		else if(prevComp == start) {
+			// cycle is self dual
+			matrix[orig] = cval;
+			matrix[comp] = val;
+			return -1;
+		}
+
+		// shift the values to their next positions
+		matrix[orig] = matrix[prevOrig];
+		matrix[comp] = matrix[prevComp];
+		// update
+		return prevOrig;
+	}
+
+	private static int prevIndexCycle(int index, int rows, int cols) {
+		int lastIndex = rows * cols - 1;
+		if(index == lastIndex)
+			return lastIndex;
+		long temp = (long) index * cols;
+		return (int) (temp % lastIndex);
+	}
+
+	/**
+	 * Performs prime factorization of a given number n. The method calculates the prime factors of n, their exponents,
+	 * powers and stores the results in the provided arrays.
+	 *
+	 * @param n         The number to be factorized.
+	 * @param primes    Array to store the unique prime factors of n.
+	 * @param exponents Array to store the exponents of the respective prime factors.
+	 * @param powers    Array to store the powers of the respective prime factors.
+	 * @return The number of unique prime factors.
+	 */
+	private static int primeFactorization(int n, int[] primes, int[] exponents, int[] powers) {
+		int pIdx = -1;
+		int currDiv = 0;
+		int rest = n;
+		int div = 2;
+
+		while(rest > 1) {
+			int quotient = rest / div;
+			if(rest - div * quotient == 0) {
+				if(div == currDiv) {
+					// current divisor is the same as the last one
+					powers[pIdx] *= div;
+					exponents[pIdx]++;
+				}
+				else {
+					// new prime factor found
+					pIdx++;
+					if(pIdx >= primes.length)
+						throw new RuntimeException("Not enough space, need to expand input arrays.");
+
+					primes[pIdx] = div;
+					powers[pIdx] = div;
+					currDiv = div;
+					exponents[pIdx] = 1;
+				}
+				rest = quotient;
+			}
+			else {
+				// only odd divs
+				div = (div == 2) ? 3 : div + 2;
+			}
+		}
+		return pIdx + 1;
+	}
+
+	private static int eulerTotient(int[] primes, int[] exponents, int[] iExponents, int numPrimes, int count) {
+		for(int ip = 0; ip < numPrimes; ip++) {
+			if(iExponents[ip] == exponents[ip]) {
+				continue;
+			}
+			count = (count / primes[ip]) * (primes[ip] - 1);
+		}
+		return count;
+	}
 }
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/libMatrixReorg/TransposeInPlaceBrennerTest.java b/src/test/java/org/apache/sysds/test/component/matrix/libMatrixReorg/TransposeInPlaceBrennerTest.java
new file mode 100644
index 00000000000..87e169f3f3f
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/matrix/libMatrixReorg/TransposeInPlaceBrennerTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.matrix.libMatrixReorg;
+
+import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertThrows;
+import static org.junit.Assert.assertTrue;
+
+public class TransposeInPlaceBrennerTest {
+
+	@Test
+	public void transposeInPlaceDenseBrennerOnePrime() {
+		// 3*4-1 = 11
+		testTransposeInPlaceDense(3, 4, 1);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerTwoPrimes() {
+		// 4*9-1 = 5*7
+		testTransposeInPlaceDense(4, 9, 0.96);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerThreePrimes() {
+		// 2*53-1 = 3*5*7
+		testTransposeInPlaceDense(2, 53, 0.52);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerThreePrimesOneExpo() {
+		// 1151*2999-1 = (2**3)*3*143827
+		testTransposeInPlaceDense(1151, 2999, 0.82);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerThreePrimesAllExpos() {
+		// 9*10889-1 = (2**4)*(5**3)*(7**2)
+		testTransposeInPlaceDense(9, 10889, 0.74);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerFourPrimesOneExpo() {
+		// 53*4421-1 = (2**3)*3*13*751
+		testTransposeInPlaceDense(53, 4421, 0.75);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerFivePrimes() {
+		// 3*3337-1 = 2*5*7*11*13
+		testTransposeInPlaceDense(3, 3337, 0.68);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerSixPrimesOneExpo() {
+		// 53*7177-1 = (2**2)*5*7*11*13*19
+		testTransposeInPlaceDense(53, 7177, 0.78);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerSevenPrimesThreeExpos() {
+		// 2087*17123-1 = (2**2)*3*(5**2)*(7**2)*11*13*17
+		testTransposeInPlaceDense(2087, 17123, 0.79);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerEightPrimes() {
+		// 347*27953-1 = 2*3*5*7*11*13*17*19
+		testTransposeInPlaceDense(347, 27953, 0.86);
+	}
+
+	@Test
+	public void transposeInPlaceDenseBrennerNinePrimes() {
+		// 317*703763-1 = 2*3*5*7*11*13*17*19*23
+		MatrixBlock X = MatrixBlock.randOperations(317, 703763, 0.52);
+
+		Exception exception = assertThrows(RuntimeException.class,
+			() -> LibMatrixReorg.transposeInPlaceDenseBrenner(X, 1));
+		assertTrue(exception.getMessage().contains("Not enough space, need to expand input arrays."));
+	}
+
+	private void testTransposeInPlaceDense(int rows, int cols, double sparsity) {
+		MatrixBlock X = MatrixBlock.randOperations(rows, cols, sparsity);
+		MatrixBlock tX = LibMatrixReorg.transpose(X);
+
+		LibMatrixReorg.transposeInPlaceDenseBrenner(X, 1);
+
+		TestUtils.compareMatrices(X, tX, 0);
+	}
+
+}