diff --git a/llamafile/copy.sh b/llamafile/copy.sh
index 4f0031fd5a..d0c464b93b 100755
--- a/llamafile/copy.sh
+++ b/llamafile/copy.sh
@@ -7,6 +7,7 @@ scp llama.cpp/ggml-cuda.cu \
     llama.cpp/ggml-cuda.h \
     llama.cpp/ggml-impl.h \
     llama.cpp/ggml-alloc.h \
+    llama.cpp/ggml-common.h \
     llama.cpp/ggml-backend.h \
     llama.cpp/ggml-backend-impl.h \
     llama.cpp/ggml.h \
diff --git a/llamafile/cuda.bat b/llamafile/cuda.bat
index d6936d1339..a47067c86c 100644
--- a/llamafile/cuda.bat
+++ b/llamafile/cuda.bat
@@ -8,7 +8,6 @@ nvcc -arch=all ^
      --shared ^
      --forward-unknown-to-host-compiler ^
      -Xcompiler="/nologo /EHsc /O2 /GR /MT" ^
-     -use_fast_math ^
      -DNDEBUG ^
      -DGGML_BUILD=1 ^
      -DGGML_SHARED=1 ^
diff --git a/llamafile/rocm.bat b/llamafile/rocm.bat
index 5a4b039b18..58874d3c69 100644
--- a/llamafile/rocm.bat
+++ b/llamafile/rocm.bat
@@ -19,7 +19,7 @@
 ::
 :: TODO(jart): How do we get this to not depend on VCRUNTIME140?
 
-%HIP_PATH%\bin\clang++.exe ^
+"%HIP_PATH%\bin\clang++.exe" ^
   -fuse-ld=lld ^
   -shared ^
   -nostartfiles ^
@@ -36,7 +36,7 @@
   -D_XOPEN_SOURCE=600 ^
   -D__HIP_PLATFORM_AMD__=1 ^
   -D__HIP_PLATFORM_HCC__=1 ^
-  -isystem %HIP_PATH%\include ^
+  -isystem "%HIP_PATH%\include" ^
   -O3 ^
   -DNDEBUG ^
   -D_DLL ^
@@ -50,7 +50,5 @@
   --offload-arch=gfx1010,gfx1012,gfx906,gfx1030,gfx1031,gfx1032,gfx1100,gfx1101,gfx1102,gfx1103 ^
   -o ggml-rocm.dll ^
   ggml-cuda.cu ^
-  -l%HIP_PATH%\lib\hipblas.lib ^
-  -l%HIP_PATH%\lib\rocblas.lib ^
-  -l%HIP_PATH%\lib\amdhip64.lib ^
+  "-l%HIP_PATH%\lib\amdhip64.lib" ^
   -lkernel32
diff --git a/llamafile/version.h b/llamafile/version.h
index 3f702f9040..a20f962edb 100644
--- a/llamafile/version.h
+++ b/llamafile/version.h
@@ -1,8 +1,8 @@
 #pragma once
 
 #define LLAMAFILE_MAJOR 0
-#define LLAMAFILE_MINOR 6
-#define LLAMAFILE_PATCH 2
+#define LLAMAFILE_MINOR 7
+#define LLAMAFILE_PATCH 0
 #define LLAMAFILE_VERSION \
     (100000000 * LLAMAFILE_MAJOR + 1000000 * LLAMAFILE_MINOR + LLAMAFILE_PATCH)