Skip to content

Commit

Permalink
llama/ggml: add LLM training support
Browse files Browse the repository at this point in the history
more compact progress bar

refactor: llama_prepare_sbatch/ubatch

llama_save_model_to_file

gqa_mode arg for repeat_back

llama_opt_param_filter

ggml_graph_dup force_grads

refactor ggml_opt, fix test-opt
  • Loading branch information
JohannesGaessler committed Jan 27, 2025
1 parent a5203b4 commit c255573
Show file tree
Hide file tree
Showing 26 changed files with 1,288 additions and 333 deletions.
16 changes: 16 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2177,3 +2177,19 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return result;
}

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
const int64_t ne_datapoint = llama_n_ctx(ctx);
const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
ggml_opt_dataset_t result = ggml_opt_dataset_init(
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;

for (int64_t idata = 0; idata < ndata; ++idata) {
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
}

return result;
}
6 changes: 6 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -697,3 +697,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// training utils
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ else()
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
add_subdirectory(training)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
Expand Down
5 changes: 5 additions & 0 deletions examples/training/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
17 changes: 17 additions & 0 deletions examples/training/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# llama.cpp/examples/training

This directory contains examples related to language model training using llama.cpp/GGML.
So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
**For CUDA training, use the maximum number of GPU layers.**

Proof of concept:

``` sh
export model_name=llama_3.2-1b && export quantization=f32
./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
```

The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
97 changes: 97 additions & 0 deletions examples/training/finetune.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

int main(int argc, char ** argv) {
common_params params;

params.logits_all = true;
params.escape = false;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
return 1;
}

if (params.use_mmap) {
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
params.use_mmap = false;
}
if (params.cache_type_k == GGML_TYPE_F16) {
LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
params.cache_type_k = GGML_TYPE_F32;
}
if (params.cache_type_v == GGML_TYPE_F16) {
LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
params.cache_type_v = GGML_TYPE_F32;
}

common_init();
llama_backend_init();
llama_numa_init(params.numa);

// load the model and apply lora adapter, if any
common_init_result llama_init = common_init_from_params(params);
llama_model_ptr & model = llama_init.model;
llama_context_ptr & ctx = llama_init.context;

if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}

// print system information
{
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}

constexpr float val_split = 0.05f;

std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);

struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
optimizer_params.adamw.alpha = 1e-7f; // learning rate

struct llama_opt_params lopt_params {
/*n_ctx_train =*/ 0,
/*param_filter =*/ llama_opt_param_filter_all,
/*param_filter_ud =*/ nullptr,
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
/*get_opt_pars_ud =*/ &optimizer_params,
};
llama_opt_init(ctx.get(), model.get(), lopt_params);

const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);

ggml_opt_result_t result_train = ggml_opt_result_init();
ggml_opt_result_t result_eval = ggml_opt_result_init();

for (int epoch = 0; epoch < 2; ++epoch) {
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
fprintf(stderr, "\n");

ggml_opt_result_reset(result_train);
ggml_opt_result_reset(result_eval);
}
ggml_opt_result_free(result_train);
ggml_opt_result_free(result_eval);

llama_model_save_to_file(model.get(), "finetuned-model.gguf");

llama_backend_free();

return 0;
}
75 changes: 47 additions & 28 deletions ggml/include/ggml-opt.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ======

GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
enum ggml_type type_data, // the type for the internal data tensor
enum ggml_type type_label, // the type for the internal labels tensor
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);

// get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]

Expand All @@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
ggml_opt_dataset_t dataset,
void * data_batch,
size_t nb_data_batch,
void * labels_batch,
int64_t ibatch);

// ====== Model / Context ======

enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD,
GGML_OPT_BUILD_TYPE_GRAD,
GGML_OPT_BUILD_TYPE_OPT,
GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT = 30,
};

// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
Expand All @@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);

// returns the default optimizer params (constant)
// returns the default optimizer params (constant, hard-coded values)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);

// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);

// parameters for initializing a new optimization context
struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs

struct ggml_context * ctx_compute; // created in user code, holds non-static tensors

// the forward graph is defined by inputs and outputs
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;
// by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
struct ggml_context * ctx_compute;
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;

enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type;
Expand All @@ -107,12 +118,9 @@ extern "C" {

// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
enum ggml_opt_loss_type loss_type);

GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
Expand All @@ -121,18 +129,20 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

// get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels

// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

// ====== Optimization Result ======

GGML_API ggml_opt_result_t ggml_opt_result_init();
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);

Expand All @@ -144,11 +154,20 @@ extern "C" {

// ====== Computation ======

// do forward pass, increment result if not NULL
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_prepare_alloc(
ggml_opt_context_t opt_ctx,
struct ggml_context * ctx_compute,
struct ggml_cgraph * gf,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs);

// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);

// do forward pass, increment result if not NULL, do backward pass
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
Expand Down Expand Up @@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
Expand Down
13 changes: 6 additions & 7 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ extern "C" {
// Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);

//
Expand Down Expand Up @@ -933,7 +933,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride

// concat a and b along dim
// used in stable-diffusion
Expand Down Expand Up @@ -2054,15 +2054,14 @@ extern "C" {

GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
struct ggml_context * ctx_compute, // context for gradient computation
struct ggml_cgraph * cgraph,
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
struct ggml_context * ctx, // context for gradient computation
struct ggml_cgraph * cgraph,
struct ggml_tensor ** grad_accs);

// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

const int node_backend_id = tensor_backend_id(node);

assert(node_backend_id != -1); // all nodes should be assigned by now
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
Expand Down
Loading

0 comments on commit c255573

Please sign in to comment.