diff --git a/requirements.txt b/requirements.txt index 2d77d1e1..84da8460 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ numba numpy>=1.23.5,<2.0.0 ; python_version == '3.10' numpy>=1.26.4,<2.0.0 ; python_version != '3.10' rich -instructlab-dolomite>=0.1.1 +instructlab-dolomite>=0.2.0 trl>=0.9.4 peft pydantic>=2.7.0 diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index ab59282f..7910c341 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -4,6 +4,7 @@ from copy import deepcopy from pathlib import Path import argparse +import json import math import os import re @@ -528,6 +529,10 @@ def main(args): tokenizer = setup_tokenizer(args.model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE) # device = torch.device("cuda", args.local_rank) + with open(Path(args.model_name_or_path) / "config.json") as conf_json: + model_conf = json.load(conf_json) + args.model_type = model_conf["model_type"] + #### distributed init ##### torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) args.local_rank = int(os.environ["LOCAL_RANK"]) diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index d685d212..41b410c7 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -10,7 +10,6 @@ from typing import Any, List, Optional import importlib import inspect -import json import logging import os import random @@ -62,17 +61,10 @@ def check_valid_train_args(train_args: TrainingArgs): f"Provided path to model does not exist. Please make sure that you've passed a valid model and that it has appropriate permissions: {train_args.model_path}" ) - if train_args.use_dolomite: - with open(Path(train_args.model_path) / "config.json") as conf_json: - model_conf = json.load(conf_json) - if model_conf["model_type"] == "granite": - raise RuntimeError( - "Converting Granite models to Dolomite format is currently unsupported." - ) - if train_args.disable_flash_attn: - raise RuntimeError( - "ERROR: Trying to use dolomite padding-free transformer without flash attention is not supported" - ) + if train_args.use_dolomite and train_args.disable_flash_attn: + raise RuntimeError( + "ERROR: Trying to use dolomite padding-free transformer without flash attention is not supported" + ) if train_args.is_padding_free: print( @@ -229,7 +221,7 @@ def pad_collate_fn(batch): input_ids.extend(item["input_ids"].tolist()) labels.extend(item["labels"].tolist()) - position_ids.extend(range(total_len, total_len + item_len)) + position_ids.extend(range(item_len)) total_len += item_len num_loss_counted_tokens += (item["labels"] != -100).sum().item() @@ -802,10 +794,21 @@ def _get_state_dict_patched(model, unwrap=False): output_dir.mkdir(parents=True, exist_ok=True) if not model.module.config.architectures and convert_dolomite: - model.module.config.architectures = ["LlamaForCausalLM"] - warnings.warn( - f"Adding architectures to ckpt: {model.module.config.architectures}", - ) + arch_added = False + if args.model_type == "llama": + model.module.config.architectures = ["LlamaForCausalLM"] + arch_added = True + elif args.model_type == "granite": + model.module.config.architectures = ["GraniteForCausalLM"] + arch_added = True + if arch_added: + warnings.warn( + f"Adding architectures to ckpt: {model.module.config.architectures}", + ) + else: + warnings.warn( + f"Converting from dolomite, but no architecture field added to config.json", + ) model.module.config.to_json_file(output_config_file) tokenizer.save_pretrained(output_dir) @@ -834,7 +837,7 @@ def _get_state_dict_patched(model, unwrap=False): export_to_huggingface( pretrained_model_name_or_path=tmpdir.name, save_path=final_output_dir, - model_type="llama", + model_type=args.model_type, ) tmpdir.cleanup()