From ab164df51a28a23866202fbfc6ab276f94678c9e Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 24 Jan 2025 10:40:09 +0800 Subject: [PATCH 01/31] update --- swift/llm/template/grounding.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 swift/llm/template/grounding.py diff --git a/swift/llm/template/grounding.py b/swift/llm/template/grounding.py new file mode 100644 index 0000000000..e69de29bb2 From ed341e113da3aca939dde13fc86d9c794dc7cf62 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 24 Jan 2025 16:00:52 +0800 Subject: [PATCH 02/31] update --- ...\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +- docs/source_en/Instruction/Command-line-parameters.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 00dd6dafe6..3507387c3f 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -44,7 +44,7 @@ ### 模板参数 - 🔥template: 对话模板类型,默认使用model对应的template类型。`swift pt`会将对话模版转为生成模板使用 -- 🔥system: 自定义system字段,可以是一个txt文件地址,默认为None,使用template的默认system +- 🔥system: 自定义system字段,可以传入字符串或者txt文件路径。默认为None,使用template的默认system - 🔥max_length: 单样本的tokens最大长度。默认为None,设置为模型支持的tokens最大长度(max_model_len) - truncation_strategy: 如果超长如何处理,支持`delete`, `left`和`right`,代表删除、左侧裁剪和右侧裁剪,默认为'delete' - 🔥max_pixels: 多模态模型图片前处理的最大像素数(H\*W),默认不缩放。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index e9f28d5bc0..b6069ee07a 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -44,7 +44,7 @@ The introduction to command line parameters will cover base arguments, atomic ar ### Template Arguments - 🔥template: Type of dialogue template, which defaults to the template type corresponding to the model. `swift pt` will convert the dialogue template into a generation template for use. -- 🔥system: Custom system field, could be a txt file path, default is None, uses the default system of the template. +- 🔥system: Custom system field, can take a string or txt file path as input. Default is None, uses the default system of the template. - 🔥max_length: The maximum length of tokens for a single sample. Defaults to None, set to the maximum length of tokens supported by the model (max_model_len). - truncation_strategy: How to handle overly long tokens, supports `delete`, `left`, `right`, representing deletion, left trimming, and right trimming, default is 'delete'. - 🔥max_pixels: Maximum pixel count for pre-processing images in multimodal models (H*W), default is no scaling. 
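The `system` doc change above means the flag now accepts either an inline string or the path to a txt file whose contents are used as the system prompt. A minimal sketch of both forms, assuming a placeholder dataset and prompt file (neither the dataset choice nor `./system_prompt.txt` comes from this patch):

# pass the system prompt inline as a string
swift sft \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
    --system 'You are a helpful assistant.'

# or point --system at a txt file containing the prompt
swift sft \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
    --system ./system_prompt.txt
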
From 5b252beeabdb50393f51c878c3d98f11a622a220 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 24 Jan 2025 17:24:03 +0800 Subject: [PATCH 03/31] update --- swift/llm/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/swift/llm/utils.py b/swift/llm/utils.py index 7445589c97..0af4f15d54 100644 --- a/swift/llm/utils.py +++ b/swift/llm/utils.py @@ -20,6 +20,9 @@ except ImportError: Processor = Union[PreTrainedTokenizerBase, FeatureExtractionMixin, HfProcessorMixin] +if 'TOKENIZERS_PARALLELISM' not in os.environ: + os.environ['TOKENIZERS_PARALLELISM'] = 'false' + logger = get_logger() Tool = Dict[str, Union[str, Dict]] From 743c12c8ed63b43ce852587023bec28b3f244d49 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 26 Jan 2025 16:25:28 +0800 Subject: [PATCH 04/31] update --- examples/train/multimodal/grounding.sh | 6 +++-- swift/llm/dataset/dataset/mllm.py | 29 +++++++++++++++++++++ swift/llm/dataset/preprocessor/core.py | 28 ++++++++++++++++++++- swift/llm/template/template/qwen.py | 35 ++++++++------------------ 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/examples/train/multimodal/grounding.sh b/examples/train/multimodal/grounding.sh index 50e17804f8..c2213660d9 100644 --- a/examples/train/multimodal/grounding.sh +++ b/examples/train/multimodal/grounding.sh @@ -1,9 +1,10 @@ +# 20GiB # You can refer to `https://github.com/QwenLM/Qwen2-VL` for the meaning of the `MAX_PIXELS` parameter. CUDA_VISIBLE_DEVICES=0 \ MAX_PIXELS=1003520 \ swift sft \ --model Qwen/Qwen2-VL-7B-Instruct \ - --dataset 'swift/refcoco:grounding#1000' \ + --dataset 'AI-ModelScope/coco#20000' \ --train_type lora \ --torch_dtype bfloat16 \ --num_train_epochs 1 \ @@ -22,4 +23,5 @@ swift sft \ --max_length 2048 \ --output_dir output \ --warmup_ratio 0.05 \ - --dataloader_num_workers 4 + --dataloader_num_workers 4 \ + --dataset_num_proc 4 diff --git a/swift/llm/dataset/dataset/mllm.py b/swift/llm/dataset/dataset/mllm.py index 59635f7ca1..0584bbe00f 100644 --- a/swift/llm/dataset/dataset/mllm.py +++ b/swift/llm/dataset/dataset/mllm.py @@ -1086,6 +1086,35 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: tags=['multi-modal', 'en', 'vqa', 'quality'])) +class CocoPreprocessor(ResponsePreprocessor): + category = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', + 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', + 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', + 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', + 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' + ] + + def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: + row['query'] = 'Task: Object Detection' + objects = row['objects'] + objects['ref'] = [self.category[c] for c in objects['category']] + row['response'] = ','.join([''] * len(objects['ref'])) + return super().preprocess(row) + + +register_dataset( + DatasetMeta( + ms_dataset_id='AI-ModelScope/coco', + 
hf_dataset_id='detection-datasets/coco', + preprocess_func=CocoPreprocessor(), + huge_dataset=True, + tags=['multi-modal', 'en', 'vqa', 'quality'])) + + class LLaVAMixSFTPreprocessor(RowPreprocessor): def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index 76b77992ce..9749ca8938 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -8,7 +8,7 @@ from datasets import Dataset as HfDataset from datasets import Image from datasets import IterableDataset as HfIterableDataset -from datasets import Value +from datasets import Sequence, Value from swift.llm import history_to_messages from swift.utils import get_logger @@ -143,6 +143,28 @@ def _fix_streaming_keys(row): new_k = k[len('__@'):] row[new_k] = row.pop(k) + @staticmethod + def _check_objects(row): + if 'objects' not in row: + return + objects = row['objects'] + for k in list(objects.keys()): + if k not in {'bbox', 'ref', 'image_id'}: + objects.pop(k) + bbox = objects['bbox'] + assert len(bbox) == len( + objects['ref']), (f"len(objects['bbox']): {len(bbox)}, len(objects['ref']): {len(objects['ref'])}") + + # check bbox + for box in bbox: + assert len(box) % 2 == 0, f'len(box): {len(box)}' + if len(box) != 4: + continue + if box[0] > box[2]: + box[0], box[2] = box[2], box[0] + if box[1] > box[3]: + box[1], box[3] = box[3], box[1] + def batched_preprocess(self, batched_row: Dict[str, Any], *, strict: bool, ignore_max_length_error: bool) -> Dict[str, Any]: from ...template import MaxLengthError @@ -161,6 +183,7 @@ def batched_preprocess(self, batched_row: Dict[str, Any], *, strict: bool, if isinstance(row, dict): row = [row] for r in row: + self._check_objects(r) self._check_messages(r) self._check_rejected_response(r) self._cast_images(r) @@ -228,6 +251,9 @@ def _new_init(self, schema=None, features=None, *args, **kwargs): 'content': Value(dtype='string', id=None) }] features['images'] = [{'bytes': Value(dtype='binary', id=None), 'path': Value(dtype='string', id=None)}] + features['bbox'] = Sequence(feature=Sequence(feature=Value(dtype='float64'), length=4)) + features['ref'] = Sequence(feature=Value(dtype='string')) + ArrowWriter.__origin_init__(self, schema, features, *args, **kwargs) ArrowWriter.__origin_init__ = ArrowWriter.__init__ diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py index 5c2cfcca0a..2a6fe4b451 100644 --- a/swift/llm/template/template/qwen.py +++ b/swift/llm/template/template/qwen.py @@ -104,10 +104,10 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int assert isinstance(image, str) return [f'Picture {index + 1}: {image}\n'] - def replace_object(self, object_: Dict[str, Any], index: int, inputs: StdTemplateInputs) -> List[Context]: - return [f'{object_["caption"]}'] + def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]: + return [f'{ref}'] - def replace_box(self, object_: Dict[str, Any], index: int, inputs: StdTemplateInputs) -> List[Context]: + def replace_bbox(self, bbox: Dict[str, Any], index: int, inputs: StdTemplateInputs) -> List[Context]: if isinstance(object_['bbox'][0], list): all_objects = '' for sub_object in object_['bbox']: @@ -208,27 +208,14 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int inputs.videos[index] = fetch_video({'video': inputs.videos[index]}).to(torch.uint8) return 
['<|vision_start|><|video_pad|><|vision_end|>'] - def replace_object(self, object_: Dict[str, Any], index: int, inputs: StdTemplateInputs) -> List[Context]: - if object_: - return ['<|object_ref_start|>', object_['caption'], '<|object_ref_end|>'] - else: - return [''] - - def replace_box(self, object_: Dict[str, Any], index: int, inputs: StdTemplateInputs) -> List[Context]: - if object_: - if isinstance(object_['bbox'][0], list): - all_objects = '' - for sub_object in object_['bbox']: - all_objects += (f'<|box_start|>({sub_object[0]},{sub_object[1]}),' - f'({sub_object[2]},{sub_object[3]})<|box_end|>') - return [all_objects] - else: - return [ - f'<|box_start|>({object_["bbox"][0]},{object_["bbox"][1]}),' - f'({object_["bbox"][2]},{object_["bbox"][3]})<|box_end|>' - ] - else: - return [''] + def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]: + return [f'<|object_ref_start|>{ref}<|object_ref_end|>'] + + def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]: + point = [] + for x, y in zip(bbox[::2], bbox[1::2]): + point.append(f'({x},{y})') + return [f'<|box_start|>{",".join(point)}<|box_end|>'] def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded = super()._encode(inputs) From 663aa0b4a77a69a5930e4470cae20e45d9aa8c42 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 26 Jan 2025 23:14:29 +0800 Subject: [PATCH 05/31] update --- swift/llm/dataset/dataset/mllm.py | 2 +- swift/llm/template/template_inputs.py | 19 ++--- swift/llm/template/vision_utils.py | 114 ++++++++------------------ 3 files changed, 38 insertions(+), 97 deletions(-) diff --git a/swift/llm/dataset/dataset/mllm.py b/swift/llm/dataset/dataset/mllm.py index 0584bbe00f..c77e277fce 100644 --- a/swift/llm/dataset/dataset/mllm.py +++ b/swift/llm/dataset/dataset/mllm.py @@ -1102,7 +1102,7 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: row['query'] = 'Task: Object Detection' objects = row['objects'] objects['ref'] = [self.category[c] for c in objects['category']] - row['response'] = ','.join([''] * len(objects['ref'])) + row['response'] = '' * len(objects['ref']) return super().preprocess(row) diff --git a/swift/llm/template/template_inputs.py b/swift/llm/template/template_inputs.py index 81aba5716b..28672d8648 100644 --- a/swift/llm/template/template_inputs.py +++ b/swift/llm/template/template_inputs.py @@ -81,16 +81,7 @@ class TemplateInputs(InferRequest): """ rejected_response: Optional[str] = None label: Optional[bool] = None - objects: Union[str, None, List[Dict[str, Any]]] = None # List[Dict[str, Any]] - - def __post_init__(self): - InferRequest.__post_init__(self) - # Format objects(groundings/refs) to json - if isinstance(self.objects, str): - # reload grounding from str - self.objects = json.loads(self.objects) - elif self.objects is None: - self.objects = [] + objects: Dict[str, List[Any]] = field(default_factory=dict) # List[Dict[str, Any]] @dataclass @@ -106,7 +97,7 @@ class StdTemplateInputs: images: List[Union[str, Image.Image]] = field(default_factory=list) audios: List[str] = field(default_factory=list) videos: List[str] = field(default_factory=list) - objects: List[Dict[str, Any]] = field(default_factory=list) + objects: Dict[str, List[Any]] = field(default_factory=dict) agent_keyword: Optional[Dict[str, str]] = None @@ -114,8 +105,8 @@ def __post_init__(self): self.image_idx = 0 self.audio_idx = 0 self.video_idx = 0 - self.object_idx = 0 - self.box_idx = 0 + self.ref_idx = 0 + 
self.bbox_idx = 0 if self.images and not isinstance(self.images, (list, tuple)): self.images = [self.images] if self.videos and not isinstance(self.videos, (list, tuple)): @@ -144,7 +135,7 @@ def from_dict(cls, inputs: Dict[str, Any], *, tools_prompt: str = 'react_en') -> kwargs[key] = inputs[key] messages = inputs['messages'] tools = inputs.get('tools') - objects = inputs.get('objects') or [] + objects = inputs.get('objects') or {} if messages and messages[0]['role'] == 'system': message = messages.pop(0) diff --git a/swift/llm/template/vision_utils.py b/swift/llm/template/vision_utils.py index bc2d3956c5..213683fada 100644 --- a/swift/llm/template/vision_utils.py +++ b/swift/llm/template/vision_utils.py @@ -100,7 +100,7 @@ def rescale_image(img: Image.Image, max_pixels: int) -> Image.Image: _T = TypeVar('_T') -def load_file(path: Union[str, _T]) -> Union[BytesIO, _T]: +def load_file(path: Union[str, bytes, _T]) -> Union[BytesIO, _T]: res = path if isinstance(path, str): path = path.strip() @@ -128,18 +128,8 @@ def load_file(path: Union[str, _T]) -> Union[BytesIO, _T]: return res -def load_file_decorator(func): - - def new_func(path, *args, **kwargs): - path = load_file(path) - res = func(path, *args, **kwargs) - return res - - return new_func - - -@load_file_decorator -def load_image(image: Union[Image.Image, BytesIO]) -> Image.Image: +def load_image(image: Union[str, bytes, Image.Image]) -> Image.Image: + image = load_file(image) if isinstance(image, BytesIO): image = Image.open(image) if image.mode != 'RGB': @@ -179,9 +169,9 @@ def transform_image(image, input_size=448, max_num=12): return pixel_values -@load_file_decorator -def load_video_internvl(video_io: BytesIO, bound=None, num_segments=32): +def load_video_internvl(video: Union[str, bytes], bound=None, num_segments=32): from decord import VideoReader, cpu + video_io = load_file(video) vr = VideoReader(video_io, ctx=cpu(0), num_threads=1) max_frame = len(vr) - 1 fps = float(vr.get_avg_fps()) @@ -193,20 +183,18 @@ def load_video_internvl(video_io: BytesIO, bound=None, num_segments=32): return images -def draw_plot(img_dir: str, bbox: List[int], bbox_type: str, output_file: str): - image = Image.open(img_dir) - +def draw_plot(image: Image, bbox: List[int], bbox_type: Literal['norm1000', 'norm100', 'norm1', 'none'] = 'norm1000'): objects = [{'bbox': bbox, 'bbox_type': bbox_type, 'image': 0}] normalize_bbox(objects, [image], 'real') bbox = objects[0]['bbox'] draw = ImageDraw.Draw(image) draw.rectangle(bbox, outline='red', width=2) - image.save(output_file) + return draw -@load_file_decorator -def load_video_cogvlm2(video_io: BytesIO) -> np.ndarray: +def load_video_cogvlm2(video: Union[str, bytes]) -> np.ndarray: from decord import cpu, VideoReader, bridge + video_io = load_file(video) bridge.set_bridge('torch') clip_end_sec = 60 clip_start_sec = 0 @@ -222,9 +210,9 @@ def load_video_cogvlm2(video_io: BytesIO) -> np.ndarray: return video_data -@load_file_decorator -def load_video_llava(video_io: BytesIO) -> np.ndarray: +def load_video_llava(video: Union[str, bytes]) -> np.ndarray: import av + video_io = load_file(video) container = av.open(video_io) total_frames = container.streams.video[0].frames num_frames = get_env_args('num_frames', int, 16) @@ -241,8 +229,8 @@ def load_video_llava(video_io: BytesIO) -> np.ndarray: return np.stack([x.to_ndarray(format='rgb24') for x in frames]) -@load_file_decorator -def load_video_minicpmv_mplug_owl3(video_io: BytesIO, max_num_frames): +def load_video_minicpmv_mplug_owl3(video: Union[str, bytes], 
max_num_frames): + from decord import VideoReader, cpu # pip install decord def uniform_sample(_l, _n): @@ -250,6 +238,7 @@ def uniform_sample(_l, _n): idxs = [int(i * gap + gap / 2) for i in range(_n)] return [_l[i] for i in idxs] + video_io = load_file(video) vr = VideoReader(video_io, ctx=cpu(0)) sample_fps = round(vr.get_avg_fps() / 1) # FPS frame_idx = [i for i in range(0, len(vr), sample_fps)] @@ -261,16 +250,16 @@ def uniform_sample(_l, _n): return frames -@load_file_decorator -def load_audio(audio_io: BytesIO, sampling_rate: int): +def load_audio(audio: Union[str, bytes], sampling_rate: int): import librosa + audio_io = load_file(audio) return librosa.load(audio_io, sr=sampling_rate)[0] -@load_file_decorator -def load_video_valley(video_io: BytesIO): +def load_video_valley(video: Union[str, bytes]): import decord from torchvision import transforms + video_io = load_file(video) video_reader = decord.VideoReader(video_io) decord.bridge.set_bridge('torch') video = video_reader.get_batch(np.linspace(0, len(video_reader) - 1, 8).astype(np.int_)).byte() @@ -278,61 +267,22 @@ def load_video_valley(video_io: BytesIO): return images -def normalize_bbox(objects: List[Dict[str, Any]], images: List[Image.Image], to_type: Literal['real', 'norm_1000', - 'norm_1']) -> None: - """Normalize bbox to needed. - to_type support real/norm_1000/norm_1, which literally means the coordinates in real, or normalized by 1000, - or normalized by 1. - - Args: - objects: The objects containing the bbox - images: The images list - to_type: The coordinate type needed by the model. - """ - if not objects or not images: +def normalize_bbox(images: List[Image.Image], + objects: Dict[str, List[Any]], + bbox_type: Literal['norm1000', 'norm100', 'none'] = 'norm1000') -> None: + if not objects or not images or bbox_type == 'none': return - - for object_ in objects: - bbox = object_['bbox'] - bbox_type = object_['bbox_type'] - idx = object_['image'] - image = images[idx] - if bbox_type == 'real': - if to_type == 'real': - continue + bbox_list = objects['bbox'] + ref_list = objects['ref'] + image_id_list = objects.get('image_id') or [] + image_id_list += [0] * (len(ref_list) - len(image_id_list)) + for bbox, ref, image_id in zip(bbox_list, ref_list, image_id_list): + image = images[image_id] + if bbox_type == 'norm1000': width, height = image.width, image.height - if isinstance(bbox[0], list): - bboxes = [] - for _box in bbox: - bboxes.append([ - int(coord / dim * 999) if to_type == 'norm_1000' else coord / dim - for coord, dim in zip(_box, [width, height, width, height]) - ]) - object_['bbox'] = bboxes - else: - object_['bbox'] = [ - int(coord / dim * 999) if to_type == 'norm_1000' else coord / dim - for coord, dim in zip(bbox, [width, height, width, height]) - ] - object_['bbox_type'] = to_type - elif bbox_type == 'norm_1000': - if to_type == 'norm_1000': - continue - if to_type == 'norm_1': - object_['bbox'] = [coord / 999. for coord in bbox] - elif to_type == 'real': - width, height = image.width, image.height - object_['bbox'] = [int(coord / 999. 
* dim) for coord, dim in zip(bbox, [width, height, width, height])] - object_['bbox_type'] = to_type - elif bbox_type == 'norm_1': - if to_type == 'norm_1': - continue - if to_type == 'norm_1000': - object_['bbox'] = [int(coord * 999) for coord in bbox] - elif to_type == 'real': - width, height = image.width, image.height - object_['bbox'] = [int(coord * dim) for coord, dim in zip(bbox, [width, height, width, height])] - object_['bbox_type'] = to_type + for i, (x, y) in enumerate(zip(bbox[::2], bbox[1::2])): + bbox[2 * i] = int(x / width * 1000) + bbox[2 * i + 1] = int(y / height * 1000) if __name__ == '__main__': From 8054048d78cfde11c2d1a0acb6680ee75693d297 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 26 Jan 2025 23:18:11 +0800 Subject: [PATCH 06/31] support qwen2_5 vl --- ...36\213\345\222\214\346\225\260\346\215\256\351\233\206.md" | 2 ++ docs/source_en/Instruction/Supported-models-and-datasets.md | 2 ++ swift/llm/__init__.py | 4 ++-- swift/llm/model/model/qwen.py | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 5ea3e6b10f..5d7aa4e5fd 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -521,6 +521,8 @@ |[bytedance-research/UI-TARS-7B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-7B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-7B-DPO](https://huggingface.co/bytedance-research/UI-TARS-7B-DPO)| |[bytedance-research/UI-TARS-72B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-SFT](https://huggingface.co/bytedance-research/UI-TARS-72B-SFT)| |[bytedance-research/UI-TARS-72B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-DPO](https://huggingface.co/bytedance-research/UI-TARS-72B-DPO)| +|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)| +|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| 
|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 1a4eb80593..3d2833321a 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -521,6 +521,8 @@ The table below introduces the models integrated with ms-swift: |[bytedance-research/UI-TARS-7B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-7B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-7B-DPO](https://huggingface.co/bytedance-research/UI-TARS-7B-DPO)| |[bytedance-research/UI-TARS-72B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-SFT](https://huggingface.co/bytedance-research/UI-TARS-72B-SFT)| |[bytedance-research/UI-TARS-72B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-DPO](https://huggingface.co/bytedance-research/UI-TARS-72B-DPO)| +|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)| +|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py index 99bd05d605..ca8921b402 100644 --- a/swift/llm/__init__.py +++ b/swift/llm/__init__.py @@ -26,7 +26,7 @@ LazyLLMDataset, ConstantLengthDataset, standard_keys, load_dataset, DATASET_TYPE, sample_dataset, RowPreprocessor, DatasetMeta) from .utils import (deep_getattr, to_device, History, Messages, history_to_messages, messages_to_history, Processor, - save_checkpoint, ProcessorMixin, get_temporary_cache_files_directory) + save_checkpoint, ProcessorMixin, get_temporary_cache_files_directory, get_cache_dir) from .base import SwiftPipeline else: _extra_objects = {k: v for k, v in globals().items() if not k.startswith('_')} @@ -76,7 +76,7 @@ ], 'utils': [ 'deep_getattr', 'to_device', 'History', 'Messages', 'history_to_messages', 'messages_to_history', - 'Processor', 'save_checkpoint', 'ProcessorMixin', 'get_temporary_cache_files_directory' + 
'Processor', 'save_checkpoint', 'ProcessorMixin', 'get_temporary_cache_files_directory', 'get_cache_dir' ], 'base': ['SwiftPipeline'], } diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py index fb59834f8d..53a2dbd4a1 100644 --- a/swift/llm/model/model/qwen.py +++ b/swift/llm/model/model/qwen.py @@ -559,6 +559,10 @@ def get_model_tokenizer_qwen2_vl(model_dir: str, Model('bytedance-research/UI-TARS-7B-DPO', 'bytedance-research/UI-TARS-7B-DPO'), Model('bytedance-research/UI-TARS-72B-SFT', 'bytedance-research/UI-TARS-72B-SFT'), Model('bytedance-research/UI-TARS-72B-DPO', 'bytedance-research/UI-TARS-72B-DPO'), + ]), + ModelGroup([ + Model('Qwen/Qwen2.5-VL-3B-Instruct', 'Qwen/Qwen2.5-VL-3B-Instruct'), + Model('Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2.5-VL-7B-Instruct'), ]) ], TemplateType.qwen2_vl, From 8704c2f9e8bd7793620ca71f5679a89ddd23be85 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 27 Jan 2025 14:34:37 +0800 Subject: [PATCH 07/31] support qwen2_5_vl --- README.md | 2 +- README_CN.md | 2 +- ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- ...14\346\225\260\346\215\256\351\233\206.md" | 4 +-- .../Instruction/Command-line-parameters.md | 2 +- .../Supported-models-and-datasets.md | 4 +-- swift/llm/model/constant.py | 1 + swift/llm/model/model/qwen.py | 34 +++++++++++++------ tests/test_align/test_template/test_vision.py | 23 ++++++++++--- 9 files changed, 52 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index fa28aa1fca..fbe8f3bba7 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ You can contact us and communicate with us by adding our group: ## 📝 Introduction -🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 450+ large models and 150+ multi-modal large models. These large language models (LLMs) include models such as Qwen2.5, InternLM3, GLM4, Llama3.3, Mistral, DeepSeek3, Yi1.5, TeleChat2, Baichuan2, and Gemma2. The multi-modal LLMs include models such as Qwen2-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2. +🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 450+ large models and 150+ multi-modal large models. These large language models (LLMs) include models such as Qwen2.5, InternLM3, GLM4, Llama3.3, Mistral, DeepSeek3, Yi1.5, TeleChat2, Baichuan2, and Gemma2. The multi-modal LLMs include models such as Qwen2.5-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2. 🍔 In addition, ms-swift gathers the latest training technologies, including LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger. ms-swift supports acceleration of inference, evaluation, and deployment modules using vLLM and LMDeploy, and supports the quantization of large models and multi-modal large models using technologies such as GPTQ, AWQ, and BNB. 
To help researchers and developers fine-tune and apply large models more easily, ms-swift also provides a Gradio-based Web-UI interface and a wealth of best practices. diff --git a/README_CN.md b/README_CN.md index cb23522da2..7bf3d068f8 100644 --- a/README_CN.md +++ b/README_CN.md @@ -53,7 +53,7 @@ | ## 📝 简介 -🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持450+大模型与150+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen2.5、InternLM3、GLM4、Llama3.3、Mistral、DeepSeek3、Yi1.5、TeleChat2、Baichuan2、Gemma2等模型,多模态大模型包括:Qwen2-VL、Qwen2-Audio、Llama3.2-Vision、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。 +🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持450+大模型与150+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen2.5、InternLM3、GLM4、Llama3.3、Mistral、DeepSeek3、Yi1.5、TeleChat2、Baichuan2、Gemma2等模型,多模态大模型包括:Qwen2.5-VL、Qwen2-Audio、Llama3.2-Vision、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。 🍔 除此之外,ms-swift汇集了最新的训练技术,包括LoRA、QLoRA、Llama-Pro、LongLoRA、GaLore、Q-GaLore、LoRA+、LISA、DoRA、FourierFt、ReFT、UnSloth、和Liger等。ms-swift支持使用vLLM和LMDeploy对推理、评测和部署模块进行加速,并支持使用GPTQ、AWQ、BNB等技术对大模型和多模态大模型进行量化。为了帮助研究者和开发者更轻松地微调和应用大模型,ms-swift还提供了基于Gradio的Web-UI界面及丰富的最佳实践。 diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 3507387c3f..0e05cd0b1b 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -452,7 +452,7 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数) ## 特定模型参数 特定模型参数可以通过`--model_kwargs`或者环境变量进行设置,例如: `--model_kwargs '{"fps_max_frames": 12}'`或者`FPS_MAX_FRAMES=12` -### qwen2_vl, qvq +### qwen2_vl, qvq, qwen2_5_vl 参数含义可以查看[这里](https://github.com/QwenLM/Qwen2-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24) - IMAGE_FACTOR: 默认为28 diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 5d7aa4e5fd..559fc06b4f 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -521,8 +521,8 @@ |[bytedance-research/UI-TARS-7B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-7B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-7B-DPO](https://huggingface.co/bytedance-research/UI-TARS-7B-DPO)| |[bytedance-research/UI-TARS-72B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-SFT](https://huggingface.co/bytedance-research/UI-TARS-72B-SFT)| |[bytedance-research/UI-TARS-72B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-DPO](https://huggingface.co/bytedance-research/UI-TARS-72B-DPO)| 
-|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)| -|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)| +|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_5_vl|qwen2_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)| +|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_5_vl|qwen2_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index b6069ee07a..37eeb49f91 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -455,7 +455,7 @@ Export Arguments include the [basic arguments](#base-arguments) and [merge argum Specific model arguments can be set using `--model_kwargs` or environment variables, for example: `--model_kwargs '{"fps_max_frames": 12}'` or `FPS_MAX_FRAMES=12`. 
-### qwen2_vl, qvq +### qwen2_vl, qvq, qwen2_5_vl For the meaning of the arguments, please refer to [here](https://github.com/QwenLM/Qwen2-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24) - IMAGE_FACTOR: Default is 28 diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 3d2833321a..351361c3c2 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -521,8 +521,8 @@ The table below introduces the models integrated with ms-swift: |[bytedance-research/UI-TARS-7B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-7B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-7B-DPO](https://huggingface.co/bytedance-research/UI-TARS-7B-DPO)| |[bytedance-research/UI-TARS-72B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-SFT](https://huggingface.co/bytedance-research/UI-TARS-72B-SFT)| |[bytedance-research/UI-TARS-72B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[bytedance-research/UI-TARS-72B-DPO](https://huggingface.co/bytedance-research/UI-TARS-72B-DPO)| -|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)| -|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)| +|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_5_vl|qwen2_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)| +|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_5_vl|qwen2_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index 25d058cdb2..0d451ebdbe 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -120,6 +120,7 @@ class MLLMModelType: qwen_vl = 'qwen_vl' qwen_audio = 'qwen_audio' qwen2_vl = 'qwen2_vl' + qwen2_5_vl = 'qwen2_5_vl' qwen2_audio = 'qwen2_audio' qvq = 'qvq' ovis1_6 = 'ovis1_6' diff 
--git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py index 53a2dbd4a1..6b5e3ec516 100644 --- a/swift/llm/model/model/qwen.py +++ b/swift/llm/model/model/qwen.py @@ -510,14 +510,10 @@ def _new_read_video_decord(ele: dict): vision_process._patch = True -def get_model_tokenizer_qwen2_vl(model_dir: str, - model_info: ModelInfo, - model_kwargs: Dict[str, Any], - load_model: bool = True, - **kwargs): +def get_model_tokenizer_qwen2_vl(*args, **kwargs): from transformers import Qwen2VLForConditionalGeneration kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2VLForConditionalGeneration - model, tokenizer = get_model_tokenizer_multimodal(model_dir, model_info, model_kwargs, load_model, **kwargs) + model, tokenizer = get_model_tokenizer_multimodal(*args, **kwargs) if model is not None and hasattr(model.model, 'embed_tokens'): patch_output_clone(model.model.embed_tokens) patch_output_to_input_device(model.model.embed_tokens) @@ -560,10 +556,6 @@ def get_model_tokenizer_qwen2_vl(model_dir: str, Model('bytedance-research/UI-TARS-72B-SFT', 'bytedance-research/UI-TARS-72B-SFT'), Model('bytedance-research/UI-TARS-72B-DPO', 'bytedance-research/UI-TARS-72B-DPO'), ]), - ModelGroup([ - Model('Qwen/Qwen2.5-VL-3B-Instruct', 'Qwen/Qwen2.5-VL-3B-Instruct'), - Model('Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2.5-VL-7B-Instruct'), - ]) ], TemplateType.qwen2_vl, get_model_tokenizer_qwen2_vl, @@ -587,6 +579,28 @@ def get_model_tokenizer_qwen2_vl(model_dir: str, tags=['vision', 'video'])) +def get_model_tokenizer_qwen2_5_vl(*args, **kwargs): + from transformers import Qwen2_5_VLForConditionalGeneration + kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2_5_VLForConditionalGeneration + return get_model_tokenizer_qwen2_vl(*args, **kwargs) + + +register_model( + ModelMeta( + MLLMModelType.qwen2_5_vl, [ + ModelGroup([ + Model('Qwen/Qwen2.5-VL-3B-Instruct', 'Qwen/Qwen2.5-VL-3B-Instruct'), + Model('Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2.5-VL-7B-Instruct'), + ]) + ], + TemplateType.qwen2_vl, + get_model_tokenizer_qwen2_5_vl, + model_arch=ModelArch.qwen2_vl, + architectures=['Qwen2_5_VLForConditionalGeneration'], + requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'pyav', 'decord'], + tags=['vision', 'video'])) + + def get_model_tokenizer_qwen2_audio(*args, **kwargs): from transformers import Qwen2AudioForConditionalGeneration kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2AudioForConditionalGeneration diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py index 5bc8cf7e90..b4f1b9477a 100644 --- a/tests/test_align/test_template/test_vision.py +++ b/tests/test_align/test_template/test_vision.py @@ -36,6 +36,20 @@ def test_qwen2_vl(): assert response == response2 == '这是一只小猫的图片。它有黑白相间的毛发,眼睛大而圆,显得非常可爱。' +def test_qwen2_5_vl(): + pt_engine = PtEngine('Qwen/Qwen2.5-VL-7B-Instruct') + messages = [{'role': 'user', 'content': 'What kind of dog is this?'}] + images = ['https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg'] + response = _infer_model(pt_engine, messages=messages, images=images) + pt_engine.default_template.template_backend = 'jinja' + response2 = _infer_model(pt_engine, messages=messages, images=images) + assert response == response2 == ( + 'The dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and ' + 'energetic nature, which is evident in the image where the dog seems to be interacting playfully with ' + 'the person. 
The breed is characterized by its thick, water-repellent coat, which can come in various ' + 'colors including yellow, black, and chocolate.') + + def test_qvq(): pt_engine = PtEngine('Qwen/QVQ-72B-Preview') response = _infer_model(pt_engine) @@ -364,8 +378,8 @@ def test_valley(): def test_ui_tars(): os.environ['MAX_PIXELS'] = str(1280 * 28 * 28) pt_engine = PtEngine('bytedance-research/UI-TARS-2B-SFT') - prompt = r"""You are a GUI agent. You are given a task and your action history, with screenshots. \ -You need to perform the next action to complete the task. + prompt = ('You are a GUI agent. You are given a task and your action history, with screenshots. ' + 'You need to perform the next action to complete the task.' + r""" ## Output Format ```\nThought: ... @@ -391,7 +405,7 @@ def test_ui_tars(): - Summarize your next action (with its target element) in one sentence in `Thought` part. ## User Instruction -""" +""") instruction = "I'm looking for a software to \"edit my photo with grounding\"" messages = [ { @@ -417,6 +431,7 @@ def test_ui_tars(): logger = get_logger() # test_qwen2_vl() + test_qwen2_5_vl() # test_internvl2() # test_internvl2_phi3() # test_llava() @@ -451,4 +466,4 @@ def test_ui_tars(): # test_doc_owl2() # test_minicpmo() # test_valley() - test_ui_tars() + # test_ui_tars() From 5d82f1f3e2dd183f9aa81188884b2b96adbe2510 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 27 Jan 2025 14:38:10 +0800 Subject: [PATCH 08/31] update --- ...11\346\225\260\346\215\256\351\233\206.md" | 2 +- ...14\346\225\260\346\215\256\351\233\206.md" | 1 + .../Supported-models-and-datasets.md | 1 + examples/infer/demo_grounding.py | 35 +++++++++++++++++++ swift/llm/argument/base_args/template_args.py | 1 + swift/llm/template/base.py | 34 +++++++++--------- swift/llm/template/register.py | 1 + swift/llm/template/template/internvl.py | 11 ++---- swift/llm/template/template/microsoft.py | 2 +- swift/llm/template/vision_utils.py | 2 +- 10 files changed, 62 insertions(+), 28 deletions(-) create mode 100644 examples/infer/demo_grounding.py diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" index 9c63c6f080..09525cdf8a 100644 --- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" @@ -127,7 +127,7 @@ RLHF的数据格式可以参考纯文本大模型的格式。 该格式比通用格式多了objects字段,该字段包含的字段有: - caption bbox对应的物体描述 - bbox 坐标 建议给四个整数(而非float型),分别是x_min,y_min,x_max,y_max四个值 - - bbox_type: bbox类型 目前支持三种:real/norm_1000/norm_1,分别代表实际像素值坐标/千分位比例坐标/归一化比例坐标 + - bbox_type: bbox类型 目前支持三种:norm1000/,分别代表实际像素值坐标/千分位比例坐标/归一化比例坐标 - image: bbox对应的图片是第几张, 索引从0开始 ### 文生图格式 diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 559fc06b4f..88ff94a739 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -713,6 +713,7 @@ 
|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh)|default|48818|157.2±93.2, min=27, max=544|chat, general, 🔥|[llm-wizard/alpaca-gpt4-data-zh](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data-zh)| |[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2)|default|10000|175.4±59.1, min=35, max=563|chat, math, 🔥|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)| |[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images)|default|8000|47.0±0.0, min=47, max=47|chat, multi-modal, vision|-| +|[AI-ModelScope/coco](https://modelscope.cn/datasets/AI-ModelScope/coco)|default|huge dataset|-|multi-modal, en, vqa, quality|[detection-datasets/coco](https://huggingface.co/datasets/detection-datasets/coco)| |[AI-ModelScope/databricks-dolly-15k](https://modelscope.cn/datasets/AI-ModelScope/databricks-dolly-15k)|default|15011|199.0±268.8, min=26, max=5987|multi-task, en, quality|[databricks/databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)| |[AI-ModelScope/deepctrl-sft-data](https://modelscope.cn/datasets/AI-ModelScope/deepctrl-sft-data)|default
en|huge dataset|-|chat, general, sft, multi-round|-| |[AI-ModelScope/egoschema](https://modelscope.cn/datasets/AI-ModelScope/egoschema)|default
cls|101|191.6±80.7, min=96, max=435|chat, multi-modal, video|[lmms-lab/egoschema](https://huggingface.co/datasets/lmms-lab/egoschema)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 351361c3c2..a2454cb55d 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -714,6 +714,7 @@ The table below introduces information about the datasets integrated with ms-swi |[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh)|default|48818|157.2±93.2, min=27, max=544|chat, general, 🔥|[llm-wizard/alpaca-gpt4-data-zh](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data-zh)| |[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2)|default|10000|175.4±59.1, min=35, max=563|chat, math, 🔥|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)| |[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images)|default|8000|47.0±0.0, min=47, max=47|chat, multi-modal, vision|-| +|[AI-ModelScope/coco](https://modelscope.cn/datasets/AI-ModelScope/coco)|default|huge dataset|-|multi-modal, en, vqa, quality|[detection-datasets/coco](https://huggingface.co/datasets/detection-datasets/coco)| |[AI-ModelScope/databricks-dolly-15k](https://modelscope.cn/datasets/AI-ModelScope/databricks-dolly-15k)|default|15011|199.0±268.8, min=26, max=5987|multi-task, en, quality|[databricks/databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)| |[AI-ModelScope/deepctrl-sft-data](https://modelscope.cn/datasets/AI-ModelScope/deepctrl-sft-data)|default
en|huge dataset|-|chat, general, sft, multi-round|-| |[AI-ModelScope/egoschema](https://modelscope.cn/datasets/AI-ModelScope/egoschema)|default
cls|101|191.6±80.7, min=96, max=435|chat, multi-modal, video|[lmms-lab/egoschema](https://huggingface.co/datasets/lmms-lab/egoschema)| diff --git a/examples/infer/demo_grounding.py b/examples/infer/demo_grounding.py new file mode 100644 index 0000000000..74f113ed03 --- /dev/null +++ b/examples/infer/demo_grounding.py @@ -0,0 +1,35 @@ +import os +from typing import Literal +from swift.llm import load_image +import re +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + + +def draw_bbox(image, response): + matchs = re.findall(r'<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>', response) + + +def infer_grounding(): + from swift.llm import (PtEngine, RequestConfig, AdapterRequest, get_template, BaseArguments, InferRequest, + safe_snapshot_download, get_model_tokenizer) + from swift.tuners import Swift + image = load_image('http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png') + infer_request = InferRequest(messages=[{'role': 'user', 'content': 'Task: Object Detection'}], + images=[image]) + + request_config = RequestConfig(max_tokens=512, temperature=0) + adapter_path = safe_snapshot_download('/mnt/nas2/huangjintao.hjt/work/llmscope/output/v92-20250126-173609/checkpoint-1237') + args = BaseArguments.from_pretrained(adapter_path) + + engine = PtEngine(args.model, adapters=[adapter_path]) + resp_list = engine.infer([infer_request], request_config) + response = resp_list[0].choices[0].message.content + print(f'lora-response: {response}') + + new_image = draw_bbox(image, response) + new_image.save('animal_bbox.png') + + +if __name__ == '__main__': + infer_grounding() diff --git a/swift/llm/argument/base_args/template_args.py b/swift/llm/argument/base_args/template_args.py index 23a6c79278..b0d7db6555 100644 --- a/swift/llm/argument/base_args/template_args.py +++ b/swift/llm/argument/base_args/template_args.py @@ -36,6 +36,7 @@ class TemplateArguments: truncation_strategy: Literal['delete', 'left', 'right'] = 'delete' max_pixels: Optional[int] = None tools_prompt: str = 'react_en' # Override the default_tools_prompt in the template. + bbox_type: Literal['norm1000', 'norm100', 'none'] = 'norm1000' # train padding_side: Literal['left', 'right'] = 'right' loss_scale: str = 'default' diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index 9998085c07..e557b4c158 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -36,7 +36,6 @@ class MaxLengthError(ValueError): class Template(ProcessorMixin): special_tokens = ['', '