cydxg committed on
Commit
a2fedce
1 Parent(s): 1d868b1

Upload extra needed files


You can download gradio_helper.py and ov_qwen2_vl.py to avoid errors such as the GBK encoding error. qwen2vl.ipynb is the main file: it loads the model and builds the interface.
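A minimal sketch of how these pieces are expected to fit together in qwen2vl.ipynb (the notebook itself is not rendered here, so the model id, output directory, and compression settings below are illustrative assumptions rather than the notebook's exact cells):

import nncf
from transformers import AutoProcessor
from ov_qwen2_vl import convert_qwen2vl_model, OVQwen2VLModel
from gradio_helper import make_demo

model_id = "Qwen/Qwen2-VL-2B-Instruct"  # assumption: either supported checkpoint works
out_dir = "qwen2-vl-2b-instruct-ov"     # hypothetical output directory
# optional INT4 weight compression, passed through to nncf.compress_weights during conversion
compression = {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}

convert_qwen2vl_model(model_id, out_dir, compression)  # exports the OpenVINO IR files
model = OVQwen2VLModel(out_dir, "CPU")                 # device name could also come from notebook_utils.device_widget()
processor = AutoProcessor.from_pretrained(out_dir)
demo = make_demo(model, processor)
demo.launch()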

Files changed (5)
  1. gradio_helper.py +205 -0
  2. notebook_utils.py +715 -0
  3. ov_qwen2_vl.py +792 -0
  4. qwen2-build.py +43 -0
  5. qwen2vl.ipynb +0 -0
gradio_helper.py ADDED
@@ -0,0 +1,205 @@
1
+ import gradio as gr
2
+ import copy
3
+ import re
4
+ from threading import Thread
5
+ from transformers import TextIteratorStreamer
6
+ from qwen_vl_utils import process_vision_info
7
+
8
+
9
+ def _parse_text(text):
10
+ lines = text.split("\n")
11
+ lines = [line for line in lines if line != ""]
12
+ count = 0
13
+ for i, line in enumerate(lines):
14
+ if "```" in line:
15
+ count += 1
16
+ items = line.split("`")
17
+ if count % 2 == 1:
18
+ lines[i] = f'<pre><code class="language-{items[-1]}">'
19
+ else:
20
+ lines[i] = "<br></code></pre>"
21
+ else:
22
+ if i > 0:
23
+ if count % 2 == 1:
24
+ line = line.replace("`", r"\`")
25
+ line = line.replace("<", "&lt;")
26
+ line = line.replace(">", "&gt;")
27
+ line = line.replace(" ", "&nbsp;")
28
+ line = line.replace("*", "&ast;")
29
+ line = line.replace("_", "&lowbar;")
30
+ line = line.replace("-", "&#45;")
31
+ line = line.replace(".", "&#46;")
32
+ line = line.replace("!", "&#33;")
33
+ line = line.replace("(", "&#40;")
34
+ line = line.replace(")", "&#41;")
35
+ line = line.replace("$", "&#36;")
36
+ lines[i] = "<br>" + line
37
+ text = "".join(lines)
38
+ return text
39
+
40
+
41
+ def _remove_image_special(text):
42
+ text = text.replace("<ref>", "").replace("</ref>", "")
43
+ return re.sub(r"<box>.*?(</box>|$)", "", text)
44
+
45
+
46
+ def is_video_file(filename):
47
+ video_extensions = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg"]
48
+ return any(filename.lower().endswith(ext) for ext in video_extensions)
49
+
50
+
51
+ def transform_messages(original_messages):
52
+ transformed_messages = []
53
+ for message in original_messages:
54
+ new_content = []
55
+ for item in message["content"]:
56
+ if "image" in item:
57
+ new_item = {"type": "image", "image": item["image"]}
58
+ elif "text" in item:
59
+ new_item = {"type": "text", "text": item["text"]}
60
+ elif "video" in item:
61
+ new_item = {"type": "video", "video": item["video"]}
62
+ else:
63
+ continue
64
+ new_content.append(new_item)
65
+
66
+ new_message = {"role": message["role"], "content": new_content}
67
+ transformed_messages.append(new_message)
68
+
69
+ return transformed_messages
70
+
71
+
72
+ def make_demo(model, processor):
73
+ def call_local_model(model, processor, messages):
74
+ messages = transform_messages(messages)
75
+
76
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
77
+ image_inputs, video_inputs = process_vision_info(messages)
78
+ inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(model.device)
79
+
80
+ tokenizer = processor.tokenizer
81
+ streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
82
+
83
+ gen_kwargs = {"max_new_tokens": 512, "streamer": streamer, **inputs}
84
+
85
+ thread = Thread(target=model.generate, kwargs=gen_kwargs)
86
+ thread.start()
87
+
88
+ generated_text = ""
89
+ for new_text in streamer:
90
+ generated_text += new_text
91
+ yield generated_text
92
+
93
+ def create_predict_fn():
94
+ def predict(_chatbot, task_history):
95
+ chat_query = _chatbot[-1][0]
96
+ query = task_history[-1][0]
97
+ if len(chat_query) == 0:
98
+ _chatbot.pop()
99
+ task_history.pop()
100
+ return _chatbot
101
+ print("User: " + _parse_text(query))
102
+ history_cp = copy.deepcopy(task_history)
103
+ full_response = ""
104
+ messages = []
105
+ content = []
106
+ for q, a in history_cp:
107
+ if isinstance(q, (tuple, list)):
108
+ if is_video_file(q[0]):
109
+ content.append({"video": f"file://{q[0]}"})
110
+ else:
111
+ content.append({"image": f"file://{q[0]}"})
112
+ else:
113
+ content.append({"text": q})
114
+ messages.append({"role": "user", "content": content})
115
+ messages.append({"role": "assistant", "content": [{"text": a}]})
116
+ content = []
117
+ messages.pop()
118
+
119
+ for response in call_local_model(model, processor, messages):
120
+ _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
121
+
122
+ yield _chatbot
123
+ full_response = _parse_text(response)
124
+
125
+ task_history[-1] = (query, full_response)
126
+ print("Qwen-VL-Chat: " + _parse_text(full_response))
127
+ yield _chatbot
128
+
129
+ return predict
130
+
131
+ def create_regenerate_fn():
132
+ def regenerate(_chatbot, task_history):
133
+ if not task_history:
134
+ return _chatbot
135
+ item = task_history[-1]
136
+ if item[1] is None:
137
+ return _chatbot
138
+ task_history[-1] = (item[0], None)
139
+ chatbot_item = _chatbot.pop(-1)
140
+ if chatbot_item[0] is None:
141
+ _chatbot[-1] = (_chatbot[-1][0], None)
142
+ else:
143
+ _chatbot.append((chatbot_item[0], None))
144
+ _chatbot_gen = predict(_chatbot, task_history)
145
+ for _chatbot in _chatbot_gen:
146
+ yield _chatbot
147
+
148
+ return regenerate
149
+
150
+ predict = create_predict_fn()
151
+ regenerate = create_regenerate_fn()
152
+
153
+ def add_text(history, task_history, text):
154
+ task_text = text
155
+ history = history if history is not None else []
156
+ task_history = task_history if task_history is not None else []
157
+ history = history + [(_parse_text(text), None)]
158
+ task_history = task_history + [(task_text, None)]
159
+ return history, task_history, ""
160
+
161
+ def add_file(history, task_history, file):
162
+ history = history if history is not None else []
163
+ task_history = task_history if task_history is not None else []
164
+ history = history + [((file.name,), None)]
165
+ task_history = task_history + [((file.name,), None)]
166
+ return history, task_history
167
+
168
+ def reset_user_input():
169
+ return gr.update(value="")
170
+
171
+ def reset_state(task_history):
172
+ task_history.clear()
173
+ return []
174
+
175
+ with gr.Blocks() as demo:
176
+ gr.Markdown("""<center><font size=8>Qwen2-VL OpenVINO demo</center>""")
177
+
178
+ chatbot = gr.Chatbot(label="Qwen2-VL", elem_classes="control-height", height=500)
179
+ query = gr.Textbox(lines=2, label="Input")
180
+ task_history = gr.State([])
181
+
182
+ with gr.Row():
183
+ addfile_btn = gr.UploadButton("📁 Upload (上传文件)", file_types=["image", "video"])
184
+ submit_btn = gr.Button("🚀 Submit (发送)")
185
+ regen_btn = gr.Button("🤔️ Regenerate (重试)")
186
+ empty_bin = gr.Button("🧹 Clear History (清除历史)")
187
+
188
+ submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
189
+ predict, [chatbot, task_history], [chatbot], show_progress=True
190
+ )
191
+ submit_btn.click(reset_user_input, [], [query])
192
+ empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
193
+ regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
194
+ addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
195
+
196
+ gr.Markdown(
197
+ """\
198
+ <font size=2>Note: This demo is governed by the original license of Qwen2-VL. \
199
+ We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \
200
+ including hate speech, violence, pornography, deception, etc. \
201
+ (注:本演示受Qwen2-VL的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\
202
+ 包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)"""
203
+ )
204
+
205
+ return demo
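For reference, a usage sketch (variable names are illustrative): make_demo only requires an object exposing .generate() and .device (OVQwen2VLModel from ov_qwen2_vl.py provides both) plus a Hugging Face processor with apply_chat_template and a .tokenizer attribute; the qwen_vl_utils package must also be installed for process_vision_info.

demo = make_demo(model, processor)  # model/processor prepared as in qwen2vl.ipynb
demo.launch()                       # serves the Gradio Blocks app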
notebook_utils.py ADDED
@@ -0,0 +1,715 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[ ]:
5
+
6
+
7
+ import os
8
+ import platform
9
+ import sys
10
+ import threading
11
+ import time
12
+ import urllib.parse
13
+ from os import PathLike
14
+ from pathlib import Path
15
+ from typing import List, NamedTuple, Optional, Tuple
16
+
17
+ import numpy as np
18
+ from openvino.runtime import Core, Type, get_version
19
+ from IPython.display import HTML, Image, display
20
+
21
+ import openvino as ov
22
+ from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
23
+ from openvino.runtime import opset10 as ops
24
+
25
+
26
+ # ## Files
27
+ #
28
+ # Load an image, download a file, download an IR model, and create a progress bar to show download progress.
29
+
30
+ # In[ ]:
31
+
32
+
33
+ def device_widget(default="AUTO", exclude=None, added=None):
34
+ import openvino as ov
35
+ import ipywidgets as widgets
36
+
37
+ core = ov.Core()
38
+
39
+ supported_devices = core.available_devices + ["AUTO"]
40
+ exclude = exclude or []
41
+ if exclude:
42
+ for ex_device in exclude:
43
+ if ex_device in supported_devices:
44
+ supported_devices.remove(ex_device)
45
+
46
+ added = added or []
47
+ if added:
48
+ for add_device in added:
49
+ if add_device not in supported_devices:
50
+ supported_devices.append(add_device)
51
+
52
+ device = widgets.Dropdown(
53
+ options=supported_devices,
54
+ value=default,
55
+ description="Device:",
56
+ disabled=False,
57
+ )
58
+ return device
59
+
60
+
61
+ def quantization_widget(default=True):
62
+ import ipywidgets as widgets
63
+
64
+ to_quantize = widgets.Checkbox(
65
+ value=default,
66
+ description="Quantization",
67
+ disabled=False,
68
+ )
69
+
70
+ return to_quantize
71
+
72
+
73
+ def pip_install(*args):
74
+ import subprocess # nosec - disable B404:import-subprocess check
75
+
76
+ cli_args = []
77
+ for arg in args:
78
+ cli_args.extend(str(arg).split(" "))
79
+ subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], shell=(platform.system() == "Windows"), check=True)
80
+
81
+
82
+ def load_image(path: str) -> np.ndarray:
83
+ """
84
+ Loads an image from `path` and returns it as BGR numpy array. `path`
85
+ should point to an image file, either a local filename or a url. The image is
86
+ not stored to the filesystem. Use the `download_file` function to download and
87
+ store an image.
88
+
89
+ :param path: Local path name or URL to image.
90
+ :return: image as BGR numpy array
91
+ """
92
+ import cv2
93
+ import requests
94
+
95
+ if path.startswith("http"):
96
+ # Set User-Agent to Mozilla because some websites block
97
+ # requests with User-Agent Python
98
+ response = requests.get(path, headers={"User-Agent": "Mozilla/5.0"})
99
+ array = np.asarray(bytearray(response.content), dtype="uint8")
100
+ image = cv2.imdecode(array, -1) # Loads the image as BGR
101
+ else:
102
+ image = cv2.imread(path)
103
+ return image
104
+
105
+
106
+ def download_file(
107
+ url: PathLike,
108
+ filename: PathLike = None,
109
+ directory: PathLike = None,
110
+ show_progress: bool = True,
111
+ silent: bool = False,
112
+ timeout: int = 10,
113
+ ) -> PathLike:
114
+ """
115
+ Download a file from a url and save it to the local filesystem. The file is saved to the
116
+ current directory by default, or to `directory` if specified. If a filename is not given,
117
+ the filename of the URL will be used.
118
+
119
+ :param url: URL that points to the file to download
120
+ :param filename: Name of the local file to save. Should point to the name of the file only,
121
+ not the full path. If None the filename from the url will be used
122
+ :param directory: Directory to save the file to. Will be created if it doesn't exist
123
+ If None the file will be saved to the current working directory
124
+ :param show_progress: If True, show a TQDM ProgressBar
125
+ :param silent: If True, do not print a message if the file already exists
126
+ :param timeout: Number of seconds before cancelling the connection attempt
127
+ :return: path to downloaded file
128
+ """
129
+ from tqdm.notebook import tqdm_notebook
130
+ import requests
131
+
132
+ filename = filename or Path(urllib.parse.urlparse(url).path).name
133
+ chunk_size = 16384 # make chunks bigger so that not too many updates are triggered for Jupyter front-end
134
+
135
+ filename = Path(filename)
136
+ if len(filename.parts) > 1:
137
+ raise ValueError(
138
+ "`filename` should refer to the name of the file, excluding the directory. "
139
+ "Use the `directory` parameter to specify a target directory for the downloaded file."
140
+ )
141
+
142
+ # create the directory if it does not exist, and add the directory to the filename
143
+ if directory is not None:
144
+ directory = Path(directory)
145
+ directory.mkdir(parents=True, exist_ok=True)
146
+ filename = directory / Path(filename)
147
+
148
+ try:
149
+ response = requests.get(url=url, headers={"User-agent": "Mozilla/5.0"}, stream=True, timeout=timeout)
150
+ response.raise_for_status()
151
+ except (
152
+ requests.exceptions.HTTPError
153
+ ) as error: # For errors associated with non-200 status codes. Will output something like: "404 Client Error: Not Found for url: {url}"
154
+ raise Exception(error) from None
155
+ except requests.exceptions.Timeout:
156
+ raise Exception(
157
+ "Connection timed out. If you access the internet through a proxy server, please "
158
+ "make sure the proxy is set in the shell from where you launched Jupyter."
159
+ ) from None
160
+ except requests.exceptions.RequestException as error:
161
+ raise Exception(f"File downloading failed with error: {error}") from None
162
+
163
+ # download the file if it does not exist, or if it exists with an incorrect file size
164
+ filesize = int(response.headers.get("Content-length", 0))
165
+ if not filename.exists() or (os.stat(filename).st_size != filesize):
166
+ with tqdm_notebook(
167
+ total=filesize,
168
+ unit="B",
169
+ unit_scale=True,
170
+ unit_divisor=1024,
171
+ desc=str(filename),
172
+ disable=not show_progress,
173
+ ) as progress_bar:
174
+ with open(filename, "wb") as file_object:
175
+ for chunk in response.iter_content(chunk_size):
176
+ file_object.write(chunk)
177
+ progress_bar.update(len(chunk))
178
+ progress_bar.refresh()
179
+ else:
180
+ if not silent:
181
+ print(f"'{filename}' already exists.")
182
+
183
+ response.close()
184
+
185
+ return filename.resolve()
186
+
187
+
188
+ def download_ir_model(model_xml_url: str, destination_folder: PathLike = None) -> PathLike:
189
+ """
190
+ Download IR model from `model_xml_url`. Downloads model xml and bin file; the weights file is
191
+ assumed to exist at the same location and name as model_xml_url with a ".bin" extension.
192
+
193
+ :param model_xml_url: URL to model xml file to download
194
+ :param destination_folder: Directory where downloaded model xml and bin are saved. If None, model
195
+ files are saved to the current directory
196
+ :return: path to downloaded xml model file
197
+ """
198
+ model_bin_url = model_xml_url[:-4] + ".bin"
199
+ model_xml_path = download_file(model_xml_url, directory=destination_folder, show_progress=False)
200
+ download_file(model_bin_url, directory=destination_folder)
201
+ return model_xml_path
202
+
203
+
204
+ # ## Images
205
+
206
+ # ### Convert Pixel Data
207
+ #
208
+ # Normalize image pixel values between 0 and 1, and convert images to RGB and BGR.
209
+
210
+ # In[ ]:
211
+
212
+
213
+ def normalize_minmax(data):
214
+ """
215
+ Normalizes the values in `data` between 0 and 1
216
+ """
217
+ if data.max() == data.min():
218
+ raise ValueError("Normalization is not possible because all elements of" f"`data` have the same value: {data.max()}.")
219
+ return (data - data.min()) / (data.max() - data.min())
220
+
221
+
222
+ def to_rgb(image_data: np.ndarray) -> np.ndarray:
223
+ """
224
+ Convert image_data from BGR to RGB
225
+ """
226
+ import cv2
227
+
228
+ return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
229
+
230
+
231
+ def to_bgr(image_data: np.ndarray) -> np.ndarray:
232
+ """
233
+ Convert image_data from RGB to BGR
234
+ """
235
+ import cv2
236
+
237
+ return cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR)
238
+
239
+
240
+ # ## Videos
241
+
242
+ # ### Video Player
243
+ #
244
+ # Custom video player to fulfill FPS requirements. You can set target FPS and output size, flip the video horizontally or skip first N frames.
245
+
246
+ # In[ ]:
247
+
248
+
249
+ class VideoPlayer:
250
+ """
251
+ Custom video player to fulfill FPS requirements. You can set target FPS and output size,
252
+ flip the video horizontally or skip first N frames.
253
+
254
+ :param source: Video source. It could be either camera device or video file.
255
+ :param size: Output frame size.
256
+ :param flip: Flip source horizontally.
257
+ :param fps: Target FPS.
258
+ :param skip_first_frames: Skip first N frames.
259
+ """
260
+
261
+ def __init__(self, source, size=None, flip=False, fps=None, skip_first_frames=0, width=1280, height=720):
262
+ import cv2
263
+
264
+ self.cv2 = cv2 # This is done to access the package in class methods
265
+ self.__cap = cv2.VideoCapture(source)
266
+ # try HD by default to get better video quality
267
+ self.__cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
268
+ self.__cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
269
+
270
+ if not self.__cap.isOpened():
271
+ raise RuntimeError(f"Cannot open {'camera' if isinstance(source, int) else ''} {source}")
272
+ # skip first N frames
273
+ self.__cap.set(cv2.CAP_PROP_POS_FRAMES, skip_first_frames)
274
+ # fps of input file
275
+ self.__input_fps = self.__cap.get(cv2.CAP_PROP_FPS)
276
+ if self.__input_fps <= 0:
277
+ self.__input_fps = 60
278
+ # target fps given by user
279
+ self.__output_fps = fps if fps is not None else self.__input_fps
280
+ self.__flip = flip
281
+ self.__size = None
282
+ self.__interpolation = None
283
+ if size is not None:
284
+ self.__size = size
285
+ # AREA better for shrinking, LINEAR better for enlarging
286
+ self.__interpolation = cv2.INTER_AREA if size[0] < self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH) else cv2.INTER_LINEAR
287
+ # first frame
288
+ _, self.__frame = self.__cap.read()
289
+ self.__lock = threading.Lock()
290
+ self.__thread = None
291
+ self.__stop = False
292
+
293
+ """
294
+ Start playing.
295
+ """
296
+
297
+ def start(self):
298
+ self.__stop = False
299
+ self.__thread = threading.Thread(target=self.__run, daemon=True)
300
+ self.__thread.start()
301
+
302
+ """
303
+ Stop playing and release resources.
304
+ """
305
+
306
+ def stop(self):
307
+ self.__stop = True
308
+ if self.__thread is not None:
309
+ self.__thread.join()
310
+ self.__cap.release()
311
+
312
+ def __run(self):
313
+ prev_time = 0
314
+ while not self.__stop:
315
+ t1 = time.time()
316
+ ret, frame = self.__cap.read()
317
+ if not ret:
318
+ break
319
+
320
+ # fulfill target fps
321
+ if 1 / self.__output_fps < time.time() - prev_time:
322
+ prev_time = time.time()
323
+ # replace by current frame
324
+ with self.__lock:
325
+ self.__frame = frame
326
+
327
+ t2 = time.time()
328
+ # time to wait [s] to fulfill input fps
329
+ wait_time = 1 / self.__input_fps - (t2 - t1)
330
+ # wait until
331
+ time.sleep(max(0, wait_time))
332
+
333
+ self.__frame = None
334
+
335
+ """
336
+ Get current frame.
337
+ """
338
+
339
+ def next(self):
340
+ import cv2
341
+
342
+ with self.__lock:
343
+ if self.__frame is None:
344
+ return None
345
+ # need to copy the frame, because it can be cached and reused if fps is low
346
+ frame = self.__frame.copy()
347
+ if self.__size is not None:
348
+ frame = self.cv2.resize(frame, self.__size, interpolation=self.__interpolation)
349
+ if self.__flip:
350
+ frame = self.cv2.flip(frame, 1)
351
+ return frame
352
+
353
+
354
+ # ## Visualization
355
+
356
+ # ### Segmentation
357
+ #
358
+ # Define a SegmentationMap NamedTuple that keeps the labels and colormap for a segmentation project/dataset. Create CityScapesSegmentation and BinarySegmentation SegmentationMaps. Create a function to convert a segmentation map to an RGB image with a colormap, and to show the segmentation result as an overlay over the original image.
359
+
360
+ # In[ ]:
361
+
362
+
363
+ class Label(NamedTuple):
364
+ index: int
365
+ color: Tuple
366
+ name: Optional[str] = None
367
+
368
+
369
+ # In[ ]:
370
+
371
+
372
+ class SegmentationMap(NamedTuple):
373
+ labels: List
374
+
375
+ def get_colormap(self):
376
+ return np.array([label.color for label in self.labels])
377
+
378
+ def get_labels(self):
379
+ labelnames = [label.name for label in self.labels]
380
+ if any(labelnames):
381
+ return labelnames
382
+ else:
383
+ return None
384
+
385
+
386
+ # In[ ]:
387
+
388
+
389
+ cityscape_labels = [
390
+ Label(index=0, color=(128, 64, 128), name="road"),
391
+ Label(index=1, color=(244, 35, 232), name="sidewalk"),
392
+ Label(index=2, color=(70, 70, 70), name="building"),
393
+ Label(index=3, color=(102, 102, 156), name="wall"),
394
+ Label(index=4, color=(190, 153, 153), name="fence"),
395
+ Label(index=5, color=(153, 153, 153), name="pole"),
396
+ Label(index=6, color=(250, 170, 30), name="traffic light"),
397
+ Label(index=7, color=(220, 220, 0), name="traffic sign"),
398
+ Label(index=8, color=(107, 142, 35), name="vegetation"),
399
+ Label(index=9, color=(152, 251, 152), name="terrain"),
400
+ Label(index=10, color=(70, 130, 180), name="sky"),
401
+ Label(index=11, color=(220, 20, 60), name="person"),
402
+ Label(index=12, color=(255, 0, 0), name="rider"),
403
+ Label(index=13, color=(0, 0, 142), name="car"),
404
+ Label(index=14, color=(0, 0, 70), name="truck"),
405
+ Label(index=15, color=(0, 60, 100), name="bus"),
406
+ Label(index=16, color=(0, 80, 100), name="train"),
407
+ Label(index=17, color=(0, 0, 230), name="motorcycle"),
408
+ Label(index=18, color=(119, 11, 32), name="bicycle"),
409
+ Label(index=19, color=(255, 255, 255), name="background"),
410
+ ]
411
+
412
+ CityScapesSegmentation = SegmentationMap(cityscape_labels)
413
+
414
+ binary_labels = [
415
+ Label(index=0, color=(255, 255, 255), name="background"),
416
+ Label(index=1, color=(0, 0, 0), name="foreground"),
417
+ ]
418
+
419
+ BinarySegmentation = SegmentationMap(binary_labels)
420
+
421
+
422
+ # In[ ]:
423
+
424
+
425
+ def segmentation_map_to_image(result: np.ndarray, colormap: np.ndarray, remove_holes: bool = False) -> np.ndarray:
426
+ """
427
+ Convert network result of floating point numbers to an RGB image with
428
+ integer values from 0-255 by applying a colormap.
429
+
430
+ :param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
431
+ :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
432
+ :param remove_holes: If True, remove holes in the segmentation result.
433
+ :return: An RGB image where each pixel is an int8 value according to colormap.
434
+ """
435
+ import cv2
436
+
437
+ if len(result.shape) != 2 and result.shape[0] != 1:
438
+ raise ValueError(f"Expected result with shape (H,W) or (1,H,W), got result with shape {result.shape}")
439
+
440
+ if len(np.unique(result)) > colormap.shape[0]:
441
+ raise ValueError(
442
+ f"Expected max {colormap[0]} classes in result, got {len(np.unique(result))} "
443
+ "different output values. Please make sure to convert the network output to "
444
+ "pixel values before calling this function."
445
+ )
446
+ elif result.shape[0] == 1:
447
+ result = result.squeeze(0)
448
+
449
+ result = result.astype(np.uint8)
450
+
451
+ contour_mode = cv2.RETR_EXTERNAL if remove_holes else cv2.RETR_TREE
452
+ mask = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8)
453
+ for label_index, color in enumerate(colormap):
454
+ label_index_map = result == label_index
455
+ label_index_map = label_index_map.astype(np.uint8) * 255
456
+ contours, hierarchies = cv2.findContours(label_index_map, contour_mode, cv2.CHAIN_APPROX_SIMPLE)
457
+ cv2.drawContours(
458
+ mask,
459
+ contours,
460
+ contourIdx=-1,
461
+ color=color.tolist(),
462
+ thickness=cv2.FILLED,
463
+ )
464
+
465
+ return mask
466
+
467
+
468
+ def segmentation_map_to_overlay(image, result, alpha, colormap, remove_holes=False) -> np.ndarray:
469
+ """
470
+ Returns a new image where a segmentation mask (created with colormap) is overlaid on
471
+ the source image.
472
+
473
+ :param image: Source image.
474
+ :param result: A single network result after converting to pixel values in H,W or 1,H,W shape.
475
+ :param alpha: Alpha transparency value for the overlay image.
476
+ :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class.
477
+ :param remove_holes: If True, remove holes in the segmentation result.
478
+ :return: An RGB image with the segmentation mask overlaid on the source image.
479
+ """
480
+ import cv2
481
+
482
+ if len(image.shape) == 2:
483
+ image = np.repeat(np.expand_dims(image, -1), 3, 2)
484
+ mask = segmentation_map_to_image(result, colormap, remove_holes)
485
+ image_height, image_width = image.shape[:2]
486
+ mask = cv2.resize(src=mask, dsize=(image_width, image_height))
487
+ return cv2.addWeighted(mask, alpha, image, 1 - alpha, 0)
488
+
489
+
490
+ # ### Network Results
491
+ #
492
+ # Show network result image, optionally together with the source image and a legend with labels.
493
+
494
+ # In[ ]:
495
+
496
+
497
+ def viz_result_image(
498
+ result_image: np.ndarray,
499
+ source_image: np.ndarray = None,
500
+ source_title: str = None,
501
+ result_title: str = None,
502
+ labels: List[Label] = None,
503
+ resize: bool = False,
504
+ bgr_to_rgb: bool = False,
505
+ hide_axes: bool = False,
506
+ ):
507
+ """
508
+ Show result image, optionally together with source images, and a legend with labels.
509
+
510
+ :param result_image: Numpy array of RGB result image.
511
+ :param source_image: Numpy array of source image. If provided this image will be shown
512
+ next to the result image. source_image is expected to be in RGB format.
513
+ Set bgr_to_rgb to True if source_image is in BGR format.
514
+ :param source_title: Title to display for the source image.
515
+ :param result_title: Title to display for the result image.
516
+ :param labels: List of labels. If provided, a legend will be shown with the given labels.
517
+ :param resize: If true, resize the result image to the same shape as the source image.
518
+ :param bgr_to_rgb: If true, convert the source image from BGR to RGB. Use this option if
519
+ source_image is a BGR image.
520
+ :param hide_axes: If true, do not show matplotlib axes.
521
+ :return: Matplotlib figure with result image
522
+ """
523
+ import cv2
524
+ import matplotlib.pyplot as plt
525
+ from matplotlib.lines import Line2D
526
+
527
+ if bgr_to_rgb:
528
+ source_image = to_rgb(source_image)
529
+ if resize:
530
+ result_image = cv2.resize(result_image, (source_image.shape[1], source_image.shape[0]))
531
+
532
+ num_images = 1 if source_image is None else 2
533
+
534
+ fig, ax = plt.subplots(1, num_images, figsize=(16, 8), squeeze=False)
535
+ if source_image is not None:
536
+ ax[0, 0].imshow(source_image)
537
+ ax[0, 0].set_title(source_title)
538
+
539
+ ax[0, num_images - 1].imshow(result_image)
540
+ ax[0, num_images - 1].set_title(result_title)
541
+
542
+ if hide_axes:
543
+ for a in ax.ravel():
544
+ a.axis("off")
545
+ if labels:
546
+ colors = labels.get_colormap()
547
+ lines = [
548
+ Line2D(
549
+ [0],
550
+ [0],
551
+ color=[item / 255 for item in c.tolist()],
552
+ linewidth=3,
553
+ linestyle="-",
554
+ )
555
+ for c in colors
556
+ ]
557
+ plt.legend(
558
+ lines,
559
+ labels.get_labels(),
560
+ bbox_to_anchor=(1, 1),
561
+ loc="upper left",
562
+ prop={"size": 12},
563
+ )
564
+ plt.close(fig)
565
+ return fig
566
+
567
+
568
+ # ### Live Inference
569
+
570
+ # In[ ]:
571
+
572
+
573
+ def show_array(frame: np.ndarray, display_handle=None):
574
+ """
575
+ Display array `frame`. Replace information at `display_handle` with `frame`
576
+ encoded as jpeg image. `frame` is expected to have data in BGR order.
577
+
578
+ Create a display_handle with: `display_handle = display(display_id=True)`
579
+ """
580
+ import cv2
581
+
582
+ _, frame = cv2.imencode(ext=".jpeg", img=frame)
583
+ if display_handle is None:
584
+ display_handle = display(Image(data=frame.tobytes()), display_id=True)
585
+ else:
586
+ display_handle.update(Image(data=frame.tobytes()))
587
+ return display_handle
588
+
589
+
590
+ # ## Checks and Alerts
591
+ #
592
+ # Create an alert class to show stylized info/error/warning messages and a `check_device` function that checks whether a given device is available.
593
+
594
+ # In[ ]:
595
+
596
+
597
+ class NotebookAlert(Exception):
598
+ def __init__(self, message: str, alert_class: str):
599
+ """
600
+ Show an alert box with the given message.
601
+
602
+ :param message: The message to display.
603
+ :param alert_class: The class for styling the message. Options: info, warning, success, danger.
604
+ """
605
+ self.message = message
606
+ self.alert_class = alert_class
607
+ self.show_message()
608
+
609
+ def show_message(self):
610
+ display(HTML(f"""<div class="alert alert-{self.alert_class}">{self.message}"""))
611
+
612
+
613
+ class DeviceNotFoundAlert(NotebookAlert):
614
+ def __init__(self, device: str):
615
+ """
616
+ Show a warning message about an unavailable device. This class does not check whether or
617
+ not the device is available, use the `check_device` function to check this. `check_device`
618
+ also shows the warning if the device is not found.
619
+
620
+ :param device: The unavailable device.
621
+ :return: A formatted alert box with the message that `device` is not available, and a list
622
+ of devices that are available.
623
+ """
624
+ ie = Core()
625
+ supported_devices = ie.available_devices
626
+ self.message = f"Running this cell requires a {device} device, " "which is not available on this system. "
627
+ self.alert_class = "warning"
628
+ if len(supported_devices) == 1:
629
+ self.message += f"The following device is available: {ie.available_devices[0]}"
630
+ else:
631
+ self.message += "The following devices are available: " f"{', '.join(ie.available_devices)}"
632
+ super().__init__(self.message, self.alert_class)
633
+
634
+
635
+ def check_device(device: str) -> bool:
636
+ """
637
+ Check if the specified device is available on the system.
638
+
639
+ :param device: Device to check. e.g. CPU, GPU
640
+ :return: True if the device is available, False if not. If the device is not available,
641
+ a DeviceNotFoundAlert will be shown.
642
+ """
643
+ ie = Core()
644
+ if device not in ie.available_devices:
645
+ DeviceNotFoundAlert(device)
646
+ return False
647
+ else:
648
+ return True
649
+
650
+
651
+ def check_openvino_version(version: str) -> bool:
652
+ """
653
+ Check if the specified OpenVINO version is installed.
654
+
655
+ :param version: the OpenVINO version to check. Example: 2021.4
656
+ :return: True if the version is installed, False if not. If the version is not installed,
657
+ an alert message will be shown.
658
+ """
659
+ installed_version = get_version()
660
+ if version not in installed_version:
661
+ NotebookAlert(
662
+ f"This notebook requires OpenVINO {version}. "
663
+ f"The version on your system is: <i>{installed_version}</i>.<br>"
664
+ "Please run <span style='font-family:monospace'>pip install --upgrade -r requirements.txt</span> "
665
+ "in the openvino_env environment to install this version. "
666
+ "See the <a href='https://github.com/openvinotoolkit/openvino_notebooks'>"
667
+ "OpenVINO Notebooks README</a> for detailed instructions",
668
+ alert_class="danger",
669
+ )
670
+ return False
671
+ else:
672
+ return True
673
+
674
+
675
+ packed_layername_tensor_dict_list = [{"name": "aten::mul/Multiply"}]
676
+
677
+
678
+ class ReplaceTensor(MatcherPass):
679
+ def __init__(self, packed_layername_tensor_dict_list):
680
+ MatcherPass.__init__(self)
681
+ self.model_changed = False
682
+
683
+ param = WrapType("opset10.Multiply")
684
+
685
+ def callback(matcher: Matcher) -> bool:
686
+ root = matcher.get_match_root()
687
+ if root is None:
688
+ return False
689
+ for y in packed_layername_tensor_dict_list:
690
+ root_name = root.get_friendly_name()
691
+ if root_name.find(y["name"]) != -1:
692
+ max_fp16 = np.array([[[[-np.finfo(np.float16).max]]]]).astype(np.float32)
693
+ new_tenser = ops.constant(max_fp16, Type.f32, name="Constant_4431")
694
+ root.set_arguments([root.input_value(0).node, new_tenser])
695
+ packed_layername_tensor_dict_list.remove(y)
696
+
697
+ return True
698
+
699
+ self.register_matcher(Matcher(param, "ReplaceTensor"), callback)
700
+
701
+
702
+ def optimize_bge_embedding(model_path, output_model_path):
703
+ """
704
+ optimize_bge_embedding is used to optimize a BGE IR model for the NPU device
705
+
706
+ Arguments:
707
+ model_path {str} -- original BGE IR model path
708
+ output_model_path {str} -- Converted BGE IR model path
709
+ """
710
+ core = Core()
711
+ ov_model = core.read_model(model_path)
712
+ manager = Manager()
713
+ manager.register_pass(ReplaceTensor(packed_layername_tensor_dict_list))
714
+ manager.run_passes(ov_model)
715
+ ov.save_model(ov_model, output_model_path, compress_to_fp16=False)
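A short usage sketch for the helpers above (the URL and directory are placeholders, not resources referenced by this repository):

from notebook_utils import device_widget, download_file

device = device_widget(default="AUTO", exclude=["NPU"])  # ipywidgets dropdown; read device.value when needed
image_path = download_file(
    "https://example.com/sample.jpg",  # placeholder URL
    directory="data",                  # created if it does not exist
    show_progress=True,
)
print(image_path)  # resolved local path of the downloaded file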
ov_qwen2_vl.py ADDED
@@ -0,0 +1,792 @@
1
+ from pathlib import Path
2
+ import types
3
+ from typing import Optional, Tuple, Union, List, Dict, Any
4
+ import gc
5
+ import openvino as ov
6
+ from openvino.runtime import opset13
7
+ import nncf
8
+ import numpy as np
9
+ import torch
10
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoConfig
11
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast, VisionRotaryEmbedding
12
+ from transformers.cache_utils import DynamicCache
13
+ from transformers.modeling_outputs import ModelOutput
14
+ from transformers.generation import GenerationConfig, GenerationMixin
15
+ from transformers.modeling_outputs import CausalLMOutputWithPast
16
+
17
+ model_ids = ["Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-VL-7B-Instruct"]
18
+
19
+
20
+ def model_selector(default=model_ids[0]):
21
+ import ipywidgets as widgets
22
+
23
+ model_checkpoint = widgets.Dropdown(
24
+ options=model_ids,
25
+ value=default,
26
+ description="Model:",
27
+ )
28
+ return model_checkpoint
29
+
30
+
31
+ def model_has_state(ov_model: ov.Model):
32
+ return len(ov_model.get_sinks()) > 0
33
+
34
+
35
+ def model_has_input_output_name(ov_model: ov.Model, name: str):
36
+ """
37
+ Helper function for checking whether the model has the specified input or output name
38
+
39
+ Parameters:
40
+ ov_model (ov.Model):
41
+ name (str):
42
+ name of input or output
43
+
44
+ Returns:
45
+ True if input or output with requested name exists else False
46
+ """
47
+ return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], [])
48
+
49
+
50
+ def fuse_cache_reorder(
51
+ ov_model: ov.Model,
52
+ not_kv_inputs: List[str],
53
+ key_value_input_names: List[str],
54
+ gather_dim: int,
55
+ ):
56
+ """
57
+ Fuses reorder_cache during the generate cycle into ov.Model. Used with stateful models, because we cannot modify the model state directly.
58
+
59
+ Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model.
60
+ Should be run before make_stateful. Implements optimum's _reorder_cache
61
+ inside the model in the beginning of each iteration.
62
+ Gather works along given gather_dim dimension that may vary from model to model.
63
+ KV-cache inputs are identified based on names in key_value_input_names.
64
+ Append the new beam_idx parameter to not_kv_inputs.
65
+
66
+ Parameters:
67
+ ov_model (`ov.Model`):
68
+ openvino model for processing
69
+ not_kv_inputs (`List[str]`):
70
+ list of input nodes in the model that are not related to past key values
71
+ key_value_input_names (`List[str]`):
72
+ list of names for key value input layers
73
+ gather_dim (int):
74
+ dimension for gathering cache during reorder pass
75
+ """
76
+
77
+ if model_has_input_output_name(ov_model, "beam_idx"):
78
+ raise ValueError("Model already has fused cache")
79
+ input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0]
80
+ beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
81
+ beam_idx.output(0).get_tensor().add_names({"beam_idx"}) # why list is not accepted?
82
+ ov_model.add_parameters([beam_idx])
83
+ not_kv_inputs.append(ov_model.inputs[-1])
84
+ # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
85
+ for input_name in key_value_input_names:
86
+ parameter_output_port = ov_model.input(input_name)
87
+ consumers = parameter_output_port.get_target_inputs()
88
+ gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim))
89
+ for consumer in consumers:
90
+ consumer.replace_source_output(gather.output(0))
91
+ ov_model.validate_nodes_and_infer_types()
92
+
93
+
94
+ def build_state_initializer(ov_model: ov.Model, batch_dim: int):
95
+ """
96
+ Build initialization ShapeOf Expression for all ReadValue ops
97
+
98
+ Parameters:
99
+ ov_model (ov.Model):
100
+ openvino model
101
+ batch_dim (int):
102
+ index of dimension corresponding to batch size
103
+ """
104
+ input_ids = ov_model.input("inputs_embeds")
105
+ batch = opset13.gather(
106
+ opset13.shape_of(input_ids, output_type="i64"),
107
+ opset13.constant([0]),
108
+ opset13.constant(0),
109
+ )
110
+ for op in ov_model.get_ops():
111
+ if op.get_type_name() == "ReadValue":
112
+ dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
113
+ dims[batch_dim] = batch
114
+ dims = [(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims]
115
+ shape = opset13.concat(dims, axis=0)
116
+ broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape)
117
+ op.set_arguments([broadcast])
118
+ ov_model.validate_nodes_and_infer_types()
119
+
120
+
121
+ def make_stateful(
122
+ ov_model: ov.Model,
123
+ not_kv_inputs: List[str],
124
+ key_value_input_names: List[str],
125
+ key_value_output_names: List[str],
126
+ batch_dim: int,
127
+ num_attention_heads: int,
128
+ num_beams_and_batch: int = None,
129
+ ):
130
+ """
131
+ Hides kv-cache inputs and outputs inside the model as variables.
132
+
133
+ Parameters:
134
+ ov_model (ov.Model):
135
+ openvino model
136
+ not_kv_inputs (`List[str]`):
137
+ list of input nodes in the model that are not related to past key values
138
+ key_value_input_names (`List[str]`):
139
+ list of names for key value input layers
140
+ key_value_output_names (`List[str]`):
141
+ list of names for key value output layers
142
+ batch_dim (int):
143
+ index of batch dimension in key value layers
144
+ num_attention_heads (int):
145
+ number of attention heads for batch dimension initialization
146
+ num_beams_and_batch (int):
147
+ precalculated number of beams and batch for shapes initialization
148
+ """
149
+ from openvino._offline_transformations import apply_make_stateful_transformation
150
+
151
+ input_output_map = {}
152
+
153
+ if num_beams_and_batch is not None:
154
+ # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue
155
+ for input in not_kv_inputs:
156
+ shape = input.get_partial_shape()
157
+ if shape.rank.get_length() <= 2: # == 1 for beam_index
158
+ shape[0] = num_beams_and_batch
159
+ input.get_node().set_partial_shape(shape)
160
+ for kv_name_pair in zip(key_value_input_names, key_value_output_names):
161
+ input_output_map[kv_name_pair[0]] = kv_name_pair[1]
162
+ if num_beams_and_batch is not None:
163
+ input = ov_model.input(kv_name_pair[0])
164
+ shape = input.get_partial_shape()
165
+ shape[batch_dim] = num_beams_and_batch * num_attention_heads
166
+ input.get_node().set_partial_shape(shape)
167
+
168
+ if num_beams_and_batch is not None:
169
+ # Re-validate the model if shapes were altered above
170
+ ov_model.validate_nodes_and_infer_types()
171
+
172
+ apply_make_stateful_transformation(ov_model, input_output_map)
173
+ if num_beams_and_batch is None:
174
+ build_state_initializer(ov_model, batch_dim)
175
+
176
+
177
+ def patch_stateful(ov_model):
178
+ key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]]
179
+ key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]]
180
+ not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())]
181
+ if not key_value_input_names or not key_value_output_names:
182
+ return
183
+ batch_dim = 0
184
+ num_attention_heads = 1
185
+
186
+ fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
187
+ make_stateful(
188
+ ov_model,
189
+ not_kv_inputs,
190
+ key_value_input_names,
191
+ key_value_output_names,
192
+ batch_dim,
193
+ num_attention_heads,
194
+ None,
195
+ )
196
+
197
+
198
+ core = ov.Core()
199
+
200
+
201
+ def cleanup_torchscript_cache():
202
+ """
203
+ Helper for removing cached model representation
204
+ """
205
+ torch._C._jit_clear_class_registry()
206
+ torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
207
+ torch.jit._state._clear_class_state()
208
+
209
+
210
+ LANGUAGE_MODEL_NAME = "openvino_language_model.xml"
211
+ IMAGE_EMBEDDING_NAME = "openvino_vision_embeddings_model.xml"
212
+ IMAGE_EMBEDDING_MERGER_NAME = "openvino_vision_embeddings_merger_model.xml"
213
+ TEXT_EMBEDDING_NAME = "openvino_text_embeddings_model.xml"
214
+
215
+
216
+ def convert_qwen2vl_model(model_id, output_dir, quantization_config):
217
+ output_dir = Path(output_dir)
218
+
219
+ lang_model_path = output_dir / LANGUAGE_MODEL_NAME
220
+ image_embed_path = output_dir / IMAGE_EMBEDDING_NAME
221
+ embed_token_path = output_dir / TEXT_EMBEDDING_NAME
222
+ image_embed_merger_path = output_dir / IMAGE_EMBEDDING_MERGER_NAME
223
+
224
+ if all(
225
+ [
226
+ lang_model_path.exists(),
227
+ image_embed_path.exists(),
228
+ image_embed_merger_path.exists(),
229
+ embed_token_path.exists(),
230
+ ]
231
+ ):
232
+ print(f"✅ {model_id} model already converted. You can find results in {output_dir}")
233
+ return
234
+ print(f"⌛ {model_id} conversion started. Be patient, it may takes some time.")
235
+ print("⌛ Load Original model")
236
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model_id)
237
+ processor = AutoProcessor.from_pretrained(model_id)
238
+ model.config.save_pretrained(output_dir)
239
+ processor.save_pretrained(output_dir)
240
+ print("✅ Original model successfully loaded")
241
+
242
+ if not embed_token_path.exists():
243
+ print("⌛ Convert Input embedding model")
244
+ ov_model = ov.convert_model(
245
+ model.model.embed_tokens,
246
+ example_input=torch.ones([2, 2], dtype=torch.int64),
247
+ )
248
+ ov.save_model(ov_model, embed_token_path)
249
+ del ov_model
250
+ cleanup_torchscript_cache()
251
+ gc.collect()
252
+ print("✅ Input embedding model successfully converted")
253
+
254
+ if not image_embed_path.exists() or not image_embed_merger_path.exists():
255
+ print("⌛ Convert Image embedding model")
256
+
257
+ vision_embed_tokens = model.visual
258
+ if not image_embed_path.exists():
259
+ ov_model = ov.convert_model(vision_embed_tokens.patch_embed, example_input={"hidden_states": torch.randn([4988, 1176])})
260
+ ov.save_model(ov_model, image_embed_path)
261
+ del ov_model
262
+ cleanup_torchscript_cache()
263
+
264
+ def image_embed_forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor) -> torch.Tensor:
265
+ for blk in self.blocks:
266
+ hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
267
+
268
+ return self.merger(hidden_states)
269
+
270
+ def sdpa_attn_forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor = None) -> torch.Tensor:
271
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_rotary_pos_emb_vision
272
+
273
+ seq_length = hidden_states.shape[0]
274
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
275
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
276
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
277
+
278
+ q = q.transpose(0, 1)
279
+ k = k.transpose(0, 1)
280
+ v = v.transpose(0, 1)
281
+ attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
282
+ attn_output = attn_output.transpose(0, 1)
283
+ attn_output = attn_output.reshape(seq_length, -1)
284
+ attn_output = self.proj(attn_output)
285
+ return attn_output
286
+
287
+ def block_forward(self, hidden_states, attention_mask, rotary_pos_emb) -> torch.Tensor:
288
+ hidden_states = hidden_states + self.attn(self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
289
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
290
+ return hidden_states
291
+
292
+ if not image_embed_merger_path.exists():
293
+ vision_embed_tokens.forward = types.MethodType(image_embed_forward, vision_embed_tokens)
294
+ for block in vision_embed_tokens.blocks:
295
+ block.forward = types.MethodType(block_forward, block)
296
+ block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn)
297
+
298
+ ov_model = ov.convert_model(
299
+ vision_embed_tokens,
300
+ example_input={
301
+ "hidden_states": torch.randn([4988, 1280]),
302
+ "attention_mask": torch.ones([1, 4988, 4988]),
303
+ "rotary_pos_emb": torch.randn([4988, 40]),
304
+ },
305
+ )
306
+ if quantization_config is not None:
307
+ print(f"⌛ Weights compression with {quantization_config['mode']} mode started")
308
+ ov_model = nncf.compress_weights(ov_model, **quantization_config)
309
+ print("✅ Weights compression finished")
310
+
311
+ ov.save_model(ov_model, image_embed_merger_path)
312
+ del ov_model
313
+ cleanup_torchscript_cache()
314
+ del vision_embed_tokens
315
+ gc.collect()
316
+ print("✅ Image embedding model successfully converted")
317
+
318
+ if not lang_model_path.exists():
319
+ print("⌛ Convert Language model")
320
+
321
+ def forward_wrap(
322
+ self,
323
+ attention_mask,
324
+ position_ids=None,
325
+ past_key_values=None,
326
+ inputs_embeds=None,
327
+ ):
328
+ new_past_key_values = DynamicCache.from_legacy_cache(past_key_values)
329
+ result = self._orig_forward(
330
+ input_ids=None,
331
+ attention_mask=attention_mask,
332
+ position_ids=position_ids,
333
+ past_key_values=new_past_key_values,
334
+ inputs_embeds=inputs_embeds,
335
+ )
336
+ if past_key_values is not None:
337
+ result["past_key_values"] = result["past_key_values"].to_legacy_cache()
338
+ return tuple(result.values())
339
+
340
+ model._orig_forward = model.forward
341
+ model.forward = types.MethodType(forward_wrap, model)
342
+ hidden_size = model.config.hidden_size
343
+ num_pkv = model.config.num_hidden_layers
344
+ pkv_shape = (2, model.config.num_key_value_heads, 2, hidden_size // model.config.num_attention_heads)
345
+ cache_position = torch.arange(2, 4)
346
+ position_ids = cache_position.view(1, 1, -1).expand(3, 2, -1)
347
+
348
+ input_embeds = torch.randn((2, 2, hidden_size))
349
+ attention_mask = torch.ones([2, 4], dtype=torch.long)
350
+ input_names = ["attention_mask", "position_ids"]
351
+ output_names = ["logits"]
352
+
353
+ past_key_values = []
354
+ for i in range(num_pkv):
355
+ kv = [torch.randn(pkv_shape) for _ in range(2)]
356
+ past_key_values.append(kv)
357
+ input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"])
358
+ output_names.extend([f"present.{i}.key", f"present.{i}.value"])
359
+ input_names.append("inputs_embeds")
360
+
361
+ example_input = {"inputs_embeds": input_embeds, "attention_mask": attention_mask, "position_ids": position_ids, "past_key_values": past_key_values}
362
+
363
+ ov_model = ov.convert_model(
364
+ model,
365
+ example_input=example_input,
366
+ )
367
+
368
+ for input, input_name in zip(ov_model.inputs, input_names):
369
+ input.get_tensor().set_names({input_name})
370
+
371
+ for output, output_name in zip(ov_model.outputs, output_names):
372
+ output.get_tensor().set_names({output_name})
373
+ patch_stateful(ov_model)
374
+ print("✅ Language model successfully converted")
375
+
376
+ if quantization_config is not None:
377
+ print(f"⌛ Weights compression with {quantization_config['mode']} mode started")
378
+ ov_model = nncf.compress_weights(ov_model, **quantization_config)
379
+ print("✅ Weights compression finished")
380
+
381
+ ov.save_model(ov_model, lang_model_path, False)
382
+ del ov_model
383
+ cleanup_torchscript_cache()
384
+ del model
385
+ gc.collect()
386
+ print(f"✅ {model_id} model conversion finished. You can find results in {output_dir}")
387
+
388
+
389
+ class OVQwen2VLModel(GenerationMixin):
390
+ def __init__(self, model_dir, device, ov_config=None):
391
+ model_dir = Path(model_dir)
392
+ self.model = core.read_model(model_dir / LANGUAGE_MODEL_NAME)
393
+ self.image_embed = core.compile_model(model_dir / IMAGE_EMBEDDING_NAME, device, ov_config)
394
+ self.image_embed_merger = core.compile_model(model_dir / IMAGE_EMBEDDING_MERGER_NAME, device, ov_config)
395
+ self.embed_tokens = core.compile_model(model_dir / TEXT_EMBEDDING_NAME, device)
396
+ self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
397
+ self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
398
+ compiled_model = core.compile_model(self.model, device, ov_config)
399
+ self.request = compiled_model.create_infer_request()
400
+ self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
401
+ self.generation_config = GenerationConfig.from_model_config(self.config)
402
+ self.main_input_name = "input_ids"
403
+ self.device = torch.device("cpu")
404
+ self.num_pkv = 2
405
+ self._supports_cache_class = False
406
+ self.next_beam_idx = None
407
+ self._past_length = None
408
+ self._rotary_pos_emb = VisionRotaryEmbedding(self.config.vision_config.embed_dim // self.config.vision_config.num_heads // 2)
409
+
410
+ def can_generate(self):
411
+ """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
412
+ return True
413
+
414
+ def __call__(self, *args, **kwargs) -> CausalLMOutputWithPast:
415
+ return self.forward(
416
+ *args,
417
+ **kwargs,
418
+ )
419
+
420
+ def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
421
+ """
422
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
423
+ [`~PreTrainedModel.beam_sample`] is called.
424
+ This is required to match `past_key_values` with the correct beam_idx at every generation step.
425
+ """
426
+ self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration
427
+ return past_key_values
428
+
429
+ def _get_past_length(self, past_key_values=None):
430
+ if past_key_values is None:
431
+ return 0
432
+ return self._past_length
433
+
434
+ def get_rope_index(
435
+ self,
436
+ input_ids: torch.LongTensor,
437
+ image_grid_thw: Optional[torch.LongTensor] = None,
438
+ video_grid_thw: Optional[torch.LongTensor] = None,
439
+ attention_mask: Optional[torch.Tensor] = None,
440
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
441
+ """
442
+ Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
443
+
444
+ Explanation:
445
+ Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
446
+
447
+ For a pure text embedding sequence, the rotary position embedding is no different from modern LLMs.
448
+ Examples:
449
+ input_ids: [T T T T T], here T is for text.
450
+ temporal position_ids: [0, 1, 2, 3, 4]
451
+ height position_ids: [0, 1, 2, 3, 4]
452
+ width position_ids: [0, 1, 2, 3, 4]
453
+
454
+ For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
455
+ and 1D rotary position embedding for the text part.
456
+ Examples:
457
+ Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
458
+ input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
459
+ vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
460
+ vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
461
+ vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
462
+ text temporal position_ids: [3, 4, 5, 6, 7]
463
+ text height position_ids: [3, 4, 5, 6, 7]
464
+ text width position_ids: [3, 4, 5, 6, 7]
465
+ Here we calculate the text start position_ids as the max vision position_ids plus 1.
466
+
467
+ Args:
468
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
469
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
470
+ it.
471
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
472
+ The temporal, height and width of feature shape of each image in LLM.
473
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
474
+ The temporal, height and width of feature shape of each video in LLM.
475
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
476
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
477
+
478
+ - 1 for tokens that are **not masked**,
479
+ - 0 for tokens that are **masked**.
480
+
481
+ Returns:
482
+ position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
483
+ mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
484
+ """
485
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
486
+ image_token_id = self.config.image_token_id
487
+ video_token_id = self.config.video_token_id
488
+ vision_start_token_id = self.config.vision_start_token_id
489
+ mrope_position_deltas = []
490
+ if image_grid_thw is not None or video_grid_thw is not None:
491
+ total_input_ids = input_ids
492
+ position_ids = torch.ones(3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device)
493
+ image_index, video_index = 0, 0
494
+ for i, input_ids in enumerate(total_input_ids):
495
+ if attention_mask is not None:
496
+ input_ids = input_ids[attention_mask[i] == 1]
497
+ image_nums, video_nums = 0, 0
498
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
499
+ vision_tokens = input_ids[vision_start_indices + 1]
500
+ image_nums = (vision_tokens == image_token_id).sum()
501
+ video_nums = (vision_tokens == video_token_id).sum()
502
+ input_tokens = input_ids.tolist()
503
+ llm_pos_ids_list: list = []
504
+ st = 0
505
+ remain_images, remain_videos = image_nums, video_nums
506
+ for _ in range(image_nums + video_nums):
507
+ if image_token_id in input_tokens and remain_images > 0:
508
+ ed_image = input_tokens.index(image_token_id, st)
509
+ else:
510
+ ed_image = len(input_tokens) + 1
511
+ if video_token_id in input_tokens and remain_videos > 0:
512
+ ed_video = input_tokens.index(video_token_id, st)
513
+ else:
514
+ ed_video = len(input_tokens) + 1
515
+ if ed_image < ed_video:
516
+ t, h, w = (
517
+ image_grid_thw[image_index][0],
518
+ image_grid_thw[image_index][1],
519
+ image_grid_thw[image_index][2],
520
+ )
521
+ image_index += 1
522
+ remain_images -= 1
523
+ ed = ed_image
524
+ else:
525
+ t, h, w = (
526
+ video_grid_thw[video_index][0],
527
+ video_grid_thw[video_index][1],
528
+ video_grid_thw[video_index][2],
529
+ )
530
+ video_index += 1
531
+ remain_videos -= 1
532
+ ed = ed_video
533
+ llm_grid_t, llm_grid_h, llm_grid_w = (
534
+ t.item(),
535
+ h.item() // spatial_merge_size,
536
+ w.item() // spatial_merge_size,
537
+ )
538
+ text_len = ed - st
539
+
540
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
541
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
542
+
543
+ t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
544
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
545
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
546
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
547
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
548
+
549
+ if st < len(input_tokens):
550
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
551
+ text_len = len(input_tokens) - st
552
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
553
+
554
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
555
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
556
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
557
+ mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
558
+ return position_ids, mrope_position_deltas
559
+ else:
560
+ if attention_mask is not None:
561
+ position_ids = attention_mask.long().cumsum(-1) - 1
562
+ position_ids.masked_fill_(attention_mask == 0, 1)
563
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
564
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
565
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
566
+ else:
567
+ position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).view(1, 1, -1).expand(3, input_ids.shape[0], -1)
568
+ mrope_position_deltas = torch.zeros(
569
+ [input_ids.shape[0], 1],
570
+ device=input_ids.device,
571
+ dtype=input_ids.dtype,
572
+ )
573
+
574
+ return position_ids, mrope_position_deltas
575
+
576
+ def _update_model_kwargs_for_generation(
577
+ self,
578
+ outputs: ModelOutput,
579
+ model_kwargs: Dict[str, Any],
580
+ is_encoder_decoder: bool = False,
581
+ num_new_tokens: int = 1,
582
+ ) -> Dict[str, Any]:
583
+ model_kwargs = super()._update_model_kwargs_for_generation(
584
+ outputs=outputs,
585
+ model_kwargs=model_kwargs,
586
+ is_encoder_decoder=is_encoder_decoder,
587
+ num_new_tokens=num_new_tokens,
588
+ )
589
+
590
+ if getattr(outputs, "rope_deltas", None) is not None:
591
+ model_kwargs["rope_deltas"] = outputs.rope_deltas
592
+
593
+ return model_kwargs
594
+
595
+ def prepare_inputs_for_generation(
596
+ self,
597
+ input_ids,
598
+ past_key_values=None,
599
+ attention_mask=None,
600
+ inputs_embeds=None,
601
+ cache_position=None,
602
+ position_ids=None,
603
+ use_cache=True,
604
+ pixel_values=None,
605
+ pixel_values_videos=None,
606
+ image_grid_thw=None,
607
+ video_grid_thw=None,
608
+ **kwargs,
609
+ ):
610
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
611
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
612
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
613
+ if past_key_values is not None:
614
+ if inputs_embeds is not None: # Exception 1
615
+ input_ids = input_ids[:, -cache_position.shape[0] :]
616
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
617
+ input_ids = input_ids[:, cache_position]
618
+
619
+ rope_deltas = kwargs.get("rope_deltas", None)
620
+ if attention_mask is not None and position_ids is None:
621
+ if cache_position is None or (cache_position is not None and cache_position[0] == 0):
622
+ position_ids, rope_deltas = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)
623
+ else:
624
+ batch_size, seq_length = input_ids.shape
625
+ delta = cache_position[0] + rope_deltas if cache_position is not None and rope_deltas is not None else 0
626
+ position_ids = torch.arange(seq_length, device=input_ids.device)
627
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
628
+ position_ids = position_ids.add(delta)
629
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
630
+
631
+ if cache_position[0] != 0:
632
+ pixel_values = None
633
+ pixel_values_videos = None
634
+
635
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
636
+ if inputs_embeds is not None and cache_position[0] == 0:
637
+ model_inputs = {"inputs_embeds": inputs_embeds}
638
+ else:
639
+ model_inputs = {"input_ids": input_ids}
640
+
641
+ model_inputs.update(
642
+ {
643
+ "position_ids": position_ids,
644
+ "past_key_values": past_key_values,
645
+ "use_cache": use_cache,
646
+ "attention_mask": attention_mask,
647
+ "pixel_values": pixel_values,
648
+ "pixel_values_videos": pixel_values_videos,
649
+ "image_grid_thw": image_grid_thw,
650
+ "video_grid_thw": video_grid_thw,
651
+ "rope_deltas": rope_deltas,
652
+ }
653
+ )
654
+ return model_inputs
655
+
656
+ def forward(
657
+ self,
658
+ input_ids: torch.LongTensor = None,
659
+ attention_mask: Optional[torch.Tensor] = None,
660
+ position_ids: Optional[torch.LongTensor] = None,
661
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
662
+ inputs_embeds: Optional[torch.FloatTensor] = None,
663
+ use_cache: Optional[bool] = None,
664
+ output_attentions: Optional[bool] = None,
665
+ output_hidden_states: Optional[bool] = None,
666
+ return_dict: Optional[bool] = None,
667
+ pixel_values: Optional[torch.Tensor] = None,
668
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
669
+ image_grid_thw: Optional[torch.LongTensor] = None,
670
+ video_grid_thw: Optional[torch.LongTensor] = None,
671
+ rope_deltas: Optional[torch.LongTensor] = None,
672
+ ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
673
+ r"""
674
+ Args:
675
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
676
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
677
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
678
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
679
+
680
+ Returns:
681
+
682
+ Example:
683
+
684
+ ```python
685
+ >>> from PIL import Image
686
+ >>> import requests
687
+ >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
688
+
689
+ >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
690
+ >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
691
+
692
+ >>> messages = [
693
+ {
694
+ "role": "user",
695
+ "content": [
696
+ {"type": "image"},
697
+ {"type": "text", "text": "What is shown in this image?"},
698
+ ],
699
+ },
700
+ ]
701
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
702
+ >>> image = Image.open(requests.get(url, stream=True).raw)
703
+
704
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
705
+ >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
706
+
707
+ >>> # Generate
708
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
709
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
710
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
711
+ ```"""
712
+ if inputs_embeds is None:
713
+ inputs_embeds = self.embed_tokens(input_ids)[0]
714
+ if pixel_values is not None:
715
+ pixel_values = pixel_values
716
+ image_embeds = self.visual(pixel_values, image_grid_thw)
717
+ image_mask = input_ids == self.config.image_token_id
718
+ inputs_embeds[image_mask] = image_embeds
719
+ if pixel_values_videos is not None:
720
+ pixel_values_videos = pixel_values_videos
721
+ video_embeds = self.visual(pixel_values_videos, video_grid_thw)
722
+ video_mask = input_ids == self.config.video_token_id
723
+ inputs_embeds[video_mask] = video_embeds
724
+ if attention_mask is not None:
725
+ attention_mask = attention_mask
726
+ if past_key_values is None:
727
+ self.request.reset_state()
728
+ self.next_beam_idx = np.arange(inputs_embeds.shape[0], dtype=int)
729
+ self._past_length = 0
730
+ inputs = {}
731
+ inputs["inputs_embeds"] = inputs_embeds
732
+ inputs["attention_mask"] = attention_mask
733
+ inputs["position_ids"] = position_ids
734
+ if "beam_idx" in self.input_names:
735
+ inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(inputs_embeds.shape[0], dtype=int)
736
+ self.request.start_async(inputs, share_inputs=True)
737
+ self.request.wait()
738
+ logits = self.request.get_tensor("logits").data
739
+ logits = torch.from_numpy(logits).to(self.device)
740
+ past_key_values = ((),)
741
+ self._past_length += inputs["inputs_embeds"].shape[1]
742
+
743
+ return Qwen2VLCausalLMOutputWithPast(
744
+ loss=None,
745
+ logits=logits,
746
+ past_key_values=past_key_values,
747
+ rope_deltas=rope_deltas,
748
+ )
749
+
750
+ def rot_pos_emb(self, grid_thw):
751
+ pos_ids = []
752
+ for t, h, w in grid_thw:
753
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
754
+ hpos_ids = hpos_ids.reshape(
755
+ h // self.config.vision_config.spatial_merge_size,
756
+ self.config.vision_config.spatial_merge_size,
757
+ w // self.config.vision_config.spatial_merge_size,
758
+ self.config.vision_config.spatial_merge_size,
759
+ )
760
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
761
+ hpos_ids = hpos_ids.flatten()
762
+
763
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
764
+ wpos_ids = wpos_ids.reshape(
765
+ h // self.config.vision_config.spatial_merge_size,
766
+ self.config.vision_config.spatial_merge_size,
767
+ w // self.config.vision_config.spatial_merge_size,
768
+ self.config.vision_config.spatial_merge_size,
769
+ )
770
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
771
+ wpos_ids = wpos_ids.flatten()
772
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
773
+ pos_ids = torch.cat(pos_ids, dim=0)
774
+ max_grid_size = grid_thw[:, 1:].max()
775
+ rotary_pos_emb_full = self._rotary_pos_emb(max_grid_size)
776
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
777
+ return rotary_pos_emb
778
+
779
+ def visual(self, hidden_states, grid_thw):
780
+ hidden_states = self.image_embed(hidden_states)[0]
781
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
782
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(dim=0, dtype=torch.int32)
783
+ cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)
784
+ attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool)
785
+ causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32)
786
+ for i in range(1, len(cu_seqlens)):
787
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
788
+
789
+ causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf"))
790
+
791
+ res = self.image_embed_merger([hidden_states, causal_mask, rotary_pos_emb])[0]
792
+ return res
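Because the wrapper above implements can_generate, prepare_inputs_for_generation, and a forward that returns Qwen2VLCausalLMOutputWithPast, it can be driven by the standard GenerationMixin.generate() loop. The snippet below is a minimal inference sketch and is not part of the uploaded files: the OVQwen2VLModel class name, its constructor arguments, the "AUTO" device string, and the sample image path are assumptions that may need adjusting to match your copy of ov_qwen2_vl.py.

from pathlib import Path
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from ov_qwen2_vl import OVQwen2VLModel  # assumed export name

model_dir = Path("Qwen2-VL-7B-Instruct")   # assumed: directory produced by the conversion step
model = OVQwen2VLModel(model_dir, "AUTO")  # assumed constructor: (model_dir, openvino_device)
processor = AutoProcessor.from_pretrained(model_dir)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "demo.jpg"},  # hypothetical local image
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

# generate() exercises the prepare_inputs_for_generation/forward methods defined above
generated = model.generate(**inputs, max_new_tokens=100)
answer = processor.batch_decode(generated[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
print(answer)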
qwen2-build.py ADDED
@@ -0,0 +1,43 @@
1
+
2
+ from pathlib import Path
3
+ import requests
4
+ import os
5
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
6
+
7
+ if not Path("ov_qwen2_vl.py").exists():
8
+ r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/ov_qwen2_vl.py")
9
+ open("ov_qwen2_vl.py", "w", encoding="utf-8").write(r.text)
10
+
11
+ if not Path("notebook_utils.py").exists():
12
+ r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py")
13
+ open("notebook_utils.py", "w").write(r.text)
14
+
15
+
16
+ from ov_qwen2_vl import model_selector
17
+
18
+ model_id = model_selector()
19
+
20
+ print(model_id)
21
+
22
+
23
+ print(f"Selected {model_id.value}")
24
+ pt_model_id = model_id.value
25
+ model_dir = Path(pt_model_id.split("/")[-1])
26
+
27
+
28
+ from ov_qwen2_vl import convert_qwen2vl_model
29
+
30
+ # in an IPython/Jupyter session, uncomment the next line to inspect the model conversion code
31
+ # convert_qwen2vl_model??
32
+
33
+
34
+ import nncf
35
+
36
+ compression_configuration = {
37
+ "mode": nncf.CompressWeightsMode.INT4_ASYM,
38
+ "group_size": 32,
39
+ "ratio": 1.0,
40
+ }
41
+
42
+ convert_qwen2vl_model(pt_model_id, model_dir, compression_configuration)
43
+
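The compression_configuration above requests asymmetric INT4 weight compression from nncf with a group size of 32 and ratio 1.0, i.e. all eligible weight matrices go to 4 bit. If accuracy suffers, a more conservative setup can be tried; the sketch below is only an illustration and assumes convert_qwen2vl_model forwards these keys unchanged to nncf.compress_weights.

import nncf

# Hypothetical alternative: symmetric INT4, larger groups, and only 80% of the
# weights in 4 bit (the remainder stays in 8 bit), trading model size for accuracy.
compression_configuration = {
    "mode": nncf.CompressWeightsMode.INT4_SYM,
    "group_size": 64,
    "ratio": 0.8,
}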
qwen2vl.ipynb ADDED
The diff for this file is too large to render. See raw diff
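Since the notebook diff is not rendered, the following is only a hedged sketch of how the converted model could be exposed through the Gradio helper. It assumes that make_demo(model, processor) from gradio_helper.py returns a launchable Gradio app and that the model and processor are created as in the inference sketch above; adjust names and paths to your setup.

from pathlib import Path
from transformers import AutoProcessor
from ov_qwen2_vl import OVQwen2VLModel  # assumed export name
from gradio_helper import make_demo

model_dir = Path("Qwen2-VL-7B-Instruct")  # assumed output directory of qwen2-build.py
model = OVQwen2VLModel(model_dir, "AUTO")
processor = AutoProcessor.from_pretrained(model_dir)

demo = make_demo(model, processor)
demo.launch()  # assumed: make_demo returns a Gradio Blocks/Interface object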