Qwen
/

Qwen-VL

@@ -69,44 +69,7 @@ Pass argument `stream` to model.chat() is buggy, deprecated, and marked for remo
 apply_rotary_emb_func = None
 rms_norm = None
-flash_attn_unpadded_func = None
-def _import_flash_attn():
-    global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func
-    try:
-        from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func
-        apply_rotary_emb_func = __apply_rotary_emb_func
-    except ImportError:
-        logger.warn(
-            "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency "
-            "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary"
-        )
-    try:
-        from flash_attn.ops.rms_norm import rms_norm as __rms_norm
-        rms_norm = __rms_norm
-    except ImportError:
-        logger.warn(
-            "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency "
-            "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm"
-        )
-    try:
-        import flash_attn
-        if not hasattr(flash_attn, '__version__'):
-            from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func
-        else:
-            if int(flash_attn.__version__.split(".")[0]) >= 2:
-                from flash_attn.flash_attn_interface import flash_attn_varlen_func as __flash_attn_unpadded_func
-            else:
-                from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func
-        flash_attn_unpadded_func = __flash_attn_unpadded_func
-    except ImportError:
-        logger.warn(
-            "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency "
-            "https://github.com/Dao-AILab/flash-attention"
-        )
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
@@ -141,70 +104,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-class FlashSelfAttention(torch.nn.Module):
-    def __init__(
-        self,
-        causal=False,
-        softmax_scale=None,
-        attention_dropout=0.0,
-    ):
-        super().__init__()
-        assert flash_attn_unpadded_func is not None, (
-            "Please install FlashAttention first, " "e.g., with pip install flash-attn"
-        )
-        assert (
-            rearrange is not None
-        ), "Please install einops first, e.g., with pip install einops"
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-    def forward(self, q, k, v):
-        assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v)))
-        assert all((i.is_cuda for i in (q, k, v)))
-        batch_size, seqlen_q = q.shape[0], q.shape[1]
-        seqlen_k = k.shape[1]
-        q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
-        cu_seqlens_q = torch.arange(
-            0,
-            (batch_size + 1) * seqlen_q,
-            step=seqlen_q,
-            dtype=torch.int32,
-            device=q.device,
-        )
-        if self.training:
-            assert seqlen_k == seqlen_q
-            is_causal = self.causal
-            cu_seqlens_k = cu_seqlens_q
-        else:
-            is_causal = seqlen_q == seqlen_k
-            cu_seqlens_k = torch.arange(
-                0,
-                (batch_size + 1) * seqlen_k,
-                step=seqlen_k,
-                dtype=torch.int32,
-                device=q.device,
-            )
-            self.dropout_p = 0
-        output = flash_attn_unpadded_func(
-            q,
-            k,
-            v,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            seqlen_q,
-            seqlen_k,
-            self.dropout_p,
-            softmax_scale=self.softmax_scale,
-            causal=is_causal,
-        )
-        output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
-        return output
 class QWenAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -225,7 +124,6 @@ class QWenAttention(nn.Module):
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
-        self.use_flash_attn = config.use_flash_attn
         self.scale_attn_weights = True
         self.projection_size = config.kv_channels * config.num_attention_heads
@@ -242,15 +140,6 @@ class QWenAttention(nn.Module):
         )
         self.is_fp32 = not (config.bf16 or config.fp16)
-        if (
-            self.use_flash_attn
-            and flash_attn_unpadded_func is not None
-            and not self.is_fp32
-        ):
-            self.core_attention_flash = FlashSelfAttention(
-                causal=True, attention_dropout=config.attn_dropout_prob
-            )
         self.bf16 = config.bf16
         if config.rotary_pct == 1.0:
@@ -453,40 +342,20 @@ class QWenAttention(nn.Module):
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)
-        if (
-            self.use_flash_attn
-            and flash_attn_unpadded_func is not None
-            and not self.is_fp32
-            and query.is_cuda
-        ):
-            q, k, v = query, key, value
-            context_layer = self.core_attention_flash(q, k, v)
-            context_layer = rearrange(
-                context_layer, "b s h d -> b s (h d)"
-            ).contiguous()
-        else:
-            query = query.permute(0, 2, 1, 3)
-            key = key.permute(0, 2, 1, 3)
-            value = value.permute(0, 2, 1, 3)
-            attn_output, attn_weight = self._attn(
-                query, key, value, attention_mask, head_mask
-            )
-            context_layer = self._merge_heads(
-                attn_output, self.num_heads, self.head_dim
-            )
         attn_output = self.c_proj(context_layer)
         outputs = (attn_output, present)
         if output_attentions:
-            if (
-                self.use_flash_attn
-                and flash_attn_unpadded_func is not None
-                and not self.is_fp32
-            ):
-                raise ValueError("Cannot output attentions while using flash-attn")
-            else:
-                outputs += (attn_weight,)
         return outputs
@@ -882,18 +751,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
             elif SUPPORT_FP16:
                 logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
-        if config.use_flash_attn == "auto":
-            if config.bf16 or config.fp16:
-                logger.warn("Try importing flash-attention for faster inference...")
-                config.use_flash_attn = True
-            else:
-                config.use_flash_attn = False
-        if config.use_flash_attn and config.fp32:
-            logger.warn("Flash attention will be disabled because it does NOT support fp32.")
-        if config.use_flash_attn:
-            _import_flash_attn()
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

 apply_rotary_emb_func = None
 rms_norm = None
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 class QWenAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.scale_attn_weights = True
         self.projection_size = config.kv_channels * config.num_attention_heads
         )
         self.is_fp32 = not (config.bf16 or config.fp16)
         self.bf16 = config.bf16
         if config.rotary_pct == 1.0:
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)
+        query = query.permute(0, 2, 1, 3)
+        key = key.permute(0, 2, 1, 3)
+        value = value.permute(0, 2, 1, 3)
+        attn_output, attn_weight = self._attn(
+            query, key, value, attention_mask, head_mask
+        )
+        context_layer = self._merge_heads(
+            attn_output, self.num_heads, self.head_dim
+        )
         attn_output = self.c_proj(context_layer)
         outputs = (attn_output, present)
         if output_attentions:
+            outputs += (attn_weight,)
         return outputs
                 logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
             elif SUPPORT_FP16:
                 logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

tokenization_qwen.py CHANGED Viewed

@@ -10,7 +10,7 @@ import logging
 import os
 import requests
 import unicodedata
-from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable
 import tiktoken
 import numpy as np
@@ -359,6 +359,22 @@ class QWenTokenizer(PreTrainedTokenizer):
             _encode_vl_info,
         )
     def _fetch_latest_picture(self, response, history):
         if history is None:
             history = []
@@ -377,15 +393,19 @@ class QWenTokenizer(PreTrainedTokenizer):
                 bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
                 assert len(bbox) == 4
                 output.append({'box': bbox})
-                if i > 0 and 'ref' in list_format[i-1]:
-                    output[-1]['ref'] = list_format[i-1]['ref'].strip()
         return output
     def draw_bbox_on_latest_picture(
         self,
         response,
         history=None,
-    ):
         image = self._fetch_latest_picture(response, history)
         if image is None:
             return None
@@ -399,14 +419,14 @@ class QWenTokenizer(PreTrainedTokenizer):
         boxes = self._fetch_all_box_with_ref(response)
         if not boxes:
             return None
-        fnt = ImageFont.truetype("SimSun.ttf", 20)
         draw = ImageDraw.Draw(image)
         for box in boxes:
             x1, y1, x2, y2 = box['box']
             x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
-            draw.rectangle((x1, y1, x2, y2), outline='red', width=2)
             if 'ref' in box:
-                draw.text((x1, y1), box['ref'], fill='red', font=fnt)
         return image

 import os
 import requests
 import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
 import tiktoken
 import numpy as np
             _encode_vl_info,
         )
+    def from_list_format(self, list_format: List[Dict]):
+        """Serialize a list of multimodal elements into a single prompt string.
+
+        Each element is a dict, dispatched on the first key that matches:
+        ``'image'`` is wrapped in the image start/end tags, ``'text'`` is
+        appended verbatim, and ``'box'`` emits the optional ``'ref'`` wrapped
+        in the ref tags followed by one tagged ``(x1,y1),(x2,y2)`` entry per
+        box in the element.
+
+        Raises:
+            ValueError: if an element has none of the keys above.
+        """
+        prompt = ''
+        for item in list_format:
+            if 'image' in item:
+                prompt += self.image_start_tag + item['image'] + self.image_end_tag
+                continue
+            if 'text' in item:
+                prompt += item['text']
+                continue
+            if 'box' not in item:
+                raise ValueError("Unsupport element: " + str(item))
+            # A reference label, when present, precedes all of its boxes.
+            if 'ref' in item:
+                prompt += self.ref_start_tag + item['ref'] + self.ref_end_tag
+            for coords in item['box']:
+                prompt += self.box_start_tag + '(%d,%d),(%d,%d)' % (coords[0], coords[1], coords[2], coords[3]) + self.box_end_tag
+        return prompt
     def _fetch_latest_picture(self, response, history):
         if history is None:
             history = []
                 bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
                 assert len(bbox) == 4
                 output.append({'box': bbox})
+                ref_idx = i - 1
+                while ref_idx >= 0 and 'box' in list_format[ref_idx]:
+                    ref_idx -= 1
+                if ref_idx >= 0 and 'ref' in list_format[ref_idx]:
+                    output[-1]['ref'] = list_format[ref_idx]['ref'].strip()
         return output
     def draw_bbox_on_latest_picture(
         self,
         response,
         history=None,
+    ) -> Optional[Image.Image]:
         image = self._fetch_latest_picture(response, history)
         if image is None:
             return None
         boxes = self._fetch_all_box_with_ref(response)
         if not boxes:
             return None
+        fnt = ImageFont.truetype("SimSun.ttf", 50)
         draw = ImageDraw.Draw(image)
         for box in boxes:
             x1, y1, x2, y2 = box['box']
             x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
+            draw.rectangle((x1, y1, x2, y2), outline='red', width=4)
             if 'ref' in box:
+                draw.text((x1, y1), box['ref'], fill='yellow', font=fnt)
         return image

visual.py CHANGED Viewed

@@ -1,3 +1,8 @@
 from collections import OrderedDict
 import math
 import requests
@@ -5,11 +10,11 @@ from io import BytesIO
 from functools import partial
 from PIL import Image
 from typing import Callable, Optional, Sequence, Tuple, List
 import torch
 from torch import nn
 from torch.nn import functional as F
-from torch.utils.checkpoint import checkpoint
 from torch.nn.init import trunc_normal_
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
@@ -33,8 +38,64 @@ def get_abs_pos(abs_pos, tgt_size):
     else:
         return abs_pos
 class Resampler(nn.Module):
     def __init__(
             self,
             grid_size,
@@ -48,7 +109,9 @@ class Resampler(nn.Module):
         self.embed_dim = embed_dim
         self.num_heads = num_heads
-        self.pos_embed = nn.Parameter(torch.randn(embed_dim, grid_size)).requires_grad_(False)
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=.02)
@@ -234,7 +297,7 @@ class VisualAttentionBlock(nn.Module):
         return x
-class Transformer(nn.Module):
     def __init__(
             self,
             width: int,
@@ -247,7 +310,6 @@ class Transformer(nn.Module):
         super().__init__()
         self.width = width
         self.layers = layers
-        self.grad_checkpointing = False
         self.resblocks = nn.ModuleList([
             VisualAttentionBlock(
@@ -263,11 +325,7 @@ class Transformer(nn.Module):
     def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         for r in self.resblocks:
-            if self.grad_checkpointing and not torch.jit.is_scripting():
-                # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
-                x = checkpoint(r, x, None, None, attn_mask)
-            else:
-                x = r(x, attn_mask=attn_mask)
         return x
@@ -306,13 +364,13 @@ class VisionTransformer(nn.Module):
         # class embeddings and positional embeddings
         scale = width ** -0.5
-        self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1], width))
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
         act_layer = nn.GELU
         self.ln_pre = norm_layer(width)
-        self.transformer = Transformer(
             width,
             layers,
             heads,
@@ -331,10 +389,6 @@ class VisionTransformer(nn.Module):
         self.ln_post = norm_layer(output_dim)
         self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
-    @torch.jit.ignore
-    def set_grad_checkpointing(self, enable=True):
-        self.transformer.grad_checkpointing = enable
     def forward(self, x: torch.Tensor):
         x = x.to(
             dtype=self.transformer.get_cast_dtype(),
@@ -353,8 +407,7 @@ class VisionTransformer(nn.Module):
         x = self.transformer(x)
         x = x.permute(1, 0, 2)  # LND -> NLD
-        if self.attn_pool:
-            x = self.attn_pool(x)
         x = self.ln_post(x)
         x = x @ self.proj
@@ -365,8 +418,6 @@ class VisionTransformer(nn.Module):
         for image_path in image_paths:
             if image_path.startswith("http://") or image_path.startswith("https://"):
                 image = Image.open(requests.get(image_path, stream=True).raw)
-            elif image_path.startswith("oss://"):
-                raise NotImplementedError
             else:
                 image = Image.open(image_path)
             image = image.convert("RGB")

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
 from collections import OrderedDict
 import math
 import requests
 from functools import partial
 from PIL import Image
 from typing import Callable, Optional, Sequence, Tuple, List
+import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
 from torch.nn.init import trunc_normal_
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
     else:
         return abs_pos
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    Build a fixed 2D sine-cosine positional embedding table.
+
+    embed_dim: embedding width of each position
+    grid_size: int side length of a square grid, or a (height, width) pair
+    cls_token: when True, prepend one all-zero row to the table
+    return:
+    pos_embed: [H*W, embed_dim] or [1+H*W, embed_dim] (w/o or w/ cls_token)
+    """
+    # A scalar means a square grid; anything else is unpacked as (height, width).
+    if np.ndim(grid_size) == 0:
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode a two-channel coordinate grid as a 2D sine-cosine embedding.
+
+    embed_dim: output dimension for each position, must be even
+    grid: indexable holding two coordinate arrays; grid[0] fills the first
+          half of the channels and grid[1] the second half
+    out: (M, D), M being the number of positions in each coordinate array
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+    # each coordinate axis is encoded on its own with half of the channels
+    halves = [get_1d_sincos_pos_embed_from_grid(half_dim, grid[axis]) for axis in (0, 1)]
+    return np.concatenate(halves, axis=1)  # (M, D)
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Encode scalar positions as concatenated sine and cosine features.
+
+    embed_dim: output dimension for each position, must be even
+    pos: positions to be encoded, a list or array of any shape; flattened to (M,)
+    out: (M, D), sine features in [:, :D/2] and cosine features in [:, D/2:]
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+    # asarray lets plain lists/tuples through, as the docstring promises
+    pos = np.asarray(pos).reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out) # (M, D/2)
+    emb_cos = np.cos(out) # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
 class Resampler(nn.Module):
+    """
+    A 2D perceiver-resampler network with one cross-attention layer,
+        (grid_size**2) learnable queries and 2d sincos pos_emb
+    Outputs:
+        A tensor with the shape of (grid_size**2, embed_dim)
+    """
     def __init__(
             self,
             grid_size,
         self.embed_dim = embed_dim
         self.num_heads = num_heads
+        self.pos_embed = nn.Parameter(
+            torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
+        ).requires_grad_(False)
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=.02)
         return x
+class TransformerBlock(nn.Module):
     def __init__(
             self,
             width: int,
         super().__init__()
         self.width = width
         self.layers = layers
         self.resblocks = nn.ModuleList([
             VisualAttentionBlock(
     def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         for r in self.resblocks:
+            x = r(x, attn_mask=attn_mask)
         return x
         # class embeddings and positional embeddings
         scale = width ** -0.5
+        self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
         act_layer = nn.GELU
         self.ln_pre = norm_layer(width)
+        self.transformer = TransformerBlock(
             width,
             layers,
             heads,
         self.ln_post = norm_layer(output_dim)
         self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
     def forward(self, x: torch.Tensor):
         x = x.to(
             dtype=self.transformer.get_cast_dtype(),
         x = self.transformer(x)
         x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.attn_pool(x)
         x = self.ln_post(x)
         x = x @ self.proj
         for image_path in image_paths:
             if image_path.startswith("http://") or image_path.startswith("https://"):
                 image = Image.open(requests.get(image_path, stream=True).raw)
             else:
                 image = Image.open(image_path)
             image = image.convert("RGB")