Support `cpu` device type for generate

#26
by weege007 - opened
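This patch removes the hard-coded CUDA assumptions in modeling_GOT.py (`.cuda()` calls, `torch.autocast("cuda", ...)`, `device="cuda"` defaults) so that generation can also run on a CPU-only machine, and it additionally lets callers pass in their own `streamer`. Below is a minimal usage sketch of what the change is meant to enable; the repo id, dtype handling, and image path are illustrative assumptions, not part of the patch.

from transformers import AutoModel, AutoTokenizer

# Assumed checkpoint id for illustration; substitute the GOT-OCR2.0 repo you actually use.
MODEL_ID = 'stepfun-ai/GOT-OCR2_0'

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cpu',        # previously the remote code assumed a CUDA device
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.eval()

# chat() / chat_crop() now derive the device from self.model.device
# instead of calling .cuda() on the inputs.
result = model.chat(tokenizer, 'example.jpg', ocr_type='ocr')
print(result)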
Files changed (1)
  1. modeling_GOT.py +200 -134
modeling_GOT.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, StoppingCriteria, TextStreamer
2
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
3
  from typing import List, Optional, Tuple, Union
@@ -19,7 +20,7 @@ DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
19
  DEFAULT_IM_START_TOKEN = '<img>'
20
  DEFAULT_IM_END_TOKEN = '</img>'
21
 
22
- from enum import auto, Enum
23
  class SeparatorStyle(Enum):
24
  """Different separator style."""
25
  SINGLE = auto()
@@ -65,7 +66,7 @@ class Conversation:
65
  return ret
66
  if self.sep_style == SeparatorStyle.MPT:
67
  if self.system:
68
- ret = self.system + self.sep
69
  else:
70
  ret = ''
71
  for role, message in self.messages:
@@ -79,7 +80,6 @@ class Conversation:
79
  else:
80
  raise ValueError(f"Invalid style: {self.sep_style}")
81
 
82
-
83
  def append_message(self, role, message):
84
  self.messages.append([role, message])
85
 
@@ -94,12 +94,12 @@ class Conversation:
94
  sep2=self.sep2)
95
 
96
 
97
-
98
  class KeywordsStoppingCriteria(StoppingCriteria):
99
  def __init__(self, keywords, tokenizer, input_ids):
100
  self.keywords = keywords
101
  self.keyword_ids = [tokenizer(keyword).input_ids for keyword in keywords]
102
- self.keyword_ids = [keyword_id[0] for keyword_id in self.keyword_ids if type(keyword_id) is list and len(keyword_id) == 1]
103
  self.tokenizer = tokenizer
104
  self.start_len = None
105
  self.input_ids = input_ids
@@ -111,12 +111,13 @@ class KeywordsStoppingCriteria(StoppingCriteria):
111
  for keyword_id in self.keyword_ids:
112
  if output_ids[0, -1] == keyword_id:
113
  return True
114
- outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
115
  for keyword in self.keywords:
116
  if keyword in outputs:
117
  return True
118
  return False
119
-
120
 
121
  class GOTImageEvalProcessor:
122
  def __init__(self, image_size=384, mean=None, std=None):
@@ -136,11 +137,11 @@ class GOTImageEvalProcessor:
136
  self.normalize,
137
  ]
138
  )
 
139
  def __call__(self, item):
140
  return self.transform(item)
141
 
142
 
143
-
144
  class GOTConfig(Qwen2Config):
145
  model_type = "GOT"
146
 
@@ -153,28 +154,25 @@ class GOTQwenModel(Qwen2Model):
153
 
154
  self.vision_tower_high = build_GOT_vit_b()
155
 
156
- self.mm_projector_vary = nn.Linear(1024, 1024)
157
-
158
 
159
  def initialize_vision_modules(
160
- self,
161
  vision_tower,
162
  pretrained_stage1_model=None,
163
  freeze_vision_tower=False,
164
  use_im_start_end=False,
165
  vision_select_layer=-1,
166
  dtype=torch.float16,
167
- device="cuda"
168
  ):
169
 
170
-
171
  image_processor_high = GOTImageEvalProcessor(image_size=1024)
172
-
173
  self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
174
 
175
  self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
176
 
177
-
178
  image_token_len = 256
179
 
180
  self.config.vision_tower = vision_tower
@@ -184,13 +182,12 @@ class GOTQwenModel(Qwen2Model):
184
 
185
  self.config.vision_select_layer = vision_select_layer
186
  self.config.freeze_vision_tower = freeze_vision_tower
187
-
188
  return dict(
189
  image_processor_high=image_processor_high,
190
  image_token_len=image_token_len,
191
  )
192
-
193
-
194
  def forward(
195
  self,
196
  input_ids: torch.LongTensor = None,
@@ -209,16 +206,17 @@ class GOTQwenModel(Qwen2Model):
209
  orig_embeds_params = getattr(self, 'orig_embeds_params', None)
210
  if orig_embeds_params is not None:
211
  with torch.no_grad():
212
- self.get_input_embeddings().weight[:-self.num_new_tokens] = orig_embeds_params[:-self.num_new_tokens].data
213
 
214
  if inputs_embeds is None:
215
  inputs_embeds = self.embed_tokens(input_ids)
216
 
217
-
218
  vision_tower_high = getattr(self, 'vision_tower_high', None)
219
 
220
-
221
- if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
222
  use_im_start_end = getattr(self.config, "use_im_start_end", -1)
223
 
224
  vision_select_layer = getattr(self.config, "vision_select_layer", -1)
@@ -232,15 +230,15 @@ class GOTQwenModel(Qwen2Model):
232
  im_start_token = 151857
233
 
234
  im_end_token = 151858
235
-
236
  image_features = []
237
-
238
  for image in images:
239
  P, C, H, W = image.shape
240
  if P == 1:
241
  with torch.set_grad_enabled(False):
242
  cnn_feature = vision_tower_high(image)
243
- cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1) # 256*1024
244
  image_feature = self.mm_projector_vary(cnn_feature)
245
  image_features.append(image_feature)
246
 
@@ -249,7 +247,7 @@ class GOTQwenModel(Qwen2Model):
249
  image_patches_features = []
250
  for image_patch in image_patches:
251
  image_p = torch.stack([image_patch])
252
-
253
  with torch.set_grad_enabled(False):
254
  cnn_feature_p = vision_tower_high(image_p)
255
  cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
@@ -258,39 +256,44 @@ class GOTQwenModel(Qwen2Model):
258
  image_feature = torch.cat(image_patches_features, dim=1)
259
  image_features.append(image_feature)
260
 
261
-
262
- dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
263
  dummy_image_features = dummy_image_features_2
264
  use_im_start_end = True
265
  new_input_embeds = []
266
- for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
267
  if (cur_input_ids == im_patch_token).sum() == 0:
268
  cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
269
  new_input_embeds.append(cur_input_embeds)
270
  continue
271
 
272
  if use_im_start_end:
273
- if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
274
- raise ValueError("The number of image start tokens and image end tokens should be the same.")
275
-
276
  image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
277
- for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
278
- per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
279
  num_patches = per_cur_image_features.shape[0]
280
 
281
  if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
282
- raise ValueError("The image end token should follow the image start token.")
283
-
 
284
  cur_input_embeds = torch.cat(
285
  (
286
- cur_input_embeds[:image_start_token_pos+1],
287
- per_cur_image_features,
288
  cur_input_embeds[image_start_token_pos + num_patches + 1:]
289
- ),
290
  dim=0
291
  )
292
 
293
-
294
  new_input_embeds.append(cur_input_embeds)
295
  else:
296
  raise NotImplementedError
@@ -299,13 +302,12 @@ class GOTQwenModel(Qwen2Model):
299
 
300
  return super(GOTQwenModel, self).forward(
301
  input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
302
- inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids = position_ids,
303
  output_attentions=output_attentions, output_hidden_states=output_hidden_states,
304
  return_dict=return_dict
305
  )
306
 
307
 
308
-
309
  class GOTQwenForCausalLM(Qwen2ForCausalLM):
310
  config_class = GOTConfig
311
  # supports_gradient_checkpointing = True
@@ -336,15 +338,14 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
336
  output_hidden_states: Optional[bool] = None,
337
  images: Optional[torch.FloatTensor] = None,
338
  return_dict: Optional[bool] = None,
339
-
340
  ) -> Union[Tuple, CausalLMOutputWithPast]:
341
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
342
  output_hidden_states = (
343
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
344
- )
345
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
346
 
347
- outputs = self.model(
348
  input_ids=input_ids,
349
  past_key_values=past_key_values,
350
  attention_mask=attention_mask,
@@ -355,7 +356,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
355
  output_hidden_states=output_hidden_states,
356
  images=images,
357
  return_dict=return_dict
358
-
359
  )
360
 
361
  hidden_states = outputs[0]
@@ -389,7 +390,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
389
  attentions=outputs.attentions,
390
  )
391
 
392
-
393
  def prepare_inputs_for_generation(
394
  self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
395
  ):
@@ -408,14 +408,16 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
408
  # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
409
  # input)
410
  if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
411
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
412
  # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
413
  # input_ids based on the past_length.
414
  elif past_length < input_ids.shape[1]:
415
  input_ids = input_ids[:, past_length:]
416
- # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
417
 
418
- # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
419
  if (
420
  max_cache_length is not None
421
  and attention_mask is not None
@@ -429,7 +431,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
429
  position_ids = attention_mask.long().cumsum(-1) - 1
430
  position_ids.masked_fill_(attention_mask == 0, 1)
431
  if past_key_values:
432
- position_ids = position_ids[:, -input_ids.shape[1] :]
433
 
434
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
435
  if inputs_embeds is not None and past_key_values is None:
@@ -449,15 +451,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
449
  return model_inputs
450
 
451
  def initialize_vision_tokenizer(
452
- self,
453
- tokenizer,
454
- freeze_lm_model=False,
455
  pretrained_stage1_model=None,
456
- device="cuda"
457
  ):
458
  config = self.get_model().config
459
 
460
-
461
  self.resize_token_embeddings(len(tokenizer))
462
 
463
  config.im_patch_token = 151859
@@ -484,12 +484,23 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
484
  setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
485
  setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
486
 
487
- def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
488
 
489
  self.disable_torch_init()
490
 
491
-
492
- image_processor_high = GOTImageEvalProcessor(image_size=1024)
493
 
494
  use_im_start_end = True
495
 
@@ -501,7 +512,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
501
  image = self.load_image(image_file)
502
 
503
  w, h = image.size
504
-
505
  if ocr_type == 'format':
506
  qs = 'OCR with format: '
507
  else:
@@ -510,13 +521,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
510
  if ocr_box:
511
  bbox = eval(ocr_box)
512
  if len(bbox) == 2:
513
- bbox[0] = int(bbox[0]/w*1000)
514
- bbox[1] = int(bbox[1]/h*1000)
515
  if len(bbox) == 4:
516
- bbox[0] = int(bbox[0]/w*1000)
517
- bbox[1] = int(bbox[1]/h*1000)
518
- bbox[2] = int(bbox[2]/w*1000)
519
- bbox[3] = int(bbox[3]/h*1000)
520
  if ocr_type == 'format':
521
  qs = str(bbox) + ' ' + 'OCR with format: '
522
  else:
@@ -529,11 +540,11 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
529
  qs = '[' + ocr_color + ']' + ' ' + 'OCR: '
530
 
531
  if use_im_start_end:
532
- qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
533
  else:
534
  qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
535
 
536
-
537
  conv_mpt = Conversation(
538
  system="""<|im_start|>system
539
  You should follow the instructions carefully and explain your answers in detail.""",
@@ -558,40 +569,42 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
558
 
559
  image_tensor_1 = image_processor_high(image)
560
 
561
- input_ids = torch.as_tensor(inputs.input_ids).cuda()
562
 
563
  stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
564
  keywords = [stop_str]
565
  stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
566
- streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
567
 
 
568
  if stream_flag:
569
- with torch.autocast("cuda", dtype=torch.bfloat16):
570
  output_ids = self.generate(
571
  input_ids,
572
- images=[image_tensor_1.unsqueeze(0).half().cuda()],
573
  do_sample=False,
574
- num_beams = 1,
575
- no_repeat_ngram_size = 20,
576
  streamer=streamer,
577
  max_new_tokens=4096,
578
  stopping_criteria=[stopping_criteria]
579
- )
580
  else:
581
- with torch.autocast("cuda", dtype=torch.bfloat16):
582
  output_ids = self.generate(
583
  input_ids,
584
- images=[image_tensor_1.unsqueeze(0).half().cuda()],
585
  do_sample=False,
586
- num_beams = 1,
587
- no_repeat_ngram_size = 20,
588
  # streamer=streamer,
589
  max_new_tokens=4096,
590
  stopping_criteria=[stopping_criteria]
591
- )
592
-
593
  outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
594
-
595
  if outputs.endswith(stop_str):
596
  outputs = outputs[:-len(stop_str)]
597
  outputs = outputs.strip()
@@ -606,8 +619,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
606
  tk = verovio.toolkit()
607
  tk.loadData(outputs)
608
  tk.setOptions({"pageWidth": 2100, "footer": 'none',
609
- 'barLineWidth': 0.5, 'beamMaxSlope': 15,
610
- 'staffLineWidth': 0.2, 'spacingStaff': 6})
611
  tk.getPageCount()
612
  svg = tk.renderToSVG()
613
  svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
@@ -616,35 +629,52 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
616
 
617
  if ocr_type == 'format' and '**kern' not in outputs:
618
 
619
-
620
- if '\\begin{tikzpicture}' not in outputs:
621
  html_path_2 = save_render_file
622
  right_num = outputs.count('\\right')
623
- left_num = outputs.count('\left')
624
 
625
  if right_num != left_num:
626
- outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
627
-
628
 
629
  outputs = outputs.replace('"', '``').replace('$', '')
630
 
631
  outputs_list = outputs.split('\n')
632
- gt= ''
633
  for out in outputs_list:
634
- gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
635
-
636
- gt = gt[:-2]
637
 
 
638
 
639
  lines = content_mmd_to_html
640
  lines = lines.split("const text =")
641
- new_web = lines[0] + 'const text =' + gt + lines[1]
642
 
643
  else:
644
  html_path_2 = save_render_file
645
  outputs = outputs.translate(translation_table)
646
  outputs_list = outputs.split('\n')
647
- gt= ''
648
  for out in outputs_list:
649
  if out:
650
  if '\\begin{tikzpicture}' not in out and '\\end{tikzpicture}' not in out:
@@ -652,7 +682,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
652
  out = out[:-1]
653
  if out is None:
654
  break
655
-
656
  if out:
657
  if out[-1] != ';':
658
  gt += out[:-1] + ';\n'
@@ -661,7 +691,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
661
  else:
662
  gt += out + '\n'
663
 
664
-
665
  lines = tik_html
666
  lines = lines.split("const text =")
667
  new_web = lines[0] + gt + lines[1]
@@ -671,7 +700,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
671
  return response_str
672
 
673
  def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
674
-
675
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
676
  best_ratio_diff = float('inf')
677
  best_ratio = (1, 1)
@@ -687,14 +716,25 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
687
  best_ratio = ratio
688
  # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
689
  return best_ratio
690
-
691
  orig_width, orig_height = image.size
692
  aspect_ratio = orig_width / orig_height
693
 
694
  # calculate the existing image aspect ratio
695
  target_ratios = set(
696
- (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
697
- i * j <= max_num and i * j >= min_num)
698
  # print(target_ratios)
699
  target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
700
 
@@ -727,18 +767,25 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
727
  processed_images.append(thumbnail_img)
728
  return processed_images
729
 
730
-
731
- def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
732
  # Model
733
  self.disable_torch_init()
734
- multi_page=False
735
 
736
-
737
- image_processor_high = GOTImageEvalProcessor(image_size=1024)
738
 
739
  use_im_start_end = True
740
 
741
-
742
  image_token_len = 256
743
 
744
  image_list = []
@@ -778,18 +825,16 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
778
  image_tensor_1 = image_processor_high(image)
779
  image_list.append(image_tensor_1)
780
 
781
-
782
  image_list = torch.stack(image_list)
783
 
784
- print('====new images batch size======: \n',image_list.shape)
785
-
786
 
787
  if use_im_start_end:
788
- qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
789
  else:
790
  qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
791
 
792
-
793
  conv_mpt = Conversation(
794
  system="""<|im_start|>system
795
  You should follow the instructions carefully and explain your answers in detail.""",
@@ -812,43 +857,45 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
812
 
813
  inputs = tokenizer([prompt])
814
 
815
- input_ids = torch.as_tensor(inputs.input_ids).cuda()
816
 
817
  stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
818
  keywords = [stop_str]
819
  stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
820
- streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
821
 
 
822
  if stream_flag:
823
- with torch.autocast("cuda", dtype=torch.bfloat16):
824
  output_ids = self.generate(
825
  input_ids,
826
- images=[image_list.half().cuda()],
827
  do_sample=False,
828
- num_beams = 1,
829
  # no_repeat_ngram_size = 20,
830
  streamer=streamer,
831
  max_new_tokens=4096,
832
  stopping_criteria=[stopping_criteria]
833
- )
834
  else:
835
- with torch.autocast("cuda", dtype=torch.bfloat16):
836
  output_ids = self.generate(
837
  input_ids,
838
- images=[image_list.half().cuda()],
839
  do_sample=False,
840
- num_beams = 1,
841
  # no_repeat_ngram_size = 20,
842
  # streamer=streamer,
843
  max_new_tokens=4096,
844
  stopping_criteria=[stopping_criteria]
845
- )
846
 
847
  outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
848
-
849
  if outputs.endswith(stop_str):
850
  outputs = outputs[:-len(stop_str)]
851
- outputs = outputs.strip()
852
  response_str = outputs
853
 
854
  if render:
@@ -856,26 +903,45 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
856
  from .render_tools import content_mmd_to_html
857
  html_path_2 = save_render_file
858
  right_num = outputs.count('\\right')
859
- left_num = outputs.count('\left')
860
 
861
  if right_num != left_num:
862
- outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
863
-
864
 
865
  outputs = outputs.replace('"', '``').replace('$', '')
866
 
867
  outputs_list = outputs.split('\n')
868
- gt= ''
869
  for out in outputs_list:
870
- gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
871
-
872
  gt = gt[:-2]
873
 
874
  lines = content_mmd_to_html
875
  lines = lines.split("const text =")
876
- new_web = lines[0] + 'const text =' + gt + lines[1]
877
-
878
  with open(html_path_2, 'w') as web_f_new:
879
  web_f_new.write(new_web)
880
 
881
- return response_str
 
1
+ from enum import auto, Enum
2
  from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, StoppingCriteria, TextStreamer
3
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
4
  from typing import List, Optional, Tuple, Union
 
20
  DEFAULT_IM_START_TOKEN = '<img>'
21
  DEFAULT_IM_END_TOKEN = '</img>'
22
 
23
+
24
  class SeparatorStyle(Enum):
25
  """Different separator style."""
26
  SINGLE = auto()
 
66
  return ret
67
  if self.sep_style == SeparatorStyle.MPT:
68
  if self.system:
69
+ ret = self.system + self.sep
70
  else:
71
  ret = ''
72
  for role, message in self.messages:
 
80
  else:
81
  raise ValueError(f"Invalid style: {self.sep_style}")
82
 
 
83
  def append_message(self, role, message):
84
  self.messages.append([role, message])
85
 
 
94
  sep2=self.sep2)
95
 
96
 
 
97
  class KeywordsStoppingCriteria(StoppingCriteria):
98
  def __init__(self, keywords, tokenizer, input_ids):
99
  self.keywords = keywords
100
  self.keyword_ids = [tokenizer(keyword).input_ids for keyword in keywords]
101
+ self.keyword_ids = [keyword_id[0] for keyword_id in self.keyword_ids if type(
102
+ keyword_id) is list and len(keyword_id) == 1]
103
  self.tokenizer = tokenizer
104
  self.start_len = None
105
  self.input_ids = input_ids
 
111
  for keyword_id in self.keyword_ids:
112
  if output_ids[0, -1] == keyword_id:
113
  return True
114
+ outputs = self.tokenizer.batch_decode(
115
+ output_ids[:, self.start_len:], skip_special_tokens=True)[0]
116
  for keyword in self.keywords:
117
  if keyword in outputs:
118
  return True
119
  return False
120
+
121
 
122
  class GOTImageEvalProcessor:
123
  def __init__(self, image_size=384, mean=None, std=None):
 
137
  self.normalize,
138
  ]
139
  )
140
+
141
  def __call__(self, item):
142
  return self.transform(item)
143
 
144
 
 
145
  class GOTConfig(Qwen2Config):
146
  model_type = "GOT"
147
 
 
154
 
155
  self.vision_tower_high = build_GOT_vit_b()
156
 
157
+ self.mm_projector_vary = nn.Linear(1024, 1024)
 
158
 
159
  def initialize_vision_modules(
160
+ self,
161
  vision_tower,
162
  pretrained_stage1_model=None,
163
  freeze_vision_tower=False,
164
  use_im_start_end=False,
165
  vision_select_layer=-1,
166
  dtype=torch.float16,
 
167
  ):
168
 
169
+ device = self.device
170
  image_processor_high = GOTImageEvalProcessor(image_size=1024)
171
+
172
  self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
173
 
174
  self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
175
 
 
176
  image_token_len = 256
177
 
178
  self.config.vision_tower = vision_tower
 
182
 
183
  self.config.vision_select_layer = vision_select_layer
184
  self.config.freeze_vision_tower = freeze_vision_tower
185
+
186
  return dict(
187
  image_processor_high=image_processor_high,
188
  image_token_len=image_token_len,
189
  )
190
+
 
191
  def forward(
192
  self,
193
  input_ids: torch.LongTensor = None,
 
206
  orig_embeds_params = getattr(self, 'orig_embeds_params', None)
207
  if orig_embeds_params is not None:
208
  with torch.no_grad():
209
+ self.get_input_embeddings().weight[:-
210
+ self.num_new_tokens] = orig_embeds_params[:-
211
+ self.num_new_tokens].data
212
 
213
  if inputs_embeds is None:
214
  inputs_embeds = self.embed_tokens(input_ids)
215
 
 
216
  vision_tower_high = getattr(self, 'vision_tower_high', None)
217
 
218
+ if vision_tower_high is not None and (
219
+ input_ids.shape[1] != 1 or self.training) and images is not None:
220
  use_im_start_end = getattr(self.config, "use_im_start_end", -1)
221
 
222
  vision_select_layer = getattr(self.config, "vision_select_layer", -1)
 
230
  im_start_token = 151857
231
 
232
  im_end_token = 151858
233
+
234
  image_features = []
235
+
236
  for image in images:
237
  P, C, H, W = image.shape
238
  if P == 1:
239
  with torch.set_grad_enabled(False):
240
  cnn_feature = vision_tower_high(image)
241
+ cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1) # 256*1024
242
  image_feature = self.mm_projector_vary(cnn_feature)
243
  image_features.append(image_feature)
244
 
 
247
  image_patches_features = []
248
  for image_patch in image_patches:
249
  image_p = torch.stack([image_patch])
250
+
251
  with torch.set_grad_enabled(False):
252
  cnn_feature_p = vision_tower_high(image_p)
253
  cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
 
256
  image_feature = torch.cat(image_patches_features, dim=1)
257
  image_features.append(image_feature)
258
 
259
+ dummy_image_features_2 = torch.zeros(
260
+ 256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
261
  dummy_image_features = dummy_image_features_2
262
  use_im_start_end = True
263
  new_input_embeds = []
264
+ for cur_input_ids, cur_input_embeds, cur_image_features in zip(
265
+ input_ids, inputs_embeds, image_features):
266
  if (cur_input_ids == im_patch_token).sum() == 0:
267
  cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
268
  new_input_embeds.append(cur_input_embeds)
269
  continue
270
 
271
  if use_im_start_end:
272
+ if (cur_input_ids == im_start_token).sum() != (
273
+ cur_input_ids == im_end_token).sum():
274
+ raise ValueError(
275
+ "The number of image start tokens and image end tokens should be the same.")
276
+
277
  image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
278
+ for image_start_token_pos, per_cur_image_features in zip(
279
+ image_start_tokens, cur_image_features):
280
+ per_cur_image_features = per_cur_image_features.to(
281
+ device=cur_input_embeds.device)
282
  num_patches = per_cur_image_features.shape[0]
283
 
284
  if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
285
+ raise ValueError(
286
+ "The image end token should follow the image start token.")
287
+
288
  cur_input_embeds = torch.cat(
289
  (
290
+ cur_input_embeds[:image_start_token_pos + 1],
291
+ per_cur_image_features,
292
  cur_input_embeds[image_start_token_pos + num_patches + 1:]
293
+ ),
294
  dim=0
295
  )
296
 
 
297
  new_input_embeds.append(cur_input_embeds)
298
  else:
299
  raise NotImplementedError
 
302
 
303
  return super(GOTQwenModel, self).forward(
304
  input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
305
+ inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids=position_ids,
306
  output_attentions=output_attentions, output_hidden_states=output_hidden_states,
307
  return_dict=return_dict
308
  )
309
 
310
 
 
311
  class GOTQwenForCausalLM(Qwen2ForCausalLM):
312
  config_class = GOTConfig
313
  # supports_gradient_checkpointing = True
 
338
  output_hidden_states: Optional[bool] = None,
339
  images: Optional[torch.FloatTensor] = None,
340
  return_dict: Optional[bool] = None,
341
+
342
  ) -> Union[Tuple, CausalLMOutputWithPast]:
343
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
344
  output_hidden_states = (
345
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
346
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
347
 
348
+ outputs = self.model(
349
  input_ids=input_ids,
350
  past_key_values=past_key_values,
351
  attention_mask=attention_mask,
 
356
  output_hidden_states=output_hidden_states,
357
  images=images,
358
  return_dict=return_dict
359
+
360
  )
361
 
362
  hidden_states = outputs[0]
 
390
  attentions=outputs.attentions,
391
  )
392
 
 
393
  def prepare_inputs_for_generation(
394
  self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
395
  ):
 
408
  # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
409
  # input)
410
  if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
411
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
412
  # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
413
  # input_ids based on the past_length.
414
  elif past_length < input_ids.shape[1]:
415
  input_ids = input_ids[:, past_length:]
416
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume
417
+ # input_ids only has unprocessed tokens.
418
 
419
+ # If we are about to go beyond the maximum cache length, we need to crop
420
+ # the input attention mask.
421
  if (
422
  max_cache_length is not None
423
  and attention_mask is not None
 
431
  position_ids = attention_mask.long().cumsum(-1) - 1
432
  position_ids.masked_fill_(attention_mask == 0, 1)
433
  if past_key_values:
434
+ position_ids = position_ids[:, -input_ids.shape[1]:]
435
 
436
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
437
  if inputs_embeds is not None and past_key_values is None:
 
451
  return model_inputs
452
 
453
  def initialize_vision_tokenizer(
454
+ self,
455
+ tokenizer,
456
+ freeze_lm_model=False,
457
  pretrained_stage1_model=None,
 
458
  ):
459
  config = self.get_model().config
460
 
 
461
  self.resize_token_embeddings(len(tokenizer))
462
 
463
  config.im_patch_token = 151859
 
484
  setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
485
  setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
486
 
487
+ def chat(
488
+ self,
489
+ tokenizer,
490
+ image_file,
491
+ ocr_type,
492
+ ocr_box='',
493
+ ocr_color='',
494
+ render=False,
495
+ save_render_file=None,
496
+ print_prompt=False,
497
+ gradio_input=False,
498
+ stream_flag=False,
499
+ streamer=None):
500
 
501
  self.disable_torch_init()
502
 
503
+ image_processor_high = GOTImageEvalProcessor(image_size=1024)
 
504
 
505
  use_im_start_end = True
506
 
 
512
  image = self.load_image(image_file)
513
 
514
  w, h = image.size
515
+
516
  if ocr_type == 'format':
517
  qs = 'OCR with format: '
518
  else:
 
521
  if ocr_box:
522
  bbox = eval(ocr_box)
523
  if len(bbox) == 2:
524
+ bbox[0] = int(bbox[0] / w * 1000)
525
+ bbox[1] = int(bbox[1] / h * 1000)
526
  if len(bbox) == 4:
527
+ bbox[0] = int(bbox[0] / w * 1000)
528
+ bbox[1] = int(bbox[1] / h * 1000)
529
+ bbox[2] = int(bbox[2] / w * 1000)
530
+ bbox[3] = int(bbox[3] / h * 1000)
531
  if ocr_type == 'format':
532
  qs = str(bbox) + ' ' + 'OCR with format: '
533
  else:
 
540
  qs = '[' + ocr_color + ']' + ' ' + 'OCR: '
541
 
542
  if use_im_start_end:
543
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * \
544
+ image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
545
  else:
546
  qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
547
 
 
548
  conv_mpt = Conversation(
549
  system="""<|im_start|>system
550
  You should follow the instructions carefully and explain your answers in detail.""",
 
569
 
570
  image_tensor_1 = image_processor_high(image)
571
 
572
+ input_ids = torch.as_tensor(inputs.input_ids).to(self.model.device)
573
 
574
  stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
575
  keywords = [stop_str]
576
  stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
577
+ streamer = streamer if streamer else TextStreamer(
578
+ tokenizer, skip_prompt=True, skip_special_tokens=True)
579
 
580
+ device = "cuda" if "cuda" in str(self.model.device) else "cpu"
581
  if stream_flag:
582
+ with torch.autocast(device, dtype=torch.bfloat16):
583
  output_ids = self.generate(
584
  input_ids,
585
+ images=[image_tensor_1.unsqueeze(0).half().to(self.model.device)],
586
  do_sample=False,
587
+ num_beams=1,
588
+ no_repeat_ngram_size=20,
589
  streamer=streamer,
590
  max_new_tokens=4096,
591
  stopping_criteria=[stopping_criteria]
592
+ )
593
  else:
594
+ with torch.autocast(device, dtype=torch.bfloat16):
595
  output_ids = self.generate(
596
  input_ids,
597
+ images=[image_tensor_1.unsqueeze(0).half().to(self.model.device)],
598
  do_sample=False,
599
+ num_beams=1,
600
+ no_repeat_ngram_size=20,
601
  # streamer=streamer,
602
  max_new_tokens=4096,
603
  stopping_criteria=[stopping_criteria]
604
+ )
605
+
606
  outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
607
+
608
  if outputs.endswith(stop_str):
609
  outputs = outputs[:-len(stop_str)]
610
  outputs = outputs.strip()
 
619
  tk = verovio.toolkit()
620
  tk.loadData(outputs)
621
  tk.setOptions({"pageWidth": 2100, "footer": 'none',
622
+ 'barLineWidth': 0.5, 'beamMaxSlope': 15,
623
+ 'staffLineWidth': 0.2, 'spacingStaff': 6})
624
  tk.getPageCount()
625
  svg = tk.renderToSVG()
626
  svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
 
629
 
630
  if ocr_type == 'format' and '**kern' not in outputs:
631
 
632
+ if '\\begin{tikzpicture}' not in outputs:
 
633
  html_path_2 = save_render_file
634
  right_num = outputs.count('\\right')
635
+ left_num = outputs.count('\\left')
636
 
637
  if right_num != left_num:
638
+ outputs = outputs.replace(
639
+ '\\left(',
640
+ '(').replace(
641
+ '\\right)',
642
+ ')').replace(
643
+ '\\left[',
644
+ '[').replace(
645
+ '\\right]',
646
+ ']').replace(
647
+ '\\left{',
648
+ '{').replace(
649
+ '\\right}',
650
+ '}').replace(
651
+ '\\left|',
652
+ '|').replace(
653
+ '\\right|',
654
+ '|').replace(
655
+ '\\left.',
656
+ '.').replace(
657
+ '\\right.',
658
+ '.')
659
 
660
  outputs = outputs.replace('"', '``').replace('$', '')
661
 
662
  outputs_list = outputs.split('\n')
663
+ gt = ''
664
  for out in outputs_list:
665
+ gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
666
 
667
+ gt = gt[:-2]
668
 
669
  lines = content_mmd_to_html
670
  lines = lines.split("const text =")
671
+ new_web = lines[0] + 'const text =' + gt + lines[1]
672
 
673
  else:
674
  html_path_2 = save_render_file
675
  outputs = outputs.translate(translation_table)
676
  outputs_list = outputs.split('\n')
677
+ gt = ''
678
  for out in outputs_list:
679
  if out:
680
  if '\\begin{tikzpicture}' not in out and '\\end{tikzpicture}' not in out:
 
682
  out = out[:-1]
683
  if out is None:
684
  break
685
+
686
  if out:
687
  if out[-1] != ';':
688
  gt += out[:-1] + ';\n'
 
691
  else:
692
  gt += out + '\n'
693
 
 
694
  lines = tik_html
695
  lines = lines.split("const text =")
696
  new_web = lines[0] + gt + lines[1]
 
700
  return response_str
701
 
702
  def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
703
+
704
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
705
  best_ratio_diff = float('inf')
706
  best_ratio = (1, 1)
 
716
  best_ratio = ratio
717
  # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
718
  return best_ratio
719
+
720
  orig_width, orig_height = image.size
721
  aspect_ratio = orig_width / orig_height
722
 
723
  # calculate the existing image aspect ratio
724
  target_ratios = set(
725
+ (i,
726
+ j) for n in range(
727
+ min_num,
728
+ max_num +
729
+ 1) for i in range(
730
+ 1,
731
+ n +
732
+ 1) for j in range(
733
+ 1,
734
+ n +
735
+ 1) if i *
736
+ j <= max_num and i *
737
+ j >= min_num)
738
  # print(target_ratios)
739
  target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
740
 
 
767
  processed_images.append(thumbnail_img)
768
  return processed_images
769
 
770
+ def chat_crop(
771
+ self,
772
+ tokenizer,
773
+ image_file,
774
+ ocr_type,
775
+ render=False,
776
+ save_render_file=None,
777
+ print_prompt=False,
778
+ gradio_input=False,
779
+ stream_flag=False,
780
+ streamer=None):
781
  # Model
782
  self.disable_torch_init()
783
+ multi_page = False
784
 
785
+ image_processor_high = GOTImageEvalProcessor(image_size=1024)
 
786
 
787
  use_im_start_end = True
788
 
 
789
  image_token_len = 256
790
 
791
  image_list = []
 
825
  image_tensor_1 = image_processor_high(image)
826
  image_list.append(image_tensor_1)
827
 
 
828
  image_list = torch.stack(image_list)
829
 
830
+ print('====new images batch size======: \n', image_list.shape)
 
831
 
832
  if use_im_start_end:
833
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * \
834
+ image_token_len * ll + DEFAULT_IM_END_TOKEN + '\n' + qs
835
  else:
836
  qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
837
 
 
838
  conv_mpt = Conversation(
839
  system="""<|im_start|>system
840
  You should follow the instructions carefully and explain your answers in detail.""",
 
857
 
858
  inputs = tokenizer([prompt])
859
 
860
+ input_ids = torch.as_tensor(inputs.input_ids).to(self.model.device)
861
 
862
  stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
863
  keywords = [stop_str]
864
  stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
865
+ streamer = streamer if streamer else TextStreamer(
866
+ tokenizer, skip_prompt=True, skip_special_tokens=True)
867
 
868
+ device = "cuda" if "cuda" in str(self.model.device) else "cpu"
869
  if stream_flag:
870
+ with torch.autocast(device, dtype=torch.bfloat16):
871
  output_ids = self.generate(
872
  input_ids,
873
+ images=[image_list.half().to(self.model.device)],
874
  do_sample=False,
875
+ num_beams=1,
876
  # no_repeat_ngram_size = 20,
877
  streamer=streamer,
878
  max_new_tokens=4096,
879
  stopping_criteria=[stopping_criteria]
880
+ )
881
  else:
882
+ with torch.autocast(device, dtype=torch.bfloat16):
883
  output_ids = self.generate(
884
  input_ids,
885
+ images=[image_list.half().to(self.model.device)],
886
  do_sample=False,
887
+ num_beams=1,
888
  # no_repeat_ngram_size = 20,
889
  # streamer=streamer,
890
  max_new_tokens=4096,
891
  stopping_criteria=[stopping_criteria]
892
+ )
893
 
894
  outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
895
+
896
  if outputs.endswith(stop_str):
897
  outputs = outputs[:-len(stop_str)]
898
+ outputs = outputs.strip()
899
  response_str = outputs
900
 
901
  if render:
 
903
  from .render_tools import content_mmd_to_html
904
  html_path_2 = save_render_file
905
  right_num = outputs.count('\\right')
906
+ left_num = outputs.count('\\left')
907
 
908
  if right_num != left_num:
909
+ outputs = outputs.replace(
910
+ '\\left(',
911
+ '(').replace(
912
+ '\\right)',
913
+ ')').replace(
914
+ '\\left[',
915
+ '[').replace(
916
+ '\\right]',
917
+ ']').replace(
918
+ '\\left{',
919
+ '{').replace(
920
+ '\\right}',
921
+ '}').replace(
922
+ '\\left|',
923
+ '|').replace(
924
+ '\\right|',
925
+ '|').replace(
926
+ '\\left.',
927
+ '.').replace(
928
+ '\\right.',
929
+ '.')
930
 
931
  outputs = outputs.replace('"', '``').replace('$', '')
932
 
933
  outputs_list = outputs.split('\n')
934
+ gt = ''
935
  for out in outputs_list:
936
+ gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
937
+
938
  gt = gt[:-2]
939
 
940
  lines = content_mmd_to_html
941
  lines = lines.split("const text =")
942
+ new_web = lines[0] + 'const text =' + gt + lines[1]
943
+
944
  with open(html_path_2, 'w') as web_f_new:
945
  web_f_new.write(new_web)
946
 
947
+ return response_str
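
The recurring pattern in the new chat() and chat_crop() bodies is: resolve the device from wherever the model weights actually live, move tensors with .to(...) instead of .cuda(), and pass that device string to torch.autocast. A standalone sketch of that idiom follows; the helper name and the commented call site are illustrative, not a drop-in replacement for the methods above.

import torch

def resolve_device(module: torch.nn.Module) -> str:
    # Mirrors the diff's `"cuda" if "cuda" in str(self.model.device) else "cpu"` check,
    # written against an arbitrary nn.Module for illustration.
    return "cuda" if "cuda" in str(next(module.parameters()).device) else "cpu"

# Hypothetical call site, analogous to the patched generation block:
# device = resolve_device(model)
# input_ids = input_ids.to(model.device)
# with torch.autocast(device, dtype=torch.bfloat16):
#     output_ids = model.generate(input_ids, max_new_tokens=4096)

Resolving the device at call time keeps the remote code working for both `device_map='cpu'` and `device_map='cuda'` loads without adding a new argument to chat() or chat_crop().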