Upload folder using huggingface_hub
- config.json +4 -3
- modeling_internlm2.py +182 -0
- modeling_internvl_chat.py +2 -3
config.json CHANGED

@@ -1,6 +1,6 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "
+  "_name_or_path": "./work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain3/checkpoint-1600_replace_llm",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -11,6 +11,7 @@
   "downsample_ratio": 0.5,
   "dynamic_image_size": true,
   "force_image_size": 448,
+  "image_fold": null,
   "llm_config": {
     "_name_or_path": "pretrained/internlm2-chat-20b/",
     "add_cross_attention": false,
@@ -100,7 +101,7 @@
     "use_cache": false,
     "vocab_size": 92553
   },
-  "max_dynamic_patch":
+  "max_dynamic_patch": 6,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
   "pad2square": false,
@@ -113,7 +114,7 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "
+    "_name_or_path": "work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain/checkpoint-5200-vit",
    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
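For orientation, the changed fields above control InternVL's dynamic tiling: an image is cut into up to max_dynamic_patch tiles of force_image_size x force_image_size pixels (plus a thumbnail tile when use_thumbnail is true), and each tile's ViT features are pixel-shuffled by downsample_ratio before entering the LLM. A minimal back-of-the-envelope sketch of the resulting visual-token budget, assuming the InternViT patch size of 14 (taken from the vision_config, which this hunk does not show):

    # Rough token-budget estimate for the updated config values (a sketch,
    # not code from this repository).
    force_image_size = 448
    patch_size = 14            # assumption: from vision_config, not shown in this diff
    downsample_ratio = 0.5
    max_dynamic_patch = 6
    use_thumbnail = True

    tokens_per_tile = int((force_image_size / patch_size * downsample_ratio) ** 2)  # 256
    max_tiles = max_dynamic_patch + (1 if use_thumbnail else 0)                     # 6 tiles + thumbnail
    print(tokens_per_tile, max_tiles * tokens_per_tile)                             # 256, 1792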
modeling_internlm2.py CHANGED

@@ -39,6 +39,20 @@ try:
     from transformers.generation.streamers import BaseStreamer
 except:  # noqa # pylint: disable=bare-except
     BaseStreamer = None
+from typing import Any, List, Optional, Tuple, Union
+import torch.distributed as dist
+import torch.utils.checkpoint
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.streamers import BaseStreamer
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+from transformers.generation.utils import GreedySearchOutput, validate_stopping_criteria, GreedySearchDecoderOnlyOutput, GreedySearchEncoderDecoderOutput
 
 from .configuration_internlm2 import InternLM2Config
 
@@ -1272,6 +1286,174 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
 
         return consumer()
 
+    def greedy_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,
+        **model_kwargs,
+    ) -> Union[GreedySearchOutput, torch.LongTensor]:
+        # init values
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if max_length is not None:
+            warnings.warn(
+                "`max_length` is deprecated in this function, use"
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+                UserWarning,
+            )
+            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+
+        this_peer_finished = False  # used by synced_gpus only
+        while True:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_tokens_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_tokens_scores,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # argmax
+            next_tokens = torch.argmax(next_tokens_scores, dim=-1).to(device=input_ids.device)
+            # finished sentences should have their next token be a padding token
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+
+            # if eos_token was found in one sentence, set sentence to finished
+            if eos_token_id_tensor is not None:
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+            # stop when each sentence is finished
+            if unfinished_sequences.max() == 0:
+                this_peer_finished = True
+
+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GreedySearchEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GreedySearchDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
 
 # Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
 @add_start_docstrings(
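The added greedy_search mirrors the stock GenerationMixin.greedy_search from the transformers releases this checkpoint targets, with next_tokens explicitly moved to input_ids.device so greedy decoding also works when the model is sharded across several GPUs. It is not called directly; generate() dispatches to it when sampling and beam search are disabled. A minimal usage sketch with a placeholder checkpoint id:

    # Hypothetical usage sketch: greedy decoding reaches the overridden
    # greedy_search through GenerationMixin.generate; the path is a placeholder.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = 'internlm/internlm2-chat-20b'  # placeholder checkpoint id
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        path, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map='auto')

    inputs = tokenizer('Hello', return_tensors='pt').to(model.device)
    # do_sample=False with num_beams=1 selects the greedy decoding path
    out = model.generate(**inputs, max_new_tokens=32, do_sample=False, num_beams=1)
    print(tokenizer.decode(out[0], skip_special_tokens=True))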
modeling_internvl_chat.py CHANGED

@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
-    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer'
+    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
@@ -337,7 +337,6 @@ class InternVLChatModel(PreTrainedModel):
                 vit_embeds = visual_features
             else:
                 vit_embeds = self.extract_feature(pixel_values)
-
             input_embeds = self.language_model.get_input_embeddings()(input_ids)
             B, N, C = input_embeds.shape
             input_embeds = input_embeds.reshape(B * N, C)
@@ -345,7 +344,7 @@ class InternVLChatModel(PreTrainedModel):
             input_ids = input_ids.reshape(B * N)
             selected = (input_ids == self.img_context_token_id)
             assert selected.sum() != 0
-            input_embeds[selected] = vit_embeds.reshape(-1, C)
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
 
             input_embeds = input_embeds.reshape(B, N, C)
         else:
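Both changes in this file support sharded loading: closing the _no_split_modules list tells accelerate which layers must stay whole on a single device, and the added .to(input_embeds.device) keeps the image-token scatter on the same GPU as the text embeddings when the vision tower and the language model land on different devices. A hypothetical loading sketch (the repository id is a placeholder, not confirmed by this diff):

    # Hypothetical multi-GPU loading sketch for this checkpoint.
    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        'OpenGVLab/InternVL-Chat-V1-2',   # placeholder id, assumption
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map='auto',                # shards vision tower and LLM across GPUs
    ).eval()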