KoichiYasuoka committed
Commit e2024dd
1 Parent(s): e62a0a0

model improved

config.json CHANGED
@@ -372,7 +372,7 @@
   "summary_use_proj": true,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "torch_dtype": "float32",
- "transformers_version": "4.39.3",
+ "transformers_version": "4.44.2",
   "use_cache": true,
   "vocab_size": 44928
  }
maker.py ADDED
@@ -0,0 +1,127 @@
+ #! /usr/bin/python3
+ src="rinna/japanese-gpt-1b"
+ tgt="KoichiYasuoka/rinna-gpt2-1b-japanese-ud-causal"
+ url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
+
+ import os,json,unicodedata
+ from transformers import AutoTokenizer,PreTrainedTokenizerFast,AutoConfig,GPT2ForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
+ d=os.path.basename(url)
+ os.system("test -d "+d+" || git clone --depth=1 "+url)
+ os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
+ tkz=AutoTokenizer.from_pretrained(src,add_prefix_space=False,legacy=False,model_max_length=2048)
+ tkz.save_pretrained("tmpdir")
+ d=json.loads(tkz.backend_tokenizer.to_str())
+ tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/oldtokenizer.json")
+ form=set()
+ with open("train.conllu","r",encoding="utf-8") as r:
+   for s in r:
+     w=s.split("\t")
+     if len(w)==10 and w[0].isdecimal():
+       form.add(w[1])
+ m=[t for t in d["model"]["merges"] if len(t)<4 and t.replace(" ","") in form and not unicodedata.name(t[0],"").startswith("HIRAGANA")]
+ d["model"]["merges"]=m
+ tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/tokenizer.json")
+ ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")
+ otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")
+
+ class UDCausalDataset(object):
+   def __init__(self,conllu,tokenizer,oldtokenizer=None,embeddings=None):
+     self.conllu=open(conllu,"r",encoding="utf-8")
+     self.tokenizer=tokenizer
+     self.oldtokenizer=oldtokenizer if oldtokenizer else tokenizer
+     self.embeddings=embeddings
+     self.max_tokens=3
+     self.seeks=[(0,0)]
+     label=set(["SYM"])
+     dep=set()
+     s=self.conllu.readline()
+     while s!="":
+       if s=="\n":
+         self.seeks.append((self.conllu.tell(),0))
+       else:
+         w=s.split("\t")
+         if len(w)==10:
+           if w[0].isdecimal():
+             p=w[3] if w[5]=="_" else w[3]+"|"+w[5]
+             label.add(p)
+             dep.add(p+("|" if w[6]=="0" else "|l-" if int(w[0])<int(w[6]) else "|r-")+w[7])
+             self.seeks.append((self.seeks[-1][0],int(w[0])))
+             self.max_tokens=max(self.max_tokens,int(w[0])*2+1)
+       s=self.conllu.readline()
+     lid={}
+     for i,l in enumerate(sorted(label)):
+       lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2
+     for i,d in enumerate(sorted(dep),len(lid)):
+       lid[d]=i
+     self.label2id=lid
+   def __call__(*args):
+     lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
+     for t in args:
+       t.label2id=lid
+     return lid
+   def __del__(self):
+     self.conllu.close()
+   __len__=lambda self:len(self.seeks)-1
+   def __getitem__(self,i):
+     s,t=self.seeks[i]
+     self.conllu.seek(s)
+     form,upos,deps,w=[],[],[],[""]
+     while w[0]!="\n":
+       w=self.conllu.readline().split("\t")
+       if len(w)==10:
+         form.append(w[1])
+         if w[0].isdecimal():
+           upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
+           deps.append((int(w[6]),w[7]))
+     if t==0:
+       v=self.tokenizer(form,add_special_tokens=False)
+       i,u=[],[]
+       for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
+         if x!=[]:
+           i+=x
+           u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
+       emb=self.embeddings
+       pad=self.tokenizer.pad_token_id
+     else:
+       import torch
+       v=self.oldtokenizer(form,add_special_tokens=False)
+       m=[]
+       for x in v["input_ids"]:
+         if x==[]:
+           m.append(self.embeddings[self.tokenizer.unk_token_id,:])
+         else:
+           m.append(self.embeddings[x,:].sum(axis=0))
+       m.append(self.embeddings[self.tokenizer.sep_token_id,:])
+       m.append(self.embeddings[self.tokenizer.pad_token_id,:])
+       emb=torch.stack(m)
+       i,u=list(range(len(upos)+1)),upos+["SYM"]
+       i.append(t-1)
+       k,d=deps[t-1]
+       u.append(upos[t-1]+"|"+d if k==0 else upos[t-1])
+       for j in range(t,len(upos)):
+         i.append(j)
+         a,b=deps[j]
+         u.append(upos[j]+"|r-"+b if a==t else upos[t-1]+"|l-"+d if j+1==k else upos[j])
+       pad=-1
+     j=self.max_tokens-len(i)
+     if j>0:
+       ids=i+[pad]*j
+       upos=u+["SYM"]*j
+     else:
+       ids=i[0:self.max_tokens]
+       upos=u[0:self.max_tokens]
+     return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
+
+ trainDS=UDCausalDataset("train.conllu",ntk,otk)
+ devDS=UDCausalDataset("dev.conllu",ntk,otk)
+ testDS=UDCausalDataset("test.conllu",ntk,otk)
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
+ mdl=GPT2ForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
+ trainDS.embeddings=mdl.get_input_embeddings().weight
+ trainDS.max_tokens=min(trainDS.max_tokens,cfg.max_position_embeddings)
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
+ trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
+ trn.train()
+ trn.save_model(tgt)
+ ntk.save_pretrained(tgt)
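
Note (not part of the commit): maker.py clones UD_Japanese-GSDLUW, prunes the source tokenizer's merges down to the forms seen in training (keeping the unpruned tokenizer as oldtokenizer.json), and fine-tunes GPT2ForTokenClassification on a UD-causal label scheme. A minimal loading sketch follows, assuming only that the repository id is the tgt value above and the standard transformers Auto* API; the raw argmax labels printed here are for inspection only, since the actual dependency decoding is done by the pipeline in ud.py.

import torch
from transformers import AutoTokenizer,AutoModelForTokenClassification
tgt="KoichiYasuoka/rinna-gpt2-1b-japanese-ud-causal"  # tgt from maker.py above
tkz=AutoTokenizer.from_pretrained(tgt)
mdl=AutoModelForTokenClassification.from_pretrained(tgt)
v=tkz("国境の長いトンネルを抜けると雪国であった。",return_tensors="pt")  # sample sentence
with torch.no_grad():
  p=mdl(**v).logits[0].argmax(dim=-1)
for s,i in zip(tkz.convert_ids_to_tokens(v["input_ids"][0]),p):
  print(s,mdl.config.id2label[int(i)])  # subword and its predicted UPOS/dependency label
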
oldtokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:91039cb93e06719c95158ec2ba95280ae10d397f42bec3365252b0e277dea8d0
+ oid sha256:3ac2e53a9210148a94e53883c4618de3a22d0051ea5c71f16e53cfc22b90e02d
  size 4942377882
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6a0e0733c23ac2270146c508d8386554be0326096203db6c1f2b5d18d36aded3
+ oid sha256:8dc9f31720cbe935f3695ffa53174ee6d8be651e31390f9c306ebdc6480474d7
  size 269925550
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
ud.py CHANGED
@@ -1,5 +1,10 @@
  import numpy
- from transformers import TokenClassificationPipeline
+ from transformers import TokenClassificationPipeline,AutoTokenizer
+ try:
+   from transformers.utils import cached_file
+ except:
+   from transformers.file_utils import cached_path,hf_bucket_url
+   cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
  
  class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
    def __init__(self,**kwargs):
@@ -42,6 +47,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
    def __init__(self,**kwargs):
      kwargs["aggregation_strategy"]="simple"
      super().__init__(**kwargs)
+     self.oldtokenizer=AutoTokenizer.from_pretrained(self.tokenizer.name_or_path,tokenizer_file=cached_file(self.tokenizer.name_or_path,"oldtokenizer.json"))
      x=self.model.config.label2id
      self.root=numpy.full((len(x)),numpy.nan)
      self.left_arc=numpy.full((len(x)),numpy.nan)
@@ -87,7 +93,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
        if d[i].strip()=="":
          d.pop(i)
          w.pop(i)
-     v=self.tokenizer(d,add_special_tokens=False)
+     v=self.oldtokenizer(d,add_special_tokens=False)
      e=self.model.get_input_embeddings().weight
      m=[]
      for x in v["input_ids"]:
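
Note (not part of the commit): the new __init__ above fetches the unpruned tokenizer that this commit ships as oldtokenizer.json. A minimal standalone sketch of that lookup, assuming a transformers release that provides transformers.utils.cached_file and that the repository id is the tgt value from maker.py:

from transformers import AutoTokenizer
from transformers.utils import cached_file  # older releases fall back to file_utils, as in the diff above
repo="KoichiYasuoka/rinna-gpt2-1b-japanese-ud-causal"  # assumed repository id
oldtkz=AutoTokenizer.from_pretrained(repo,tokenizer_file=cached_file(repo,"oldtokenizer.json"))
print(oldtkz.convert_ids_to_tokens(oldtkz("雪国",add_special_tokens=False)["input_ids"]))
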