KoichiYasuoka committed
Commit • e2024dd
1 Parent(s): e62a0a0
model improved

Browse files:
- config.json +1 -1
- maker.py +127 -0
- oldtokenizer.json +0 -0
- pytorch_model-00001-of-00002.bin +1 -1
- pytorch_model-00002-of-00002.bin +1 -1
- tokenizer.json +0 -0
- ud.py +8 -2
config.json
CHANGED
@@ -372,7 +372,7 @@
   "summary_use_proj": true,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.44.2",
   "use_cache": true,
   "vocab_size": 44928
 }
maker.py
ADDED
@@ -0,0 +1,127 @@
#! /usr/bin/python3
src="rinna/japanese-gpt-1b"
tgt="KoichiYasuoka/rinna-gpt2-1b-japanese-ud-causal"
url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"

import os,json,unicodedata
from transformers import AutoTokenizer,PreTrainedTokenizerFast,AutoConfig,GPT2ForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
d=os.path.basename(url)
os.system("test -d "+d+" || git clone --depth=1 "+url)
os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
tkz=AutoTokenizer.from_pretrained(src,add_prefix_space=False,legacy=False,model_max_length=2048)
tkz.save_pretrained("tmpdir")
d=json.loads(tkz.backend_tokenizer.to_str())
tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/oldtokenizer.json")
form=set()
with open("train.conllu","r",encoding="utf-8") as r:
  for s in r:
    w=s.split("\t")
    if len(w)==10 and w[0].isdecimal():
      form.add(w[1])
m=[t for t in d["model"]["merges"] if len(t)<4 and t.replace(" ","") in form and not unicodedata.name(t[0],"").startswith("HIRAGANA")]
d["model"]["merges"]=m
tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/tokenizer.json")
ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")
otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")

class UDCausalDataset(object):
  def __init__(self,conllu,tokenizer,oldtokenizer=None,embeddings=None):
    self.conllu=open(conllu,"r",encoding="utf-8")
    self.tokenizer=tokenizer
    self.oldtokenizer=oldtokenizer if oldtokenizer else tokenizer
    self.embeddings=embeddings
    self.max_tokens=3
    self.seeks=[(0,0)]
    label=set(["SYM"])
    dep=set()
    s=self.conllu.readline()
    while s!="":
      if s=="\n":
        self.seeks.append((self.conllu.tell(),0))
      else:
        w=s.split("\t")
        if len(w)==10:
          if w[0].isdecimal():
            p=w[3] if w[5]=="_" else w[3]+"|"+w[5]
            label.add(p)
            dep.add(p+("|" if w[6]=="0" else "|l-" if int(w[0])<int(w[6]) else "|r-")+w[7])
            self.seeks.append((self.seeks[-1][0],int(w[0])))
            self.max_tokens=max(self.max_tokens,int(w[0])*2+1)
      s=self.conllu.readline()
    lid={}
    for i,l in enumerate(sorted(label)):
      lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2
    for i,d in enumerate(sorted(dep),len(lid)):
      lid[d]=i
    self.label2id=lid
  def __call__(*args):
    lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
    for t in args:
      t.label2id=lid
    return lid
  def __del__(self):
    self.conllu.close()
  __len__=lambda self:len(self.seeks)-1
  def __getitem__(self,i):
    s,t=self.seeks[i]
    self.conllu.seek(s)
    form,upos,deps,w=[],[],[],[""]
    while w[0]!="\n":
      w=self.conllu.readline().split("\t")
      if len(w)==10:
        form.append(w[1])
        if w[0].isdecimal():
          upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
          deps.append((int(w[6]),w[7]))
    if t==0:
      v=self.tokenizer(form,add_special_tokens=False)
      i,u=[],[]
      for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
        if x!=[]:
          i+=x
          u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
      emb=self.embeddings
      pad=self.tokenizer.pad_token_id
    else:
      import torch
      v=self.oldtokenizer(form,add_special_tokens=False)
      m=[]
      for x in v["input_ids"]:
        if x==[]:
          m.append(self.embeddings[self.tokenizer.unk_token_id,:])
        else:
          m.append(self.embeddings[x,:].sum(axis=0))
      m.append(self.embeddings[self.tokenizer.sep_token_id,:])
      m.append(self.embeddings[self.tokenizer.pad_token_id,:])
      emb=torch.stack(m)
      i,u=list(range(len(upos)+1)),upos+["SYM"]
      i.append(t-1)
      k,d=deps[t-1]
      u.append(upos[t-1]+"|"+d if k==0 else upos[t-1])
      for j in range(t,len(upos)):
        i.append(j)
        a,b=deps[j]
        u.append(upos[j]+"|r-"+b if a==t else upos[t-1]+"|l-"+d if j+1==k else upos[j])
      pad=-1
    j=self.max_tokens-len(i)
    if j>0:
      ids=i+[pad]*j
      upos=u+["SYM"]*j
    else:
      ids=i[0:self.max_tokens]
      upos=u[0:self.max_tokens]
    return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}

trainDS=UDCausalDataset("train.conllu",ntk,otk)
devDS=UDCausalDataset("dev.conllu",ntk,otk)
testDS=UDCausalDataset("test.conllu",ntk,otk)
lid=trainDS(devDS,testDS)
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
mdl=GPT2ForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
trainDS.embeddings=mdl.get_input_embeddings().weight
trainDS.max_tokens=min(trainDS.max_tokens,cfg.max_position_embeddings)
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
trn.train()
trn.save_model(tgt)
ntk.save_pretrained(tgt)
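The maker.py script above prunes the BPE merges of the rinna/japanese-gpt-1b tokenizer down to short merges whose result is a word form attested in UD_Japanese-GSDLUW and does not begin with hiragana, keeps the untouched tokenizer as oldtokenizer.json, and then fine-tunes GPT2ForTokenClassification on a combined UPOS-tagging and dependency-labeling label set. As a minimal sketch outside the commit, the effect of the merge pruning can be inspected by loading both tokenizers from the tmpdir that maker.py creates; the example phrase is arbitrary, and the two outputs may or may not differ for any given form:

#! /usr/bin/python3
# hedged sketch: compare the pruned tokenizer with the preserved original one
from transformers import PreTrainedTokenizerFast
ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")  # pruned merges (tokenizer.json)
otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")  # original merges
print(ntk.tokenize("国境の長いトンネル"))
print(otk.tokenize("国境の長いトンネル"))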
oldtokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
pytorch_model-00001-of-00002.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3ac2e53a9210148a94e53883c4618de3a22d0051ea5c71f16e53cfc22b90e02d
 size 4942377882
pytorch_model-00002-of-00002.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8dc9f31720cbe935f3695ffa53174ee6d8be651e31390f9c306ebdc6480474d7
 size 269925550
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
ud.py
CHANGED
@@ -1,5 +1,10 @@
 import numpy
-from transformers import TokenClassificationPipeline
+from transformers import TokenClassificationPipeline,AutoTokenizer
+try:
+  from transformers.utils import cached_file
+except:
+  from transformers.file_utils import cached_path,hf_bucket_url
+  cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
 
 class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
   def __init__(self,**kwargs):
@@ -42,6 +47,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
   def __init__(self,**kwargs):
     kwargs["aggregation_strategy"]="simple"
     super().__init__(**kwargs)
+    self.oldtokenizer=AutoTokenizer.from_pretrained(self.tokenizer.name_or_path,tokenizer_file=cached_file(self.tokenizer.name_or_path,"oldtokenizer.json"))
     x=self.model.config.label2id
     self.root=numpy.full((len(x)),numpy.nan)
     self.left_arc=numpy.full((len(x)),numpy.nan)
@@ -87,7 +93,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
       if d[i].strip()=="":
         d.pop(i)
         w.pop(i)
-    v=self.tokenizer(d,add_special_tokens=False)
+    v=self.oldtokenizer(d,add_special_tokens=False)
     e=self.model.get_input_embeddings().weight
     m=[]
     for x in v["input_ids"]:
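The ud.py change loads the preserved oldtokenizer.json alongside the pruned tokenizer and uses it when building word-level input embeddings, falling back to cached_path/hf_bucket_url on transformers versions that predate cached_file. A minimal sketch outside the commit of how that resolution behaves on a recent transformers release, assuming the repository name used in maker.py and network access to the Hub:

#! /usr/bin/python3
# hedged sketch: fetch oldtokenizer.json the same way the updated pipeline does
from transformers import AutoTokenizer
from transformers.utils import cached_file
repo="KoichiYasuoka/rinna-gpt2-1b-japanese-ud-causal"
path=cached_file(repo,"oldtokenizer.json")  # local cache path of the downloaded file
oldtkz=AutoTokenizer.from_pretrained(repo,tokenizer_file=path)
print(path)
print(type(oldtkz).__name__)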