[#9] main_deploy.py: removed authors & versions. config.yaml: increased epochs with smaller lr. fetchers.py: typo fixed
- config.yaml +2 -2
- idiomify/fetchers.py +5 -5
- idiomify/models.py +0 -1
- idiomify/paths.py +0 -1
- idiomify/preprocess.py +0 -1
- main_deploy.py +2 -6
config.yaml
CHANGED
@@ -3,11 +3,11 @@ idiomifier:
   ver: m-1-3
   desc: Just overfitting on PIE dataset, but now with <idiom> & </idiom> special tokens.
   bart: facebook/bart-base
-  lr: 0.
+  lr: 0.00005
   literal2idiomatic_ver: d-1-3
   idioms_ver: d-1-3
   tokenizer_ver: t-1-1
-  max_epochs:
+  max_epochs: 8
   batch_size: 40
   shuffle: true
   seed: 104
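The net effect of the config change is a gentler, longer fit: the learning rate drops to 5e-5 while max_epochs rises to 8. For reference, these keys are read back at runtime through fetch_config(); a minimal sketch of that pattern, assuming config.yaml sits at the project root and is parsed with PyYAML (neither detail is shown in this diff):

import yaml

def fetch_config() -> dict:
    # assumed shape: parse the repo-level config.yaml into a plain dict
    with open("config.yaml") as fh:
        return yaml.safe_load(fh)

config = fetch_config()["idiomifier"]
print(config["lr"], config["max_epochs"])  # 5e-05 8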
idiomify/fetchers.py
CHANGED
@@ -27,7 +27,7 @@ def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
         artifact = run.use_artifact(f"idioms:{ver}", type="dataset")
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
-    artifact_dir = artifact.download(root=idioms_dir(ver))
+    artifact_dir = artifact.download(root=str(idioms_dir(ver)))
     tsv_path = path.join(artifact_dir, "all.tsv")
     return pd.read_csv(tsv_path, sep="\t")
 
@@ -39,7 +39,7 @@ def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd
         artifact = run.use_artifact(f"literal2idiomatic:{ver}", type="dataset")
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiomatic:{ver}", type="dataset")
-    artifact_dir = artifact.download(root=literal2idiomatic(ver))
+    artifact_dir = artifact.download(root=str(literal2idiomatic(ver)))
     train_path = path.join(artifact_dir, "train.tsv")
     test_path = path.join(artifact_dir, "test.tsv")
     train_df = pd.read_csv(train_path, sep="\t")
@@ -57,10 +57,10 @@ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
     config = artifact.metadata
-    artifact_dir = artifact.download(root=idiomifier_dir(ver))
+    artifact_dir = artifact.download(root=str(idiomifier_dir(ver)))
     ckpt_path = path.join(artifact_dir, "model.ckpt")
     bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
-    bart.
+    bart.resize_token_embeddings(config['vocab_size'])
     model = Idiomifier.load_from_checkpoint(ckpt_path, bart=bart)
     return model
 
@@ -70,7 +70,7 @@ def fetch_tokenizer(ver: str, run: Run = None) -> BartTokenizer:
         artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
-    artifact_dir = artifact.download(root=tokenizer_dir(ver))
+    artifact_dir = artifact.download(root=str(tokenizer_dir(ver)))
     tokenizer = BartTokenizer.from_pretrained(artifact_dir)
     return tokenizer
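Two distinct fixes here. First, the *_dir(ver) helpers return pathlib.Path objects, and Artifact.download(root=...) in the wandb client of this era expected a plain string, hence the str(...) coercions. Second, the checkpoint was trained after <idiom> and </idiom> were added to the tokenizer, so its embedding matrix no longer matches stock facebook/bart-base; the fresh BART skeleton must be resized to the saved vocab_size before load_from_checkpoint can restore the weights. A minimal standalone sketch of that resize-before-load order (outside this repo's helpers):

from transformers import AutoConfig, AutoModelForSeq2SeqLM, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# the two special tokens this project trains with
tokenizer.add_special_tokens({"additional_special_tokens": ["<idiom>", "</idiom>"]})

bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained("facebook/bart-base"))
bart.resize_token_embeddings(len(tokenizer))  # must happen before loading a checkpoint trained with the extra tokens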
idiomify/models.py
CHANGED
@@ -71,4 +71,3 @@ class Idiomifier(pl.LightningModule):  # noqa
         """
         # The authors used Adam, so we might as well use it as well.
         return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
-
idiomify/paths.py
CHANGED
@@ -19,4 +19,3 @@ def idiomifier_dir(ver: str) -> Path:
 
 def tokenizer_dir(ver: str) -> Path:
     return ARTIFACTS_DIR / f"tokenizer_{ver}"
-
idiomify/preprocess.py
CHANGED
@@ -59,4 +59,3 @@ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.Data
                                          test_size=other_size, random_state=seed,
                                          shuffle=True)
     return ratio_df, other_df
-
main_deploy.py
CHANGED
@@ -1,20 +1,18 @@
 """
 we deploy the pipeline via streamlit.
 """
-from typing import Tuple, List
 import streamlit as st
 from transformers import BartTokenizer
 from idiomify.fetchers import fetch_config, fetch_idiomifier, fetch_idioms
 from idiomify.pipeline import Pipeline
-from idiomify.models import Idiomifier
 
 
 @st.cache(allow_output_mutation=True)
-def fetch_resources() ->
+def fetch_resources() -> tuple:
     config = fetch_config()['idiomifier']
     model = fetch_idiomifier(config['ver'])
-    idioms = fetch_idioms(config['idioms_ver'])
     tokenizer = BartTokenizer.from_pretrained(config['bart'])
+    idioms = fetch_idioms(config['idioms_ver'])
     return config, model, tokenizer, idioms
 
 
@@ -24,8 +22,6 @@ def main():
     model.eval()
     pipeline = Pipeline(model, tokenizer)
     st.title("Idiomify Demo")
-    st.markdown(f"Author: `Eu-Bin KIM`")
-    st.markdown(f"Version: `{config['ver']}`")
     text = st.text_area("Type sentences here",
                         value="Just remember there will always be a hope even when things look black")
     with st.sidebar:
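One detail worth keeping in mind with the reshuffled fetch_resources: it sits behind @st.cache(allow_output_mutation=True), so the model, tokenizer, and idioms are downloaded once and reused across Streamlit reruns; allow_output_mutation=True skips hashing the returned objects, which matters for a torch model Streamlit cannot cheaply hash. A minimal sketch of the same caching pattern in isolation (st.cache is the API of Streamlit releases from this period; newer versions replace it with st.cache_resource):

import streamlit as st
from transformers import BartTokenizer

@st.cache(allow_output_mutation=True)
def load_tokenizer() -> BartTokenizer:
    # runs once per session; subsequent reruns reuse the cached object
    return BartTokenizer.from_pretrained("facebook/bart-base")

tokenizer = load_tokenizer()
st.write(tokenizer.tokenize("call it a day"))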