eubinecto committed on
Commit
59df933
1 Parent(s): 72190fb

[#9] main_deploy.py: removed authors & versions. config.yaml: increased epochs with smaller lr. fetchers.py: typo fixed

Browse files
config.yaml CHANGED
@@ -3,11 +3,11 @@ idiomifier:
3
  ver: m-1-3
4
  desc: Just overfitting on PIE dataset, but now with <idiom> & </idiom> special tokens.
5
  bart: facebook/bart-base
6
- lr: 0.0001
7
  literal2idiomatic_ver: d-1-3
8
  idioms_ver: d-1-3
9
  tokenizer_ver: t-1-1
10
- max_epochs: 3
11
  batch_size: 40
12
  shuffle: true
13
  seed: 104
 
3
  ver: m-1-3
4
  desc: Just overfitting on PIE dataset, but now with <idiom> & </idiom> special tokens.
5
  bart: facebook/bart-base
6
+ lr: 0.00005
7
  literal2idiomatic_ver: d-1-3
8
  idioms_ver: d-1-3
9
  tokenizer_ver: t-1-1
10
+ max_epochs: 8
11
  batch_size: 40
12
  shuffle: true
13
  seed: 104
idiomify/fetchers.py CHANGED
@@ -27,7 +27,7 @@ def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
27
  artifact = run.use_artifact(f"idioms:{ver}", type="dataset")
28
  else:
29
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
30
- artifact_dir = artifact.download(root=idioms_dir(ver))
31
  tsv_path = path.join(artifact_dir, "all.tsv")
32
  return pd.read_csv(tsv_path, sep="\t")
33
 
@@ -39,7 +39,7 @@ def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd
39
  artifact = run.use_artifact(f"literal2idiomatic:{ver}", type="dataset")
40
  else:
41
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiomatic:{ver}", type="dataset")
42
- artifact_dir = artifact.download(root=literal2idiomatic(ver))
43
  train_path = path.join(artifact_dir, "train.tsv")
44
  test_path = path.join(artifact_dir, "test.tsv")
45
  train_df = pd.read_csv(train_path, sep="\t")
@@ -57,10 +57,10 @@ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
57
  else:
58
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
59
  config = artifact.metadata
60
- artifact_dir = artifact.download(root=idiomifier_dir(ver))
61
  ckpt_path = path.join(artifact_dir, "model.ckpt")
62
  bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
63
- bart.resize_embeddings(config['vocab_size'])
64
  model = Idiomifier.load_from_checkpoint(ckpt_path, bart=bart)
65
  return model
66
 
@@ -70,7 +70,7 @@ def fetch_tokenizer(ver: str, run: Run = None) -> BartTokenizer:
70
  artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
71
  else:
72
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
73
- artifact_dir = artifact.download(root=tokenizer_dir(ver))
74
  tokenizer = BartTokenizer.from_pretrained(artifact_dir)
75
  return tokenizer
76
 
 
27
  artifact = run.use_artifact(f"idioms:{ver}", type="dataset")
28
  else:
29
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
30
+ artifact_dir = artifact.download(root=str(idioms_dir(ver)))
31
  tsv_path = path.join(artifact_dir, "all.tsv")
32
  return pd.read_csv(tsv_path, sep="\t")
33
 
 
39
  artifact = run.use_artifact(f"literal2idiomatic:{ver}", type="dataset")
40
  else:
41
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiomatic:{ver}", type="dataset")
42
+ artifact_dir = artifact.download(root=str(literal2idiomatic(ver)))
43
  train_path = path.join(artifact_dir, "train.tsv")
44
  test_path = path.join(artifact_dir, "test.tsv")
45
  train_df = pd.read_csv(train_path, sep="\t")
 
57
  else:
58
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
59
  config = artifact.metadata
60
+ artifact_dir = artifact.download(root=str(idiomifier_dir(ver)))
61
  ckpt_path = path.join(artifact_dir, "model.ckpt")
62
  bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
63
+ bart.resize_token_embeddings(config['vocab_size'])
64
  model = Idiomifier.load_from_checkpoint(ckpt_path, bart=bart)
65
  return model
66
 
 
70
  artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
71
  else:
72
  artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
73
+ artifact_dir = artifact.download(root=str(tokenizer_dir(ver)))
74
  tokenizer = BartTokenizer.from_pretrained(artifact_dir)
75
  return tokenizer
76
 
idiomify/models.py CHANGED
@@ -71,4 +71,3 @@ class Idiomifier(pl.LightningModule): # noqa
71
  """
72
  # The authors used Adam, so we might as well use it as well.
73
  return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
74
-
 
71
  """
72
  # The authors used Adam, so we might as well use it as well.
73
  return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
 
idiomify/paths.py CHANGED
@@ -19,4 +19,3 @@ def idiomifier_dir(ver: str) -> Path:
19
 
20
  def tokenizer_dir(ver: str) -> Path:
21
  return ARTIFACTS_DIR / f"tokenizer_{ver}"
22
-
 
19
 
20
  def tokenizer_dir(ver: str) -> Path:
21
  return ARTIFACTS_DIR / f"tokenizer_{ver}"
 
idiomify/preprocess.py CHANGED
@@ -59,4 +59,3 @@ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.Data
59
  test_size=other_size, random_state=seed,
60
  shuffle=True)
61
  return ratio_df, other_df
62
-
 
59
  test_size=other_size, random_state=seed,
60
  shuffle=True)
61
  return ratio_df, other_df
 
main_deploy.py CHANGED
@@ -1,20 +1,18 @@
1
  """
2
  we deploy the pipeline via streamlit.
3
  """
4
- from typing import Tuple, List
5
  import streamlit as st
6
  from transformers import BartTokenizer
7
  from idiomify.fetchers import fetch_config, fetch_idiomifier, fetch_idioms
8
  from idiomify.pipeline import Pipeline
9
- from idiomify.models import Idiomifier
10
 
11
 
12
  @st.cache(allow_output_mutation=True)
13
- def fetch_resources() -> Tuple[dict, Idiomifier, BartTokenizer, List[str]]:
14
  config = fetch_config()['idiomifier']
15
  model = fetch_idiomifier(config['ver'])
16
- idioms = fetch_idioms(config['idioms_ver'])
17
  tokenizer = BartTokenizer.from_pretrained(config['bart'])
 
18
  return config, model, tokenizer, idioms
19
 
20
 
@@ -24,8 +22,6 @@ def main():
24
  model.eval()
25
  pipeline = Pipeline(model, tokenizer)
26
  st.title("Idiomify Demo")
27
- st.markdown(f"Author: `Eu-Bin KIM`")
28
- st.markdown(f"Version: `{config['ver']}`")
29
  text = st.text_area("Type sentences here",
30
  value="Just remember there will always be a hope even when things look black")
31
  with st.sidebar:
 
1
  """
2
  we deploy the pipeline via streamlit.
3
  """
 
4
  import streamlit as st
5
  from transformers import BartTokenizer
6
  from idiomify.fetchers import fetch_config, fetch_idiomifier, fetch_idioms
7
  from idiomify.pipeline import Pipeline
 
8
 
9
 
10
  @st.cache(allow_output_mutation=True)
11
+ def fetch_resources() -> tuple:
12
  config = fetch_config()['idiomifier']
13
  model = fetch_idiomifier(config['ver'])
 
14
  tokenizer = BartTokenizer.from_pretrained(config['bart'])
15
+ idioms = fetch_idioms(config['idioms_ver'])
16
  return config, model, tokenizer, idioms
17
 
18
 
 
22
  model.eval()
23
  pipeline = Pipeline(model, tokenizer)
24
  st.title("Idiomify Demo")
 
 
25
  text = st.text_area("Type sentences here",
26
  value="Just remember there will always be a hope even when things look black")
27
  with st.sidebar: