Petr Tsvetkov committed on
Commit
5f3a4af
•
1 Parent(s): 0c136d8

Synthetic dataset visualization

Browse files
change_visualizer.py CHANGED
@@ -5,6 +5,9 @@ import generate_annotated_diffs
5
  df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
6
  n_diffs_manual = len(df_manual)
7
 
 
 
 
8
 
9
  def update_manual_view(diff_idx):
10
  diff_idx -= 1
@@ -14,6 +17,14 @@ def update_manual_view(diff_idx):
14
  'session'], f"https://github.com/{df_manual.iloc[diff_idx]['repo']}/commit/{df_manual.iloc[diff_idx]['hash']}"
15
 
16
 
 
 
 
 
 
 
 
 
17
  if __name__ == '__main__':
18
  with gr.Blocks(theme=gr.themes.Soft()) as application:
19
  with gr.Tab("Manual"):
@@ -36,7 +47,28 @@ if __name__ == '__main__':
36
  slider_manual.change(update_manual_view, inputs=slider_manual,
37
  outputs=view_manual)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  application.load(update_manual_view, inputs=slider_manual,
40
  outputs=view_manual)
41
 
 
 
 
42
  application.launch()
 
5
  df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
6
  n_diffs_manual = len(df_manual)
7
 
8
+ df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
9
+ n_diffs_synthetic = len(df_synthetic)
10
+
11
 
12
  def update_manual_view(diff_idx):
13
  diff_idx -= 1
 
17
  'session'], f"https://github.com/{df_manual.iloc[diff_idx]['repo']}/commit/{df_manual.iloc[diff_idx]['hash']}"
18
 
19
 
20
+ def update_synthetic_view(diff_idx):
21
+ diff_idx -= 1
22
+ return (df_synthetic.iloc[diff_idx]['annotated_diff'], df_synthetic.iloc[diff_idx]['initial_msg_pred'],
23
+ df_synthetic.iloc[diff_idx][
24
+ 'get_annotated_diff'],
25
+ f"https://github.com/{df_synthetic.iloc[diff_idx]['repo']}/commit/{df_synthetic.iloc[diff_idx]['hash']}")
26
+
27
+
28
  if __name__ == '__main__':
29
  with gr.Blocks(theme=gr.themes.Soft()) as application:
30
  with gr.Tab("Manual"):
 
47
  slider_manual.change(update_manual_view, inputs=slider_manual,
48
  outputs=view_manual)
49
 
50
+ with gr.Tab("Synthetic"):
51
+ slider_synthetic = gr.Slider(minimum=1, maximum=n_diffs_synthetic, step=1, value=1,
52
+ label=f"Sample number (total: {n_diffs_synthetic})")
53
+
54
+ diff_view_synthetic = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
55
+ start_view_synthetic = gr.Textbox(interactive=False, label="Start message", container=True)
56
+ end_view_synthetic = gr.Textbox(interactive=False, label="End message", container=True)
57
+ link_view_synthetic = gr.Markdown()
58
+ view_synthetic = [
59
+ diff_view_synthetic,
60
+ start_view_synthetic,
61
+ end_view_synthetic,
62
+ link_view_synthetic
63
+ ]
64
+
65
+ slider_synthetic.change(update_synthetic_view, inputs=slider_synthetic,
66
+ outputs=view_synthetic)
67
+
68
  application.load(update_manual_view, inputs=slider_manual,
69
  outputs=view_manual)
70
 
71
+ application.load(update_synthetic_view, inputs=slider_synthetic,
72
+ outputs=view_synthetic)
73
+
74
  application.launch()
config.py CHANGED
@@ -4,9 +4,17 @@ from pathlib import Path
4
  GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
5
 
6
  HF_TOKEN = os.environ.get('HF_TOKEN')
 
7
  HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
8
  HF_RAW_DATASET_SPLIT = 'train'
9
 
 
 
 
 
 
 
 
10
  CACHE_DIR = Path("cache")
11
  CACHE_DIR.mkdir(exist_ok=True)
12
 
 
4
  GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
5
 
6
  HF_TOKEN = os.environ.get('HF_TOKEN')
7
+
8
  HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
9
  HF_RAW_DATASET_SPLIT = 'train'
10
 
11
+ HF_FULL_COMMITS_DATASET_NAME = "JetBrains-Research/lca-commit-message-generation"
12
+ HF_FULL_COMMITS_DATASET_SUBNAME = "commitchronicle-py-long"
13
+ HF_FULL_COMMITS_DATASET_SPLIT = "test"
14
+
15
+ HF_SYNTHETIC_DATASET_NAME = "petrtsv-jb/synthetic-commit-msg-rewriting"
16
+ HF_SYNTHETIC_DATASET_SPLIT = 'train'
17
+
18
  CACHE_DIR = Path("cache")
19
  CACHE_DIR.mkdir(exist_ok=True)
20
 
generate_annotated_diffs.py CHANGED
@@ -36,3 +36,10 @@ def manual_data_with_annotated_diffs():
36
  annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
37
  df['annotated_diff'] = annotated
38
  return df
 
 
 
 
 
 
 
 
36
  annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
37
  df['annotated_diff'] = annotated
38
  return df
39
+
40
+
41
+ def synthetic_data_with_annotated_diffs():
42
+ df = hf_data_loader.load_synthetic_dataset_as_pandas()
43
+ annotated = df.apply(annotated_diff_for_row_synthetic_df, axis=1)
44
+ df['annotated_diff'] = annotated
45
+ return df
generate_synthetic_dataset.py CHANGED
@@ -58,7 +58,8 @@ def generate_synthetic_dataset():
58
  initial_messages_pred = []
59
 
60
  for prompt in tqdm(df['initial_msg_prompt']):
61
- initial_messages_pred.append(generate_initial_msg(prompt))
 
62
 
63
  df['initial_msg_pred'] = initial_messages_pred
64
 
 
58
  initial_messages_pred = []
59
 
60
  for prompt in tqdm(df['initial_msg_prompt']):
61
+ output = generate_initial_msg(prompt)
62
+ initial_messages_pred.append(output)
63
 
64
  df['initial_msg_pred'] = initial_messages_pred
65
 
hf_data_loader.py CHANGED
@@ -11,8 +11,15 @@ def load_raw_rewriting_dataset_as_pandas():
11
 
12
 
13
  def load_full_commit_dataset_as_pandas():
14
- return load_dataset("JetBrains-Research/lca-commit-message-generation",
15
- "commitchronicle-py-long",
16
- split="test",
17
  cache_dir=config.CACHE_DIR).to_pandas().rename(
18
  columns={'message': 'reference'})
 
 
 
 
 
 
 
 
11
 
12
 
13
  def load_full_commit_dataset_as_pandas():
14
+ return load_dataset(path=config.HF_FULL_COMMITS_DATASET_NAME,
15
+ name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
16
+ split=config.HF_FULL_COMMITS_DATASET_SPLIT,
17
  cache_dir=config.CACHE_DIR).to_pandas().rename(
18
  columns={'message': 'reference'})
19
+
20
+
21
+ def load_synthetic_dataset_as_pandas():
22
+ load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
23
+ split=config.HF_SYNTHETIC_DATASET_SPLIT,
24
+ token=config.HF_TOKEN,
25
+ cache_dir=config.CACHE_DIR).to_pandas()