Petr Tsvetkov
committed on
Commit
•
5f3a4af
1
Parent(s):
0c136d8
Synthetic dataset visualization
Browse files
- change_visualizer.py +32 -0
- config.py +8 -0
- generate_annotated_diffs.py +7 -0
- generate_synthetic_dataset.py +2 -1
- hf_data_loader.py +10 -3
change_visualizer.py
CHANGED
@@ -5,6 +5,9 @@ import generate_annotated_diffs
|
|
5 |
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
|
6 |
n_diffs_manual = len(df_manual)
|
7 |
|
|
|
|
|
|
|
8 |
|
9 |
def update_manual_view(diff_idx):
|
10 |
diff_idx -= 1
|
@@ -14,6 +17,14 @@ def update_manual_view(diff_idx):
|
|
14 |
'session'], f"https://github.com/{df_manual.iloc[diff_idx]['repo']}/commit/{df_manual.iloc[diff_idx]['hash']}"
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
if __name__ == '__main__':
|
18 |
with gr.Blocks(theme=gr.themes.Soft()) as application:
|
19 |
with gr.Tab("Manual"):
|
@@ -36,7 +47,28 @@ if __name__ == '__main__':
|
|
36 |
slider_manual.change(update_manual_view, inputs=slider_manual,
|
37 |
outputs=view_manual)
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
application.load(update_manual_view, inputs=slider_manual,
|
40 |
outputs=view_manual)
|
41 |
|
|
|
|
|
|
|
42 |
application.launch()
|
|
|
5 |
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
|
6 |
n_diffs_manual = len(df_manual)
|
7 |
|
8 |
+
df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
|
9 |
+
n_diffs_synthetic = len(df_synthetic)
|
10 |
+
|
11 |
|
12 |
def update_manual_view(diff_idx):
|
13 |
diff_idx -= 1
|
|
|
17 |
'session'], f"https://github.com/{df_manual.iloc[diff_idx]['repo']}/commit/{df_manual.iloc[diff_idx]['hash']}"
|
18 |
|
19 |
|
20 |
+
def update_synthetic_view(diff_idx):
    """Collect the values shown for one sample of the synthetic dataset.

    ``diff_idx`` is decremented by one before being used as a positional
    index into the module-level ``df_synthetic`` frame.

    Returns a 4-tuple: the row's ``annotated_diff`` value, its
    ``initial_msg_pred`` value, its ``get_annotated_diff`` value, and a
    GitHub commit URL built from the row's ``repo`` and ``hash`` values.
    """
    row = df_synthetic.iloc[diff_idx - 1]
    commit_url = f"https://github.com/{row['repo']}/commit/{row['hash']}"
    # NOTE(review): 'get_annotated_diff' reads like a function name rather
    # than a dataset column -- confirm it against the synthetic dataset schema.
    return row['annotated_diff'], row['initial_msg_pred'], row['get_annotated_diff'], commit_url
|
26 |
+
|
27 |
+
|
28 |
if __name__ == '__main__':
|
29 |
with gr.Blocks(theme=gr.themes.Soft()) as application:
|
30 |
with gr.Tab("Manual"):
|
|
|
47 |
slider_manual.change(update_manual_view, inputs=slider_manual,
|
48 |
outputs=view_manual)
|
49 |
|
50 |
+
with gr.Tab("Synthetic"):
|
51 |
+
slider_synthetic = gr.Slider(minimum=1, maximum=n_diffs_synthetic, step=1, value=1,
|
52 |
+
label=f"Sample number (total: {n_diffs_synthetic})")
|
53 |
+
|
54 |
+
diff_view_synthetic = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
|
55 |
+
start_view_synthetic = gr.Textbox(interactive=False, label="Start message", container=True)
|
56 |
+
end_view_synthetic = gr.Textbox(interactive=False, label="End message", container=True)
|
57 |
+
link_view_synthetic = gr.Markdown()
|
58 |
+
view_synthetic = [
|
59 |
+
diff_view_synthetic,
|
60 |
+
start_view_synthetic,
|
61 |
+
end_view_synthetic,
|
62 |
+
link_view_synthetic
|
63 |
+
]
|
64 |
+
|
65 |
+
slider_synthetic.change(update_synthetic_view, inputs=slider_synthetic,
|
66 |
+
outputs=view_synthetic)
|
67 |
+
|
68 |
application.load(update_manual_view, inputs=slider_manual,
|
69 |
outputs=view_manual)
|
70 |
|
71 |
+
application.load(update_synthetic_view, inputs=slider_synthetic,
|
72 |
+
outputs=view_synthetic)
|
73 |
+
|
74 |
application.launch()
|
config.py
CHANGED
@@ -4,9 +4,17 @@ from pathlib import Path
|
|
4 |
GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
|
5 |
|
6 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
|
|
7 |
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
|
8 |
HF_RAW_DATASET_SPLIT = 'train'
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
CACHE_DIR = Path("cache")
|
11 |
CACHE_DIR.mkdir(exist_ok=True)
|
12 |
|
|
|
4 |
GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
|
5 |
|
6 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
7 |
+
|
8 |
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
|
9 |
HF_RAW_DATASET_SPLIT = 'train'
|
10 |
|
11 |
+
HF_FULL_COMMITS_DATASET_NAME = "JetBrains-Research/lca-commit-message-generation"
|
12 |
+
HF_FULL_COMMITS_DATASET_SUBNAME = "commitchronicle-py-long"
|
13 |
+
HF_FULL_COMMITS_DATASET_SPLIT = "test"
|
14 |
+
|
15 |
+
HF_SYNTHETIC_DATASET_NAME = "petrtsv-jb/synthetic-commit-msg-rewriting"
|
16 |
+
HF_SYNTHETIC_DATASET_SPLIT = 'train'
|
17 |
+
|
18 |
CACHE_DIR = Path("cache")
|
19 |
CACHE_DIR.mkdir(exist_ok=True)
|
20 |
|
generate_annotated_diffs.py
CHANGED
@@ -36,3 +36,10 @@ def manual_data_with_annotated_diffs():
|
|
36 |
annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
|
37 |
df['annotated_diff'] = annotated
|
38 |
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
|
37 |
df['annotated_diff'] = annotated
|
38 |
return df
|
39 |
+
|
40 |
+
|
41 |
+
def synthetic_data_with_annotated_diffs():
    """Load the synthetic dataset and attach an ``annotated_diff`` column.

    The frame comes from ``hf_data_loader.load_synthetic_dataset_as_pandas()``;
    the new column holds the result of applying
    ``annotated_diff_for_row_synthetic_df`` to every row (``axis=1``).

    Returns the same DataFrame object with the extra column added.
    """
    synthetic_df = hf_data_loader.load_synthetic_dataset_as_pandas()
    synthetic_df['annotated_diff'] = synthetic_df.apply(annotated_diff_for_row_synthetic_df, axis=1)
    return synthetic_df
|
generate_synthetic_dataset.py
CHANGED
@@ -58,7 +58,8 @@ def generate_synthetic_dataset():
|
|
58 |
initial_messages_pred = []
|
59 |
|
60 |
for prompt in tqdm(df['initial_msg_prompt']):
|
61 |
-
|
|
|
62 |
|
63 |
df['initial_msg_pred'] = initial_messages_pred
|
64 |
|
|
|
58 |
initial_messages_pred = []
|
59 |
|
60 |
for prompt in tqdm(df['initial_msg_prompt']):
|
61 |
+
output = generate_initial_msg(prompt)
|
62 |
+
initial_messages_pred.append(output)
|
63 |
|
64 |
df['initial_msg_pred'] = initial_messages_pred
|
65 |
|
hf_data_loader.py
CHANGED
@@ -11,8 +11,15 @@ def load_raw_rewriting_dataset_as_pandas():
|
|
11 |
|
12 |
|
13 |
def load_full_commit_dataset_as_pandas():
|
14 |
-
return load_dataset(
|
15 |
-
|
16 |
-
split=
|
17 |
cache_dir=config.CACHE_DIR).to_pandas().rename(
|
18 |
columns={'message': 'reference'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
|
13 |
def load_full_commit_dataset_as_pandas():
    """Load the full-commits dataset as a pandas DataFrame.

    The dataset path, configuration name, split and cache directory are all
    taken from ``config``. The ``message`` column is renamed to ``reference``
    in the returned frame.
    """
    dataset = load_dataset(
        path=config.HF_FULL_COMMITS_DATASET_NAME,
        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
        cache_dir=config.CACHE_DIR,
    )
    frame = dataset.to_pandas()
    return frame.rename(columns={'message': 'reference'})
|
19 |
+
|
20 |
+
|
21 |
+
def load_synthetic_dataset_as_pandas():
    """Load the synthetic commit-message dataset as a pandas DataFrame.

    The dataset name, split, access token and cache directory are all taken
    from ``config``.

    Returns the DataFrame produced by ``to_pandas()``.
    """
    # The result must be returned: callers use it as a DataFrame right away
    # (e.g. calling ``.apply`` on it), which fails if this yields None.
    return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                        split=config.HF_SYNTHETIC_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()
|