File size: 1,278 Bytes
f26a894
 
 
 
a8a595d
 
 
 
 
 
 
 
 
 
 
 
f26a894
 
a8a595d
 
 
f26a894
 
 
a8a595d
 
 
 
 
 
 
 
f26a894
4017643
 
f26a894
 
 
e2a35c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import numpy as np
import pandas as pd


def _safe_ratio(count, reference_length):
    """Return ``count / reference_length`` without failing on an empty reference."""
    if reference_length:
        return count / reference_length
    # Nothing changed relative to an empty message -> 0.0; a positive amount of
    # change relative to an empty message is unbounded -> inf.
    return 0.0 if count == 0 else float("inf")


def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute relative edit statistics between two messages.

    Args:
        start_msg: The message before editing; only its length is used.
        end_msg: The message after editing; only its length is used.
        annotated_msg: Iterable of ``(text, change_type)`` pairs. Pairs whose
            ``change_type`` is ``'-'`` count as deletions, ``'+'`` as
            insertions; any other marker is ignored.

    Returns:
        A dict with three float entries:
        ``"deletions"`` - deleted characters divided by ``len(start_msg)``;
        ``"insertions"`` - inserted characters divided by ``len(end_msg)``;
        ``"changes"`` - deleted plus inserted characters divided by
        ``len(end_msg)``.
        When the reference message is empty, the ratio is ``0.0`` if the
        corresponding character count is zero and ``inf`` otherwise, instead
        of raising ``ZeroDivisionError``.
    """
    sum_deletions = 0
    sum_insertions = 0
    # Single pass so that one-shot iterables (e.g. generators) are supported.
    for text, change_type in annotated_msg:
        if change_type == '-':
            sum_deletions += len(text)
        elif change_type == '+':
            sum_insertions += len(text)

    start_length = len(start_msg)
    end_length = len(end_msg)

    return {
        "deletions": _safe_ratio(sum_deletions, start_length),
        "insertions": _safe_ratio(sum_insertions, end_length),
        "changes": _safe_ratio(sum_deletions + sum_insertions, end_length),
    }


def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
    """Run ``get_statistics`` on every row of ``df`` and gather results per statistic.

    Args:
        df: Frame whose rows are passed one by one to ``get_statistics``.
        start_col: Column supplying the ``start_msg`` argument.
        end_col: Column supplying the ``end_msg`` argument.
        annotated_col: Column supplying the ``annotated_msg`` argument.

    Returns:
        A dict keyed by the statistic names found in the first row's result;
        each value is a NumPy array holding that statistic for every row, in
        row order.

    Raises:
        AssertionError: If ``df`` has no rows.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(
            get_statistics(record[start_col], record[end_col], record[annotated_col])
        )

    assert len(per_row) > 0

    # The first row's result defines which statistics are collected.
    collected = {}
    for stat_name in per_row[0]:
        collected[stat_name] = np.asarray([entry[stat_name] for entry in per_row])
    return collected


def get_statistics_for_manual_df(df):
    """Collect edit statistics from ``df`` using its ``commit_msg_start``,
    ``commit_msg_end`` and ``annotated_diff`` columns.

    Delegates to ``get_statistics_for_df`` and returns its result unchanged.
    """
    return get_statistics_for_df(
        df,
        "commit_msg_start",
        "commit_msg_end",
        "annotated_diff",
    )


def get_statistics_for_synthetic_df(df):
    """Collect edit statistics from ``df`` using its ``initial_msg_pred``,
    ``reference`` and ``annotated_diff`` columns.

    Delegates to ``get_statistics_for_df`` and returns its result unchanged.
    """
    return get_statistics_for_df(
        df,
        "initial_msg_pred",
        "reference",
        "annotated_diff",
    )