File size: 2,909 Bytes
461c45d
 
 
2be75e8
 
 
7117f63
461c45d
8e38fa9
2be75e8
66a3725
2be75e8
 
 
 
1662e36
2be75e8
461c45d
 
249a7b0
 
1662e36
249a7b0
66a3725
 
 
 
2f7b0b5
 
 
 
 
249a7b0
 
 
2be75e8
28f08c2
461c45d
28f08c2
66a3725
 
461c45d
28f08c2
66a3725
 
 
28f08c2
41e4b90
e6e88b9
2f7b0b5
b8be992
41e4b90
 
461c45d
249a7b0
28f08c2
461c45d
28f08c2
 
 
 
 
 
 
 
 
 
 
41e4b90
28f08c2
41e4b90
28f08c2
98c3786
 
 
 
 
41e4b90
98c3786
28f08c2
41e4b90
98c3786
8e7670b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset

# Page chrome: wide layout and a sidebar title.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# CSV providing (at least) the `language` and `extension` columns read below.
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group extensions (as strings) under their language, preserving CSV row order.
tags = {}
for language_name, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language_name, []).append(str(extension))
all_languages = list(tags)


@st.cache(max_entries=100)
def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
    """Load the inspection samples for a language/extension pair and filter them.

    The "train" split of ``loubnabnl/the-stack-inspection-data`` is loaded from
    ``data/{language}/{ext}``. An ``ext`` equal to the string ``"nan"`` is
    replaced by ``None`` before the path is formatted.

    Note that, despite the parameter names, a sample is KEPT only when:

    * its ``alphanum_fraction`` is strictly below ``min_alphanum``, and
    * its ``max_line_length`` is strictly above ``max_line_length``, and
    * its ``avg_line_length`` is strictly above ``max_mean_line_length``.

    When ``non_lexable`` is truthy, samples whose ``lexable`` field is truthy
    are additionally dropped.

    Returns the filtered dataset object produced by ``load_dataset(...).filter``.
    """
    if ext == "nan":
        # Presumably a missing extension in the CSV that was stringified
        # upstream -- TODO confirm against the dataset's directory layout.
        ext = None

    dataset = load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )

    def passes_thresholds(sample):
        # Comparisons are kept in their original (un-negated) form and order.
        if not (sample["alphanum_fraction"] < min_alphanum):
            return False
        if not (sample["max_line_length"] > max_line_length):
            return False
        return sample["avg_line_length"] > max_mean_line_length

    dataset = dataset.filter(passes_thresholds)
    if not non_lexable:
        return dataset
    return dataset.filter(lambda sample: not sample["lexable"])

# --- Sidebar: language / extension selection -------------------------------
# Both select boxes are created through `st.sidebar`; the extension choices
# depend on the language picked first.
col1, col2, _ = st.columns([1, 1, 4])
with col1:
    chosen_language = st.sidebar.selectbox(
        label="Select a programming language", options=all_languages, index=0
    )
with col2:
    chosen_ext = st.sidebar.selectbox(
        label="Select an extension", options=tags[chosen_language], index=0
    )

# --- Sidebar: filter thresholds ---------------------------------------------
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1200, 0, step=100)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0, step=100)
# Adjacent literals (instead of a backslash continuation inside the string)
# so the two sentences are separated by a space when rendered.
st.sidebar.markdown(
    "Printed files have `max_line_length`  and `average_line_length` larger than the selected values. "
    "`alphanumeric_fraction` is smaller than the selected value."
)

# --- Main area: load the filtered samples and show one of them --------------
samples = load_data(chosen_language, chosen_ext, min_alphanum, max_line_length, max_mean_line_length, not_lexable)

max_docs = len(samples)

if max_docs > 0:
    col_1, _ = st.columns([3, 3])
    with col_1:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    example = samples[index_example]

    st.markdown("#### File content:")
    content = str(example["content"])

    if len(content) > 10_000:
        # Truncate the string that was just measured (not the raw field), so
        # the length check and the slice always operate on the same value.
        content = content[:10_000] + "\n[MORE CODE, DISPLAYING FIRST 10k CHARACTERS]"

    if example["lexable"]:
        st.code(content, language=chosen_language)
    else:
        # Non-lexable files are shown as plain text, without highlighting.
        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(content)
else:
    st.text("The dataset is empty after the filtering!")