Spaces:
Sleeping
Sleeping
File size: 2,909 Bytes
461c45d 2be75e8 7117f63 461c45d 8e38fa9 2be75e8 66a3725 2be75e8 1662e36 2be75e8 461c45d 249a7b0 1662e36 249a7b0 66a3725 2f7b0b5 249a7b0 2be75e8 28f08c2 461c45d 28f08c2 66a3725 461c45d 28f08c2 66a3725 28f08c2 41e4b90 e6e88b9 2f7b0b5 b8be992 41e4b90 461c45d 249a7b0 28f08c2 461c45d 28f08c2 41e4b90 28f08c2 41e4b90 28f08c2 98c3786 41e4b90 98c3786 28f08c2 41e4b90 98c3786 8e7670b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset
# Page chrome: wide layout and a sidebar title.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# The CSV is read for its "language" and "extension" columns.
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group extensions by language, keeping the CSV row order. Every extension is
# stored as a string, so a missing value (NaN) ends up as the text "nan".
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(str(extension))

# Languages in order of first appearance in the CSV.
all_languages = [*tags]
@st.cache(max_entries=100)
def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
    """Load the inspection samples for one language/extension and filter them.

    The "train" split of ``loubnabnl/the-stack-inspection-data`` is loaded from
    ``data/{language}/{ext}``. An ``ext`` equal to the string ``"nan"`` is
    replaced by ``None`` before being interpolated into that path.

    A sample is kept only when all three conditions hold:

    * ``alphanum_fraction`` is strictly below ``min_alphanum``,
    * ``max_line_length`` is strictly above ``max_line_length``,
    * ``avg_line_length`` is strictly above ``max_mean_line_length``.

    Note that, despite their names, the arguments act as an exclusive upper
    bound (``min_alphanum``) and exclusive lower bounds (the two line-length
    arguments).

    When ``non_lexable`` is truthy, samples whose ``lexable`` field is truthy
    are dropped as well.

    Returns the filtered dataset object.
    """
    if ext == "nan":
        ext = None

    def passes_thresholds(sample):
        # Same short-circuit chain as the sidebar help text describes.
        return (
            sample["alphanum_fraction"] < min_alphanum
            and sample["max_line_length"] > max_line_length
            and sample["avg_line_length"] > max_mean_line_length
        )

    dataset = load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )
    filtered = dataset.filter(passes_thresholds)

    if not non_lexable:
        return filtered
    # Second pass: keep only the files that could not be lexed.
    return filtered.filter(lambda sample: not sample["lexable"])
# --- Sidebar: language / extension pickers ----------------------------------
# These widgets are created through `st.sidebar`, so they render in the
# sidebar directly; wrapping them in main-area column contexts had no effect
# on their placement and only added an empty row to the page.
chosen_language = st.sidebar.selectbox(
    label="Select a programming language", options=all_languages, index=0
)
# The extension choices depend on the language picked above.
chosen_ext = st.sidebar.selectbox(
    label="Select an extension", options=tags[chosen_language], index=0
)

# --- Sidebar: filters -------------------------------------------------------
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
# Defaults (1.0, 0, 0) are the loosest settings for the strict `<` / `>`
# comparisons applied in `load_data`.
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1200, 0, step=100)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0, step=100)
# Adjacent literals (instead of a backslash continuation inside the string)
# keep a space between the two sentences.
st.sidebar.markdown(
    "Printed files have `max_line_length` and `average_line_length` larger than the selected values. "
    "`alphanumeric_fraction` is smaller than the selected value."
)

# --- Main area: pick and display one file -----------------------------------
samples = load_data(
    chosen_language,
    chosen_ext,
    min_alphanum,
    max_line_length,
    max_mean_line_length,
    not_lexable,
)
max_docs = len(samples)

if max_docs > 0:
    # Put the index picker in the left half of the page.
    col_1, _ = st.columns([3, 3])
    with col_1:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    example = samples[index_example]
    st.markdown("#### File content:")

    # Stringify once and truncate that same string, so non-string content
    # cannot break the slice/concatenation below.
    content = str(example["content"])
    if len(content) > 10_000:
        content = content[:10_000] + "\n[MORE CODE, DISPLAYING FIRST 10k CHARACTERS]"

    if example["lexable"]:
        st.code(content, language=chosen_language)
    else:
        # Files flagged as not lexable are shown as plain text.
        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(content)
else:
    st.text("The dataset is empty after the filtering!")