"""Streamlit app that cleans video transcripts: it strips timestamps, builds a
Markdown outline, produces JSONL-style records, extracts high-information
words, and draws a simple word-relationship graph with Graphviz."""

import re

import nltk
import streamlit as st
from graphviz import Digraph
from nltk import FreqDist
from nltk.corpus import stopwords

# The tokenizer and stopword list are needed by extract_high_information_words.
nltk.download('punkt')
nltk.download('stopwords')


def remove_timestamps(text):
    """Remove transcript timestamps such as '1:42' that sit on their own line."""
    return re.sub(r'\d{1,2}:\d{2}\n', '', text)


def process_text(text):
    """Turn the cleaned transcript into a Markdown outline that alternates
    bold lines and emoji bullet points."""
    processed_lines = [line for line in text.split("\n") if line]
    outline = ""
    for i, line in enumerate(processed_lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"
        else:
            outline += f"- {line} 😄\n"
    return outline


def create_jsonl_list(text):
    """Wrap each non-empty line in a {'text': ...} record, JSON Lines style."""
    return [{"text": line} for line in text.split("\n") if line]


def unit_test(input_text):
    """Run the cleaning pipeline on a sample transcript and display the results."""
    st.write("Test Text without Timestamps:")
    test_text_without_timestamps = remove_timestamps(input_text)
    st.write(test_text_without_timestamps)
    st.write("Test JSONL List:")
    test_jsonl_list = create_jsonl_list(test_text_without_timestamps)
    st.write(test_jsonl_list)


def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic tokens that are not stopwords."""
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]


def create_relationship_graph(words):
    """Build a Graphviz chain graph linking consecutive words in the given order."""
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph


def display_relationship_graph(words):
    """Render the word-chain graph in the Streamlit app."""
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)


text_input = st.text_area("Enter text:", value="", height=300)
text_without_timestamps = remove_timestamps(text_input)
st.markdown("**Text without Timestamps:**")
st.write(text_without_timestamps)

processed_text = process_text(text_without_timestamps)
st.markdown("**Markdown Outline with Emojis:**")
st.markdown(processed_text)

unit_test_text = '''
1:42
program the does very very well on your data then you will achieve the best
1:48
generalization possible with a little bit of modification you can turn it into a precise theorem
1:54
and on a very intuitive level it's easy to see what it should be the case if you
2:01
have some data and you're able to find a shorter program which generates this
2:06
data then you've essentially extracted all the all conceivable regularity from
2:11
this data into your program and then you can use these objects to make the best predictions possible like if if you have
2:19
data which is so complex but there is no way to express it as a shorter program
2:25
then it means that your data is totally random there is no way to extract any regularity from it whatsoever now there
2:32
is little known mathematical theory behind this and the proofs of these statements actually not even that hard
2:38
but the one minor slight disappointment is that it's actually not possible at
2:44
least given today's tools and understanding to find the best short program that explains or generates or
2:52
solves your problem given your data this problem is computationally intractable
'''
unit_test(unit_test_text)

unit_test_text_2 = '''
5 to talk a little bit about reinforcement learning so reinforcement learning is a framework it's a framework of evaluating
6:53
agents in their ability to achieve goals and complicated stochastic environments
6:58
you've got an agent which is plugged into an environment as shown in the figure right here and for any given
7:06
agent you can simply run it many times and compute its average reward now the
7:13
thing that's interesting about the reinforcement learning framework is that there exist interesting useful
7:20
reinforcement learning algorithms the framework existed for a long time it
7:25
became interesting once we realized that good algorithms exist now these are there are perfect algorithms but they
7:31
are good enough todo interesting things and all you want the mathematical
7:37
problem is one where you need to maximize the expected reward now one
7:44
important way in which the reinforcement learning framework is not quite complete is that it assumes that the reward is
7:50
given by the environment you see this picture the agent sends an action while
7:56
the reward sends it an observation in a both the observation and the reward backwards that's what the environment
8:01
communicates back the way in which this is not the case in the real world is that we figure out
8:11
what the reward is from the observation we reward ourselves we are not told
8:16
environment doesn't say hey here's some negative reward it's our interpretation over census that lets us determine what
8:23
the reward is and there is only one real true reward in life and this is
8:28
existence or nonexistence and everything else is a corollary of that so well what
8:35
should our agent be you already know the answer should be a neural network because whenever you want to do
8:41
something dense it's going to be a neural network and you want the agent to map observations to actions so you let
8:47
it be parametrized with a neural net and you apply learning algorithm so I want to explain to you how reinforcement
8:53
learning works this is model free reinforcement learning the reinforcement learning has actually been used in practice everywhere but it's
'''
unit_test(unit_test_text_2)

unit_test_text_3 = '''
ort try something new add
9:17
randomness directions and compare the result to your expectation if the result
9:25
surprises you if you find that the results exceeded your expectation then
9:31
change your parameters to take those actions in the future that's it this is
9:36
the fool idea of reinforcement learning try it out see if you like it and if you do do more of that in the future and
9:44
that's it that's literally it this is the core idea now it turns out it's not
9:49
difficult to formalize mathematically but this is really what's going on if in a neural network
'''
unit_test(unit_test_text_3)

# Additional analysis: extract the high-information words from the second
# sample transcript and draw a relationship graph over them.
sample_without_timestamps = remove_timestamps(unit_test_text_2)
top_words = extract_high_information_words(sample_without_timestamps, 10)
st.markdown("**Top 10 High Information Words:**")
st.write(top_words)

st.markdown("**Relationship Graph:**")
display_relationship_graph(top_words)
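# Usage sketch (not part of the original app logic). Assumptions: the script
# is saved as app.py and the streamlit, nltk, and graphviz packages are
# installed. Launch it with:
#   streamlit run app.py
# The nltk.download() calls above fetch the 'punkt' tokenizer and the English
# stopword list, so the first run needs network access.
#
# The helper below is an illustrative sketch of how the records returned by
# create_jsonl_list could be persisted as an actual .jsonl file using the
# standard-library json module; it is defined for reference only and is not
# called anywhere in the app.
import json


def write_jsonl(records, path):
    """Write a list of dicts to disk, one JSON object per line (JSON Lines)."""
    with open(path, "w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")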