Spaces:

pikto
/

GPT3-Dataset-Generator

Runtime error

App Files Files Community

pikto committed on Aug 11, 2023

Commit

336a071

•

1 Parent(s): 655a0d2

Upload 6 files

Browse files

Files changed (6) hide show

.streamlit/config.toml +6 -0
Gifs/arrow_small_new.gif +0 -0
Gifs/blue_grey_arrow.gif +0 -0
Gifs/boat_new.gif +0 -0
openai.png +0 -0
streamlit_app.py +826 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,6 @@

+[theme]
+base="light"
+# Old primary color (kept for reference):
+#primaryColor="#18447c"
+# New primary color (currently active):
+primaryColor="#2BB5E8"

Gifs/arrow_small_new.gif ADDED Viewed

Gifs/blue_grey_arrow.gif ADDED Viewed

Gifs/boat_new.gif ADDED Viewed

openai.png ADDED Viewed

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,826 @@

+# ----------------------Importing libraries----------------------
+import streamlit as st
+from streamlit_pills import pills
+import pandas as pd
+import openai
+# Imports for AgGrid
+from st_aggrid import AgGrid, GridUpdateMode, JsCode
+from st_aggrid.grid_options_builder import GridOptionsBuilder
+# ----------------------Importing utils.py----------------------
+# For Snowflake (from Tony's utils.py)
+import io
+from utils import (
+    connect_to_snowflake,
+    load_data_to_snowflake,
+    load_data_to_postgres,
+    connect_to_postgres,
+)
+# ----------------------Page config--------------------------------------
+# Set the browser-tab title and icon of the app.
+st.set_page_config(page_title="GPT3 Dataset Generator", page_icon="🤖")
+# ----------------------Page header--------------------------------------
+# Banner animation, then a three-column row: the OpenAI logo goes into the
+# first column (c30) and the app title into the third (c32); the middle
+# column (c31) is never written to.
+st.image("Gifs/boat_new.gif")
+c30, c31, c32 = st.columns([0.2, 0.1, 3])
+with c30:
+    # Empty caption rendered above the logo.
+    st.caption("")
+    st.image("openai.png", width=60)
+with c32:
+    st.title("GPT3 Dataset Generator")
+st.write(
+    "This app generates datasets using GPT3. It was created for the ❄️ Snowflake Snowvation Hackathon"
+)
+# The three tabs that hold the rest of the page content.
+tabMain, tabInfo, tabTo_dos = st.tabs(["Main", "Info", "To-do's"])
+# "Info" tab: static background text on GPT-3 and Streamlit, followed by a
+# list of resource links. Nothing in this tab reads user input.
+with tabInfo:
+    # Two empty writes emitted before the first heading.
+    st.write("")
+    st.write("")
+    st.subheader("🤖 What is GPT-3?")
+    st.markdown(
+        "[GPT-3](https://en.wikipedia.org/wiki/GPT-3) is a large language generation model developed by [OpenAI](https://openai.com/) that can generate human-like text. It has a capacity of 175 billion parameters and is trained on a vast dataset of internet text. It can be used for tasks such as language translation, chatbot language generation, and content generation etc."
+    )
+    st.subheader("🎈 What is Streamlit?")
+    st.markdown(
+        "[Streamlit](https://streamlit.io) is an open-source Python library that allows users to create interactive, web-based data visualization and machine learning applications without the need for extensive web development knowledge"
+    )
+    # Separator between the explanations and the resource list.
+    st.write("---")
+    st.subheader("📖 Resources")
+    # Nested markdown bullet list of external links.
+    st.markdown(
+        """
+    - OpenAI
+        - [OpenAI Playground](https://beta.openai.com/playground)
+        - [OpenAI Documentation](https://beta.openai.com/docs)
+    - Streamlit
+        - [Documentation](https://docs.streamlit.io/)
+        - [Gallery](https://streamlit.io/gallery)
+        - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet)
+        - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science)
+        - Deploy your apps using [Streamlit Community Cloud](https://streamlit.io/cloud) in just a few clicks
+    """
+    )
+# "To-do's" tab: the author's task tracker, shown as three expanders
+# ("To-do", "Done", "Not needed"), each holding a static markdown list.
+with tabTo_dos:
+    # Open items.
+    with st.expander("To-do", expanded=True):
+        st.write(
+            """
+        - [p2] Currently, the results are displayed even if the submit button isn't pressed.
+        - [p2] There is still an issue with the index where the first element from the JSON is not being displayed.
+        - [Post Hackathon] To limit the number of API calls and costs, let's cap the maximum number - of results to 5. Alternatively, we can consider removing the free API key.
+        """
+        )
+        st.write("")
+    # Completed items.
+    with st.expander("Done", expanded=True):
+        st.write(
+            """
+        - [p2] Check if the Json file is working
+        - [p2] On Github, remove any unused images and GIFs.
+        - [p1] Add that for postgress - localhost is required
+        - [p2] Rename the CSV and JSON as per the st-pills variable
+        - [p2] Change the color of the small arrow
+        - [p1] Adjust the size of the Gifs
+        - Add a streamlit badge in the `ReadMe` file
+        - Add the message "Please enter your API key or choose the `Free Key` option."
+        - Include a `ReadMe` file
+        - Add a section for the Snowflake credentials
+        - Remove password from the Python file
+        - Add screenshots to the `ReadMe` file
+        - Include forms in the snowflake postgres section
+        - Remove the hashed code in the Python file
+        - Include additional information in the 'info' tab
+        - p1] Fix the download issue by sorting it via session state
+        - [p1] Make the dataframe from this app editable
+        - Add more gifs to the app
+        - Change the color scheme to Snowflake Blue
+        - Include a section for Snowflake credentials
+        - Change the colors of the arrows, using this tool (https://lottiefiles.com/lottie-to-gif/convert)
+        - Try new prompts and implement the best ones
+        - Add a config file for the color scheme
+        - Include an option menu using this tool (https://github.com/victoryhb/streamlit-option-menu)
+        - Display a message when the API key is not provided
+        - Fix the arrow and rearrange the layout for the API key message
+        - Check and improve the quality of the prompt output
+        - Send the app to Tony and upload it to GitHub
+        - Re-arrange the data on the sidebar
+        - Change the colors of both gifs to match the overall color scheme
+        - Add context about the app being part of the snowvation project
+        - Add a button to convert the data to JSON format
+        - Include the Snowflake logo
+        - Add a submit button to block API calls unless pressed
+        - Add a tab with additional information
+        - Resize the columns in the st.form section
+        - Add the ability to add the dataset to Snowflake
+        - Create a section with pills, showcasing examples
+        - Change the main emoji
+        - Change the emoji in the tab (page_icon)
+        - [INFO] Sort out the issue with credits
+        """
+        )
+        st.write("")
+    # Items that were dropped.
+    with st.expander("Not needed", expanded=True):
+        st.write(
+            """
+            - Check index issue in readcsv (not an issue as I've changed the script)
+            - Add the mouse gif (doesn't fit)
+            - Ask Lukas - automatically resize the columns of a DataFrame
+        """
+        )
+        st.write("")
+    # Three empty writes emitted after the last expander.
+    st.write("")
+    st.write("")
+    st.write("")
+# "Main" tab: API-key selection, example picker and submit counter. The
+# statements that follow this block (form, generation, database loading)
+# are part of the same `with` body.
+with tabMain:
+    # Sidebar switch between a key typed by the user and the key stored in
+    # the app's secrets.
+    key_choice = st.sidebar.radio(
+        "",
+        (
+            "Your Key",
+            "Free Key (capped)",
+        ),
+        horizontal=True,
+    )
+    if key_choice == "Your Key":
+        # Password-style input; an empty string until the user types a key.
+        API_Key = st.sidebar.text_input(
+            "First, enter your OpenAI API key", type="password"
+        )
+    elif key_choice == "Free Key (capped)":
+        API_Key = st.secrets["API_KEY"]
+    # The arrow animation is always drawn first, then cleared again when the
+    # free key is selected.
+    image_arrow = st.sidebar.image(
+        "Gifs/blue_grey_arrow.gif",
+    )
+    if key_choice == "Free Key (capped)":
+        image_arrow.empty()
+    else:
+        st.write("")
+        st.sidebar.caption(
+            "No OpenAI API key? Get yours [here!](https://openai.com/blog/api-no-waitlist/)"
+        )
+        pass
+    st.write("")
+    # NOTE(review): these three columns are created but never written to
+    # before the names are reassigned further down — confirm they are needed.
+    c30, c31, c32 = st.columns([0.2, 0.1, 3])
+    st.subheader("① Build your dataset")
+    # Example picker: the selected label decides which defaults pre-fill the
+    # dataset form. The second list supplies one icon per option.
+    example = pills(
+        "",
+        [
+            "Sci-fi Movies",
+            "Animals",
+            "Pop Songs",
+            "POTUS's Twitter",
+            "Blank",
+        ],
+        [
+            "🍿",
+            "🐎",
+            "🎵",
+            "🇺🇸",
+            "👻",
+        ],
+        label_visibility="collapsed",
+    )
+    # Session-wide count of form submissions; created on the first run only.
+    if "counter" not in st.session_state:
+        st.session_state.counter = 0
+    # Callback for the form's submit button: bumps the submission counter.
+    def increment():
+        st.session_state.counter += 1
+    if example == "Sci-fi Movies":
+        with st.form("my_form"):
+            text_input = st.text_input(
+                "What is the topic of your dataset?", value="Sci-fi movies"
+            )
+            col1, col2, col3 = st.columns(3, gap="small")
+            with col1:
+                column_01 = st.text_input("1st column", value="Title")
+            with col2:
+                column_02 = st.text_input("2nd column", value="Year")
+            with col3:
+                column_03 = st.text_input("3rd column", value="PG rating")
+            col1, col2 = st.columns(2, gap="medium")
+            with col1:
+                number = st.number_input(
+                    "How many rows do you want?",
+                    value=5,
+                    min_value=1,
+                    max_value=20,
+                    step=5,
+                    help="The maximum number of rows is 20.",
+                )
+            with col2:
+                engine = st.radio(
+                    "GPT3 engine",
+                    (
+                        "Davinci",
+                        "Curie",
+                        "Babbage",
+                    ),
+                    horizontal=True,
+                    help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
+                )
+                if engine == "Davinci":
+                    engine = "davinci-instruct-beta-v3"
+                elif engine == "Curie":
+                    engine = "curie-instruct-beta-v2"
+                elif engine == "Babbage":
+                    engine = "babbage-instruct-beta"
+            st.write("")
+            submitted = st.form_submit_button("Build my dataset! ✨", on_click=increment)
+    elif example == "Animals":
+        with st.form("my_form"):
+            text_input = st.text_input(
+                "What is the topic of your dataset?", value="Fastest animals on earth"
+            )
+            col1, col2, col3 = st.columns(3, gap="small")
+            with col1:
+                column_01 = st.text_input("1st column", value="Animal")
+            with col2:
+                column_02 = st.text_input("2nd column", value="Speed")
+            with col3:
+                column_03 = st.text_input("3rd column", value="Weight")
+            col1, col2 = st.columns(2, gap="medium")
+            with col1:
+                number = st.number_input(
+                    "How many rows do you want?",
+                    value=5,
+                    min_value=1,
+                    max_value=20,
+                    step=5,
+                    help="The maximum number of rows is 50.",
+                )
+            with col2:
+                engine = st.radio(
+                    "GPT3 engine",
+                    (
+                        "Davinci",
+                        "Curie",
+                        "Babbage",
+                    ),
+                    horizontal=True,
+                    help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
+                )
+                if engine == "Davinci":
+                    engine = "davinci-instruct-beta-v3"
+                elif engine == "Curie":
+                    engine = "curie-instruct-beta-v2"
+                elif engine == "Babbage":
+                    engine = "babbage-instruct-beta"
+            st.write("")
+            submitted = st.form_submit_button("Build my dataset! ✨", on_click=increment)
+    elif example == "Stocks":
+        with st.form("my_form"):
+            text_input = st.text_input(
+                "What is the topic of your dataset?", value="Stocks"
+            )
+            col1, col2, col3 = st.columns(3, gap="small")
+            with col1:
+                column_01 = st.text_input("1st column", value="Ticker")
+            with col2:
+                column_02 = st.text_input("2nd column", value="Price")
+            with col3:
+                column_03 = st.text_input("3rd column", value="Exchange")
+            col1, col2 = st.columns(2, gap="medium")
+            with col1:
+                number = st.number_input(
+                    "How many rows do you want?",
+                    value=5,
+                    min_value=1,
+                    max_value=20,
+                    step=5,
+                    help="The maximum number of rows is 50.",
+                )
+            with col2:
+                engine = st.radio(
+                    "GPT3 engine",
+                    (
+                        "Davinci",
+                        "Curie",
+                        "Babbage",
+                    ),
+                    horizontal=True,
+                    help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
+                )
+                if engine == "Davinci":
+                    engine = "davinci-instruct-beta-v3"
+                elif engine == "Curie":
+                    engine = "curie-instruct-beta-v2"
+                elif engine == "Babbage":
+                    engine = "babbage-instruct-beta"
+            st.write("")
+            submitted = st.form_submit_button("Build my dataset! ✨", on_click=increment)
+    elif example == "POTUS's Twitter":
+        with st.form("my_form"):
+            text_input = st.text_input(
+                "What is the topic of your dataset?", value="POTUS's Twitter accounts"
+            )
+            col1, col2, col3 = st.columns(3, gap="small")
+            with col1:
+                column_01 = st.text_input("1st column", value="Name")
+            with col2:
+                column_02 = st.text_input("2nd column", value="Twitter handle")
+            with col3:
+                column_03 = st.text_input("3rd column", value="# of followers")
+            col1, col2 = st.columns(2, gap="medium")
+            with col1:
+                number = st.number_input(
+                    "How many rows do you want?",
+                    value=5,
+                    min_value=1,
+                    max_value=20,
+                    step=5,
+                    help="The maximum number of rows is 50.",
+                )
+            with col2:
+                engine = st.radio(
+                    "GPT3 engine",
+                    (
+                        "Davinci",
+                        "Curie",
+                        "Babbage",
+                    ),
+                    horizontal=True,
+                    help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
+                )
+                if engine == "Davinci":
+                    engine = "davinci-instruct-beta-v3"
+                elif engine == "Curie":
+                    engine = "curie-instruct-beta-v2"
+                elif engine == "Babbage":
+                    engine = "babbage-instruct-beta"
+            st.write("")
+            submitted = st.form_submit_button("Build my dataset! ✨")
+    elif example == "Pop Songs":
+        with st.form("my_form"):
+            text_input = st.text_input(
+                "What is the topic of your dataset?",
+                value="Most famous songs of all time",
+            )
+            col1, col2, col3 = st.columns(3, gap="small")
+            with col1:
+                column_01 = st.text_input("1st column", value="Song")
+            with col2:
+                column_02 = st.text_input("2nd column", value="Artist")
+            with col3:
+                column_03 = st.text_input("3rd column", value="Genre")
+            col1, col2 = st.columns(2, gap="medium")
+            with col1:
+                number = st.number_input(
+                    "How many rows do you want?",
+                    value=5,
+                    min_value=1,
+                    max_value=20,
+                    step=5,
+                    help="The maximum number of rows is 50.",
+                )
+            with col2:
+                engine = st.radio(
+                    "GPT3 engine",
+                    (
+                        "Davinci",
+                        "Curie",
+                        "Babbage",
+                    ),
+                    horizontal=True,
+                    help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
+                )
+                if engine == "Davinci":
+                    engine = "davinci-instruct-beta-v3"
+                elif engine == "Curie":
+                    engine = "curie-instruct-beta-v2"
+                elif engine == "Babbage":
+                    engine = "babbage-instruct-beta"
+            st.write("")
+            submitted = st.form_submit_button("Build my dataset! ✨")
+    elif example == "Blank":
+        with st.form("my_form"):
+            text_input = st.text_input("What is the topic of your dataset?", value="")
+            col1, col2, col3 = st.columns(3, gap="small")
+            with col1:
+                column_01 = st.text_input("1st column", value="")
+            with col2:
+                column_02 = st.text_input("2nd column", value="")
+            with col3:
+                column_03 = st.text_input("3rd column", value="")
+            col1, col2 = st.columns(2, gap="medium")
+            with col1:
+                number = st.number_input(
+                    "How many rows do you want?",
+                    value=5,
+                    min_value=1,
+                    max_value=20,
+                    step=5,
+                    help="The maximum number of rows is 50.",
+                )
+            with col2:
+                engine = st.radio(
+                    "GPT3 engine",
+                    (
+                        "Davinci",
+                        "Curie",
+                        "Babbage",
+                    ),
+                    horizontal=True,
+                    help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
+                )
+                if engine == "Davinci":
+                    engine = "davinci-instruct-beta-v3"
+                elif engine == "Curie":
+                    engine = "curie-instruct-beta-v2"
+                elif engine == "Babbage":
+                    engine = "babbage-instruct-beta"
+            st.write("")
+            submitted = st.form_submit_button("Build my dataset! ✨")
+    # ----------------------API key section----------------------------------
+    number = number + 1
+    if not API_Key and not submitted:
+        st.stop()
+    if not API_Key and submitted:
+        st.info("Please enter your API key or choose the `Free Key` option.")
+        st.stop()
+    if st.session_state.counter >= 100:
+        pass
+    # ----------------------API key section----------------------------------
+    if not submitted and st.session_state.counter == 0:
+        c30, c31, c32 = st.columns([1, 0.01, 4])
+        with c30:
+            st.image("Gifs/arrow_small_new.gif")
+            st.caption("")
+        with c32:
+            st.caption("")
+            st.caption("")
+            st.info(
+                "Enter your dataset's criteria and click the button to generate it."
+            )
+            st.stop()
+    elif st.session_state.counter > 0:
+        c30, c31, c32 = st.columns([1, 0.9, 3])
+        openai.api_key = API_Key
+        # ----------------------API call section----------------------------------
+        response = openai.Completion.create(
+            model=engine,
+            prompt=f"Please provide a list of the top {number} {text_input} along with the following information in a three-column spreadsheet: {column_01}, {column_02}, and {column_03}. The columns should be labeled as follows: {column_01} | {column_02} | {column_03}",
+            temperature=0.5,
+            max_tokens=1707,
+            top_p=1,
+            best_of=2,
+            frequency_penalty=0,
+            presence_penalty=0,
+        )
+        st.write("___")
+        st.subheader("② Check the results")
+        with st.expander("See the API Json output"):
+            response
+        output_code = response["choices"][0]["text"]
+        # ----------------------Dataframe section----------------------------------
+        # create pandas DataFrame from string
+        df = pd.read_csv(io.StringIO(output_code), sep="|")
+        # get the number of columns in the dataframe
+        num_columns = len(df.columns)
+        # create a list of column names
+        column_names = ["Column {}".format(i) for i in range(1, num_columns + 1)]
+        # add the header to the dataframe
+        df.columns = column_names
+        # specify the mapping of old column names to new column names
+        column_mapping = {
+            "Column 1": column_01,
+            "Column 2": column_02,
+            "Column 3": column_03,
+        }
+        # rename the columns of the dataframe
+        df = df.rename(columns=column_mapping)
+        st.write("")
+        # ----------------------AgGrid section----------------------------------
+        gd = GridOptionsBuilder.from_dataframe(df)
+        gd.configure_pagination(enabled=True)
+        gd.configure_default_column(editable=True, groupable=True)
+        gd.configure_selection(selection_mode="multiple")
+        gridoptions = gd.build()
+        grid_table = AgGrid(
+            df,
+            gridOptions=gridoptions,
+            update_mode=GridUpdateMode.SELECTION_CHANGED,
+            theme="material",
+        )
+        # df
+        # ----------------------Download section--------------------------------------
+        c30, c31, c32, c33 = st.columns([1, 0.01, 1, 2.5])
+        with c30:
+            @st.cache
+            def convert_df(df):
+                return df.to_csv().encode("utf-8")
+            csv = convert_df(df)
+            st.download_button(
+                label="Download CSV",
+                data=csv,
+                file_name=f"{example} dataset .csv",
+                mime="text/csv",
+            )
+        with c32:
+            json_string = df.to_json(orient="records")
+            st.download_button(
+                label="Download JSON",
+                data=json_string,
+                file_name="data_set_sample.json",
+                mime="text/csv",
+            )
+    st.write("___")
+    st.subheader("③ Load data to Databases")
+    # Let the user pick the target database; the choice decides which
+    # credentials form is rendered below.
+    storage_option = st.radio(
+        "Select data storage option:",
+        (
+            "Snowflake",
+            "PostgreSQL",
+        ),
+        horizontal=True,
+    )
+    @st.cache(allow_output_mutation=True)
+    def reset_form_fields():
+        user = ""
+        password = ""
+        account = ""
+        warehouse = ""
+        database = ""
+        schema = ""
+        table = ""
+        host = ""
+        port = ""
+    # Credentials form and load step for the selected database.
+    # NOTE(review): `df` is whatever the generation step produced on THIS
+    # run. Submitting a form below triggers a rerun, so the rows that get
+    # loaded may differ from the ones shown before — confirm intent.
+    if storage_option == "Snowflake":
+        st.subheader("`Enter Snowflake Credentials`👇")
+        # Get user input for Snowflake credentials
+        # NOTE(review): the username, account, warehouse and database
+        # defaults look like one specific user's settings — confirm they
+        # should ship as defaults.
+        with st.form("my_form_db"):
+            col1, col2 = st.columns(2, gap="small")
+            with col1:
+                user = st.text_input("Username:", value="TONY")
+            with col2:
+                password = st.text_input("Password:", type="password")
+            with col1:
+                account = st.text_input("Account:", value="jn27194.us-east4.gcp")
+            with col2:
+                warehouse = st.text_input("Warehouse:", value="NAH")
+            with col1:
+                database = st.text_input("Database:", value="SNOWVATION")
+            with col2:
+                schema = st.text_input("Schema:", value="PUBLIC")
+            table = st.text_input("Table:")
+            st.write("")
+            # Rebinds `submitted`, which until here held the dataset form's state.
+            submitted = st.form_submit_button("Load to Snowflake")
+        # Load the data to Snowflake
+        if submitted:
+            # Every field must be non-empty before a connection is attempted.
+            if (
+                user
+                and password
+                and account
+                and warehouse
+                and database
+                and schema
+                and table
+            ):
+                conn = connect_to_snowflake(
+                    username=user,
+                    password=password,
+                    account=account,
+                    warehouse=warehouse,
+                    database=database,
+                    schema=schema,
+                )
+                # Load only when the helper returned a truthy connection.
+                if conn:
+                    load_data_to_snowflake(df, conn, table)
+            else:
+                st.warning("Please enter all Snowflake credentials")
+    elif storage_option == "PostgreSQL":
+        st.subheader("`Enter PostgreSQL Credentials`👇")
+        st.error("Localhost only")
+        # Get user input for PostgreSQL credentials
+        with st.form("my_form_db"):
+            col1, col2 = st.columns(2, gap="small")
+            with col1:
+                user = st.text_input("Username:", value="postgres")
+            with col2:
+                password = st.text_input("Password:", type="password")
+            with col1:
+                host = st.selectbox("Host:", ["localhost", "other"])
+                # NOTE(review): this input only exists on runs where "other"
+                # is already selected — verify it is reachable from inside
+                # the form.
+                if host == "other":
+                    host = st.text_input("Enter host:")
+            with col2:
+                port = st.text_input("Port:", value="5432")
+            with col1:
+                database = st.text_input("Database:", value="snowvation")
+            with col2:
+                table = st.text_input("Table:")
+            st.write("")
+            # Rebinds `submitted`, which until here held the dataset form's state.
+            submitted = st.form_submit_button("Load to PostgreSQL")
+        # Load the data to PostgreSQL
+        if submitted:
+            # Every field must be non-empty before a connection is attempted.
+            if user and password and host and port and database and table:
+                conn = connect_to_postgres(
+                    username=user,
+                    password=password,
+                    host=host,
+                    port=port,
+                    database=database,
+                )
+                # Load only when the helper returned a truthy connection.
+                if conn:
+                    load_data_to_postgres(df, conn, table)
+            else:
+                st.warning("Please enter all PostgreSQL credentials and table name")
+    # Called on every run that reaches this point.
+    # NOTE(review): the function body only assigns local variables, so this
+    # call does not appear to reset any form field — confirm it is needed.
+    reset_form_fields()