tonic committed on
Commit
e7ece9c
β€’
1 Parent(s): 1d0ed3f

adding scitonic demo

Browse files
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Tonic AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,82 @@
1
- ---
2
- title: Scitonic
3
- emoji: 🐒
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.15.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🌟 Sci-Tonic: Your Ultimate Technical Research Assistant πŸš€
2
+
3
+ Welcome to **Sci-Tonic** πŸŽ‰, the groundbreaking technical research assistant designed for professionals, researchers, and enthusiasts alike! If you're looking to dive deep into the world of data, ranging from financial figures πŸ“ˆ to scientific articles 🧬, and transform them into insightful, long-form multimedia outputs πŸ“ŠπŸ“š, you've just found your new best friend! πŸ€–πŸ‘©β€πŸ”¬πŸ‘¨β€πŸ’Ό
4
+
5
+ ## Features 🌈
6
+
7
+ Sci-Tonic is packed with amazing features:
8
+
9
+ - **Data Retrieval**: Effortlessly fetch data from a vast array of sources. Financial reports, scientific papers, complex texts - you name it, Sci-Tonic retrieves it! πŸŒπŸ”
10
+ - **Advanced Analysis**: Using cutting-edge AI, Sci-Tonic analyzes and interprets your data, providing you with deep insights. πŸ§ πŸ’‘
11
+ - **Multimedia Output**: Get your results the way you want them. Text, infographics, video summaries - Sci-Tonic does it all! πŸ“πŸŽ₯πŸ“Š
12
+ - **User-Friendly Interface**: Whether you're a tech guru or a newbie, our intuitive interface makes your research journey smooth and enjoyable. πŸ–₯️😊
13
+ - **Collaboration Tools**: Teamwork makes the dream work! Collaborate seamlessly with colleagues or classmates. πŸ‘₯🀝
14
+
15
+ ## Getting Started 🚦
16
+
17
+
18
+ ### Installation πŸ“₯
19
+ ```bash
20
+ # Clone the repository
21
+ git clone https://github.com/Tonic-AI/scitonic.git
22
+
23
+ # Navigate to the repository
24
+ cd scitonic
25
+
26
+ # Install dependencies
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ ## Usage πŸ–₯️
31
+ - **Set Up Your Database**: Follow our step-by-step guide to initialize your databases.
32
+ - **Import Data**: Utilize our tools to import and cleanse your data.
33
+ - **Analyze and Query**: Access a range of pre-built queries or create your own for tailored insights.
34
+ - **Visualize Results**: Use our multimedia tools to visualize and present your findings.
35
+
36
+ # CONTRIBUTING GUIDE
37
+
38
+ ## Introduction
39
+ Welcome to the `multitonic` repository! This guide is designed to provide a streamlined process for contributing to our project. We value your input and are excited to collaborate with you.
40
+
41
+ ## Prerequisites
42
+ Before contributing, make sure you have a GitHub account. You should also join our Tonic-AI Discord to communicate with other contributors and the core team.
43
+
44
+ ## How to Contribute
45
+
46
+ ### Reporting Issues
47
+ - **Create an Issue**: If you find a bug or have a feature request, please create an issue to report it. Use clear and descriptive titles and provide as much information as possible.
48
+ - **Use the Issue Template**: Follow the issue template provided to ensure all relevant information is included.
49
+ - **Discuss in Discord**: For immediate feedback or discussion, bring up your issue in the `#multitonic-discussion` channel on Discord.
50
+
51
+ ### Making Changes
52
+ - **Fork the Repository**: Start by forking the repository to your own GitHub account.
53
+ - **Create a Branch**: Create a branch in your forked repository for your proposed changes. Name the branch something relevant to the changes you're making (e.g., `feature-add-login` or `bugfix-header-alignment`).
54
+ ```bash
55
+ git checkout -b your-branch-name
56
+ ```
57
+ - **Make Your Changes**: Perform the necessary changes to the codebase or documentation.
58
+ - **Commit Your Changes**: Use meaningful commit messages that describe what you've done.
59
+
60
+ ```bash
61
+ git commit -m "Your detailed commit message"
62
+ ```
63
+
64
+ - **Push to Your Fork**: Push your changes to your forked repository on GitHub.
65
+
66
+ ```bash
67
+ git push origin your-branch-name
68
+ ```
69
+
70
+ ### Submitting a Pull Request
71
+ - **Pull Request (PR)**: Go to the original `multitonic` repository and click on "Pull Request" to start the process.
72
+ - **PR Template**: Fill in the PR template with all the necessary details, linking the issue you're addressing.
73
+ - **Code Review**: Wait for the core team or community to review your PR. Be responsive to feedback.
74
+ - **Merge**: Once your PR has been approved and passes all checks, it will be merged into the main codebase.
75
+
76
+ ## Code of Conduct
77
+ Please adhere to the Code of Conduct laid out in the `CODE_OF_CONDUCT.md` [file](src/documentation/CODE_OF_CONDUCT.md). Respectful collaboration is key to a healthy open-source environment.
78
+
79
+ ## Questions or Additional Help
80
+ If you need further assistance or have any questions, please don't hesitate to ask in our Discord community or directly in GitHub issues.
81
+
82
+ Thank you for contributing to `multitonic`!
eval/Tonicvalidate.ipynb ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6300c7d8-d6c8-4178-94c5-4ee767cfd825",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Defaulting to user installation because normal site-packages is not writeable\n",
14
+ "Collecting tonic-validate\n",
15
+ " Downloading tonic_validate-2.1.1-py3-none-any.whl (14 kB)\n",
16
+ "Requirement already satisfied: openai>=1.0.0 in /home/mn/.local/lib/python3.10/site-packages (from tonic-validate) (1.4.0)\n",
17
+ "Requirement already satisfied: pandas>=1.2.3 in /home/mn/.local/lib/python3.10/site-packages (from tonic-validate) (2.1.4)\n",
18
+ "Requirement already satisfied: sniffio in /home/mn/.local/lib/python3.10/site-packages (from openai>=1.0.0->tonic-validate) (1.3.0)\n",
19
+ "Requirement already satisfied: tqdm>4 in /home/mn/.local/lib/python3.10/site-packages (from openai>=1.0.0->tonic-validate) (4.66.1)\n",
20
+ "Requirement already satisfied: typing-extensions<5,>=4.5 in /home/mn/.local/lib/python3.10/site-packages (from openai>=1.0.0->tonic-validate) (4.8.0)\n",
21
+ "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.0.0->tonic-validate) (1.7.0)\n",
22
+ "Requirement already satisfied: pydantic<3,>=1.9.0 in /home/mn/.local/lib/python3.10/site-packages (from openai>=1.0.0->tonic-validate) (2.5.2)\n",
23
+ "Requirement already satisfied: httpx<1,>=0.23.0 in /home/mn/.local/lib/python3.10/site-packages (from openai>=1.0.0->tonic-validate) (0.25.2)\n",
24
+ "Requirement already satisfied: anyio<5,>=3.5.0 in /home/mn/.local/lib/python3.10/site-packages (from openai>=1.0.0->tonic-validate) (3.7.1)\n",
25
+ "Requirement already satisfied: tzdata>=2022.1 in /home/mn/.local/lib/python3.10/site-packages (from pandas>=1.2.3->tonic-validate) (2023.3)\n",
26
+ "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3/dist-packages (from pandas>=1.2.3->tonic-validate) (2022.1)\n",
27
+ "Requirement already satisfied: numpy<2,>=1.22.4 in /home/mn/.local/lib/python3.10/site-packages (from pandas>=1.2.3->tonic-validate) (1.26.2)\n",
28
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /home/mn/.local/lib/python3.10/site-packages (from pandas>=1.2.3->tonic-validate) (2.8.2)\n",
29
+ "Requirement already satisfied: idna>=2.8 in /usr/lib/python3/dist-packages (from anyio<5,>=3.5.0->openai>=1.0.0->tonic-validate) (3.3)\n",
30
+ "Requirement already satisfied: exceptiongroup in /home/mn/.local/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai>=1.0.0->tonic-validate) (1.2.0)\n",
31
+ "Requirement already satisfied: httpcore==1.* in /home/mn/.local/lib/python3.10/site-packages (from httpx<1,>=0.23.0->openai>=1.0.0->tonic-validate) (1.0.2)\n",
32
+ "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from httpx<1,>=0.23.0->openai>=1.0.0->tonic-validate) (2020.6.20)\n",
33
+ "Requirement already satisfied: h11<0.15,>=0.13 in /home/mn/.local/lib/python3.10/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.0.0->tonic-validate) (0.14.0)\n",
34
+ "Requirement already satisfied: annotated-types>=0.4.0 in /home/mn/.local/lib/python3.10/site-packages (from pydantic<3,>=1.9.0->openai>=1.0.0->tonic-validate) (0.6.0)\n",
35
+ "Requirement already satisfied: pydantic-core==2.14.5 in /home/mn/.local/lib/python3.10/site-packages (from pydantic<3,>=1.9.0->openai>=1.0.0->tonic-validate) (2.14.5)\n",
36
+ "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas>=1.2.3->tonic-validate) (1.16.0)\n",
37
+ "Installing collected packages: tonic-validate\n",
38
+ "Successfully installed tonic-validate-2.1.1\n",
39
+ "Note: you may need to restart the kernel to use updated packages.\n"
40
+ ]
41
+ }
42
+ ],
43
+ "source": [
44
+ "pip install tonic-validate"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 10,
50
+ "id": "31b2d0ec-def3-4df9-9ab6-ba46810590a2",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "import openai\n"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 11,
60
+ "id": "5acf03c7-9e4b-421d-a808-a97ab3da4ed7",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
65
+ "from langchain.document_loaders import TextLoader\n",
66
+ "from langchain.vectorstores import Chroma\n",
67
+ "import os\n",
68
+ "from langchain.document_loaders import DirectoryLoader\n",
69
+ "from dotenv import load_dotenv\n",
70
+ "from langchain.llms import OpenAI\n",
71
+ "import json\n",
72
+ "from langchain.embeddings.openai import OpenAIEmbeddings\n",
73
+ "from langchain.chat_models import ChatOpenAI\n",
74
+ "from langchain.chains import RetrievalQA\n",
75
+ "import pandas as pd"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 12,
81
+ "id": "7f23e6c0-0c41-4437-a8ec-ee09e1805e3a",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stdout",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "page_content='According to a study from the University of Cambridge, at least half of developers’ efforts are spent debugging and not actively programming, which costs the software industry an estimated $312 billion per year. But so far, only a handful of code-generating AI systems have been made freely available to the public β€” reflecting the commercial incentives of the organizations building them (see: Replit).\\n\\nStarCoder, which by contrast is licensed to allow for royalty-free use by anyone, including corporations, was trained on over 80 programming languages as well as text from GitHub repositories, including documentation and programming notebooks. StarCoder integrates with Microsoft’s Visual Studio Code code editor and, like OpenAI’s ChatGPT, can follow basic instructions (e.g., β€œcreate an app UI”) and answer questions about code.' metadata={'source': 'new_articles/05-04-hugging-face-and-servicenow-release-a-free-code-generating-model.txt'}\n"
89
+ ]
90
+ }
91
+ ],
92
+ "source": [
93
+ "#text splitting\n",
94
+ "loader = DirectoryLoader('./new_articles/', glob=\"./*.txt\", loader_cls=TextLoader)\n",
95
+ "documents = loader.load()\n",
96
+ "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
97
+ "texts = text_splitter.split_documents(documents)\n",
98
+ "print(texts[1]) "
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 13,
104
+ "id": "6bcbef63-0c39-4236-bb27-87e11dbee5ff",
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "data": {
109
+ "text/plain": [
110
+ "True"
111
+ ]
112
+ },
113
+ "execution_count": 13,
114
+ "metadata": {},
115
+ "output_type": "execute_result"
116
+ }
117
+ ],
118
+ "source": [
119
+ "# load environment variables (keys)\n",
120
+ "load_dotenv()"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 14,
126
+ "id": "84d648f5-c84a-40b6-9765-9f04d615906d",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "from openai import OpenAI\n",
131
+ "OPENAI_API_KEY= os.getenv(\"OPENAI_API_KEY\")\n",
132
+ "# embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)\n",
133
+ "embedding_function = OpenAIEmbeddings(\n",
134
+ "api_key=OPENAI_API_KEY)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 15,
140
+ "id": "1b08991b-6190-41c2-883c-05360d7e4682",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "vectordb = Chroma.from_documents(documents=texts, \n",
145
+ " embedding=OpenAIEmbeddings())"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 33,
151
+ "id": "1a845b21-278a-499e-bddc-735577960e2c",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0.8)\n",
156
+ "chain = RetrievalQA.from_chain_type(llm=llm,\n",
157
+ " chain_type=\"stuff\",\n",
158
+ " retriever=vectordb.as_retriever())\n",
159
+ "from langchain.chains.question_answering import load_qa_chain\n",
160
+ "chain = load_qa_chain(llm, chain_type=\"stuff\")"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 17,
166
+ "id": "89166788-2e9f-48ab-8d17-7dd353b0ff52",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "# Load 10 questions and answers about the news articles as a benchmark for how the RAG system should answer questions.\n",
171
+ "with open(\"question_and_answer_list.json\", \"r\") as f:\n",
172
+ " question_and_answer_list =json.load(f)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 18,
178
+ "id": "238595b9-be93-4b8b-9de6-88f042fb3596",
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "data": {
183
+ "text/plain": [
184
+ "'What is ChatGPT, and how has it been used in various applications?'"
185
+ ]
186
+ },
187
+ "execution_count": 18,
188
+ "metadata": {},
189
+ "output_type": "execute_result"
190
+ }
191
+ ],
192
+ "source": [
193
+ "ex_q_and_a = question_and_answer_list[1]\n",
194
+ "ex_q_and_a[\"question\"]"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 21,
200
+ "id": "8baa0b92-5816-4746-9928-2324ca9e2962",
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "data": {
205
+ "text/plain": [
206
+ "'ChatGPT is a text-generating AI chatbot developed by OpenAI. It has been widely used for writing essays, code, and more based on short text prompts, enhancing productivity. Major brands have experimented with it for generating ad and marketing copy. OpenAI continually invests in ChatGPT, upgrading it to GPT-4, a more advanced language-writing model. The chatbot has been integrated into various applications, including search engines, customer service, and even an iPhone customization app called SuperChat.'"
207
+ ]
208
+ },
209
+ "execution_count": 21,
210
+ "metadata": {},
211
+ "output_type": "execute_result"
212
+ }
213
+ ],
214
+ "source": [
215
+ "ex_q_and_a[\"answer\"]"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 22,
221
+ "id": "d545ac1d-cd2f-4704-86f7-da1af5ecc8f7",
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "from tonic_validate import ValidateApi, ValidateScorer, Benchmark, LLMResponse\n",
226
+ "from tonic_validate.metrics import AnswerConsistencyMetric, AugmentationAccuracyMetric"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 23,
232
+ "id": "6d2c0ecc-bb57-488a-965f-1ea1bff1137b",
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": [
236
+ "from tonic_validate.validate_scorer import ValidateScorer \n",
237
+ "# metrics\n",
238
+ "from tonic_validate.metrics.answer_consistency_metric import AnswerConsistencyMetric\n",
239
+ "from tonic_validate.metrics.answer_similarity_metric import AnswerSimilarityMetric\n",
240
+ "from tonic_validate.metrics.augmentation_accuracy_metric import AugmentationAccuracyMetric\n",
241
+ "from tonic_validate.metrics.augmentation_precision_metric import AugmentationPrecisionMetric\n",
242
+ "from tonic_validate.metrics.retrieval_precision_metric import RetrievalPrecisionMetric\n",
243
+ "# llm utils\n",
244
+ "from tonic_validate.classes.llm_response import LLMResponse\n",
245
+ "from tonic_validate.classes.benchmark import BenchmarkItem"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 24,
251
+ "id": "118570f4-e116-4462-a6fd-01cf4fb0ed4a",
252
+ "metadata": {},
253
+ "outputs": [],
254
+ "source": [
255
+ "metrics = [\n",
256
+ " AnswerSimilarityMetric(),\n",
257
+ " RetrievalPrecisionMetric(),\n",
258
+ " AugmentationAccuracyMetric(),\n",
259
+ " AugmentationPrecisionMetric(),\n",
260
+ " AnswerConsistencyMetric()\n",
261
+ "]\n",
262
+ "# can use an OpenAI chat completion model\n",
263
+ "# llm_evaluator = \"gpt-3.5-turbo\"\n",
264
+ "llm_evaluator = \"gpt-4-1106-preview\"\n",
265
+ "validate_scorer = ValidateScorer(\n",
266
+ " metrics, llm_evaluator\n",
267
+ ")\n"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "markdown",
272
+ "id": "21c86d7f-de97-4fc2-801a-3a963825f88e",
273
+ "metadata": {},
274
+ "source": [
275
"For one benchmark"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 34,
281
+ "id": "de465c2e-b0f4-453d-82f9-d5017f8b300f",
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "# example BenchmarkItem\n",
286
+ "question = ex_q_and_a[\"question\"]\n",
287
+ "reference_answer = ex_q_and_a[\"answer\"]\n",
288
+ "benchmark_item = BenchmarkItem(\n",
289
+ " question=question,\n",
290
+ " answer=reference_answer\n",
291
+ ")\n",
292
+ "\n",
293
+ "# example LLMResponse\n",
294
+ "llm_answer = response\n",
295
+ "context_list = [document.page_content for document in matching_docs]\n",
296
+ "llm_response = LLMResponse(\n",
297
+ " llm_answer=llm_answer,\n",
298
+ " llm_context_list=context_list,\n",
299
+ " benchmark_item=benchmark_item\n",
300
+ ")\n",
301
+ "\n",
302
+ "responses = [llm_response]\n",
303
+ "\n",
304
+ "response_scores = validate_scorer.score_run(responses)"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 35,
310
+ "id": "094656b2-01e6-497f-b132-e46e7068fa0c",
311
+ "metadata": {},
312
+ "outputs": [
313
+ {
314
+ "data": {
315
+ "text/plain": [
316
+ "{'answer_similarity': 4.0,\n",
317
+ " 'retrieval_precision': 1.0,\n",
318
+ " 'augmentation_accuracy': 0.5,\n",
319
+ " 'augmentation_precision': 0.5,\n",
320
+ " 'answer_consistency': 1.0}"
321
+ ]
322
+ },
323
+ "execution_count": 35,
324
+ "metadata": {},
325
+ "output_type": "execute_result"
326
+ }
327
+ ],
328
+ "source": [
329
+ "response_scores.run_data[0].scores"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "raw",
334
+ "id": "2b4ee7b4-a1ca-4b1d-9e2a-7491105d0349",
335
+ "metadata": {},
336
+ "source": [
337
"For multiple benchmarks"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 30,
343
+ "id": "e27f4bd2-aef0-45ed-b239-9a587521ca90",
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "# responses = []\n",
348
+ "\n",
349
+ "# for q_and_a in question_and_answer_list:\n",
350
+ "# query = ex_q_and_a[\"question\"]\n",
351
+ "# matching_docs =vectordb.similarity_search(query)\n",
352
+ "# response=chain.run(input_documents=matching_docs, question=query)\n",
353
+ " \n",
354
+ "# benchmark_item = BenchmarkItem(\n",
355
+ "# question=q_and_a[\"question\"],\n",
356
+ "# answer=q_and_a[\"answer\"]\n",
357
+ "# )\n",
358
+ "\n",
359
+ "# llm_response = LLMResponse(\n",
360
+ "# llm_answer=response,\n",
361
+ "# llm_context_list=[document.page_content for document in matching_docs],\n",
362
+ "# benchmark_item=benchmark_item\n",
363
+ "# )\n",
364
+ "\n",
365
+ "# responses.append(llm_response)"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 31,
371
+ "id": "f4bdd2ac-630d-4313-975b-8220fc69fea6",
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": [
375
+ "# response_scores = validate_scorer.score_run(responses)"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 36,
381
+ "id": "7bd5591a-faf9-47ec-82bb-9bbe694a3029",
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": [
385
+ "validate_api = ValidateApi(os.getenv(\"TONIC_VALIDATE_API_KEY\"))  # NOTE(review): a live API key was hardcoded here and committed; it should be revoked and loaded from the environment instead\n",
386
+ "validate_api.upload_run(\"944f02fe-c106-45c0-af5d-74e4bd0518b7\", response_scores)"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 17,
392
+ "id": "6147a691-a919-4a4f-9ba7-00ed3c0efb88",
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "# def make_scores_df(response_scores):\n",
397
+ "# scores_df = {\n",
398
+ "# \"question\": [],\n",
399
+ "# \"reference_answer\": [],\n",
400
+ "# \"llm_answer\": [],\n",
401
+ "# \"retrieved_context\": []\n",
402
+ "# }\n",
403
+ "# for score_name in response_scores.overall_scores:\n",
404
+ "# scores_df[score_name] = []\n",
405
+ "# for data in response_scores.run_data:\n",
406
+ "# scores_df[\"question\"].append(data.reference_question)\n",
407
+ "# scores_df[\"reference_answer\"].append(data.reference_answer)\n",
408
+ "# scores_df[\"llm_answer\"].append(data.llm_answer)\n",
409
+ "# scores_df[\"retrieved_context\"].append(data.llm_context)\n",
410
+ "# for score_name, score in data.scores.items():\n",
411
+ "# scores_df[score_name].append(score)\n",
412
+ "# return pd.DataFrame(scores_df)\n",
413
+ " \n"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": 18,
419
+ "id": "d94ac537-38d6-401d-b9bb-55a7f1c5598b",
420
+ "metadata": {},
421
+ "outputs": [],
422
+ "source": [
423
+ "# scores_df = make_scores_df(response_scores)"
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 21,
429
+ "id": "a8179a51-6d77-4599-9b30-4553faf29d2d",
430
+ "metadata": {},
431
+ "outputs": [
432
+ {
433
+ "data": {
434
+ "text/html": [
435
+ "<div>\n",
436
+ "<style scoped>\n",
437
+ " .dataframe tbody tr th:only-of-type {\n",
438
+ " vertical-align: middle;\n",
439
+ " }\n",
440
+ "\n",
441
+ " .dataframe tbody tr th {\n",
442
+ " vertical-align: top;\n",
443
+ " }\n",
444
+ "\n",
445
+ " .dataframe thead th {\n",
446
+ " text-align: right;\n",
447
+ " }\n",
448
+ "</style>\n",
449
+ "<table border=\"1\" class=\"dataframe\">\n",
450
+ " <thead>\n",
451
+ " <tr style=\"text-align: right;\">\n",
452
+ " <th></th>\n",
453
+ " <th>question</th>\n",
454
+ " <th>reference_answer</th>\n",
455
+ " <th>llm_answer</th>\n",
456
+ " <th>retrieved_context</th>\n",
457
+ " <th>answer_similarity</th>\n",
458
+ " <th>retrieval_precision</th>\n",
459
+ " <th>augmentation_accuracy</th>\n",
460
+ " <th>augmentation_precision</th>\n",
461
+ " <th>answer_consistency</th>\n",
462
+ " </tr>\n",
463
+ " </thead>\n",
464
+ " <tbody>\n",
465
+ " <tr>\n",
466
+ " <th>0</th>\n",
467
+ " <td>What does Pando plan to use the $30 million ra...</td>\n",
468
+ " <td>Pando intends to use the funds for expanding i...</td>\n",
469
+ " <td>Pando plans to use the $30 million raised in i...</td>\n",
470
+ " <td>[Signaling that investments in the supply chai...</td>\n",
471
+ " <td>4.0</td>\n",
472
+ " <td>0.5</td>\n",
473
+ " <td>0.25</td>\n",
474
+ " <td>0.5</td>\n",
475
+ " <td>1.0</td>\n",
476
+ " </tr>\n",
477
+ " </tbody>\n",
478
+ "</table>\n",
479
+ "</div>"
480
+ ],
481
+ "text/plain": [
482
+ " question \\\n",
483
+ "0 What does Pando plan to use the $30 million ra... \n",
484
+ "\n",
485
+ " reference_answer \\\n",
486
+ "0 Pando intends to use the funds for expanding i... \n",
487
+ "\n",
488
+ " llm_answer \\\n",
489
+ "0 Pando plans to use the $30 million raised in i... \n",
490
+ "\n",
491
+ " retrieved_context answer_similarity \\\n",
492
+ "0 [Signaling that investments in the supply chai... 4.0 \n",
493
+ "\n",
494
+ " retrieval_precision augmentation_accuracy augmentation_precision \\\n",
495
+ "0 0.5 0.25 0.5 \n",
496
+ "\n",
497
+ " answer_consistency \n",
498
+ "0 1.0 "
499
+ ]
500
+ },
501
+ "execution_count": 21,
502
+ "metadata": {},
503
+ "output_type": "execute_result"
504
+ }
505
+ ],
506
+ "source": [
507
+ "# scores_df.head()"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": null,
513
+ "id": "3c1f4bce-d84e-4e9f-a422-c1d146eb1a32",
514
+ "metadata": {},
515
+ "outputs": [],
516
+ "source": []
517
+ }
518
+ ],
519
+ "metadata": {
520
+ "kernelspec": {
521
+ "display_name": "Python 3 (ipykernel)",
522
+ "language": "python",
523
+ "name": "python3"
524
+ },
525
+ "language_info": {
526
+ "codemirror_mode": {
527
+ "name": "ipython",
528
+ "version": 3
529
+ },
530
+ "file_extension": ".py",
531
+ "mimetype": "text/x-python",
532
+ "name": "python",
533
+ "nbconvert_exporter": "python",
534
+ "pygments_lexer": "ipython3",
535
+ "version": "3.10.12"
536
+ }
537
+ },
538
+ "nbformat": 4,
539
+ "nbformat_minor": 5
540
+ }
eval/e5connector.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
eval/new_articles.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2d127bf68a398131f7a613130f3ea70195cd417d8408bd995fe06d4b29fb5c3
3
+ size 39304
eval/question_and_answer_list.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "question": "What does Pando plan to use the $30 million raised in its recent Series B round for?",
4
+ "answer": "Pando intends to use the funds for expanding its global sales, marketing, and delivery capabilities.",
5
+ "reference_article": "Pando Raises $30M in Series B Funding for Fulfillment Management Technologies",
6
+ "reference_text": "Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million. Iron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities."
7
+
8
+ },
9
+ {
10
+ "question": "What is ChatGPT, and how has it been used in various applications?",
11
+ "answer": "ChatGPT is a text-generating AI chatbot developed by OpenAI. It has been widely used for writing essays, code, and more based on short text prompts, enhancing productivity. Major brands have experimented with it for generating ad and marketing copy. OpenAI continually invests in ChatGPT, upgrading it to GPT-4, a more advanced language-writing model. The chatbot has been integrated into various applications, including search engines, customer service, and even an iPhone customization app called SuperChat.",
12
+ "reference_article": "ChatGPT: Everything you need to know about the AI-powered chatbot",
13
+ "reference_text": "ChatGPT, OpenAI’s text-generating AI chatbot, has taken the world by storm. It’s able to write essays, code and more given short text prompts, hyper-charging productivity. But it also has a more…nefarious side... (full article)"
14
+ },
15
+ {
16
+ "question": "What is Checks, and how is it transitioning within Google?",
17
+ "answer": "Checks is an AI-powered tool developed at Google's in-house incubator Area 120 to check mobile apps for compliance with privacy rules and regulations. Originally part of Area 120, Checks is now officially moving into Google as a privacy product for mobile developers. Co-founders Fergus Hurley and Nia Castelly will hold the titles of GM and Legal Lead, respectively, for Checks under Google. The tool utilizes artificial intelligence and machine learning to scan apps and their code, identifying potential privacy and data protection rule violations. It provides suggestions for remediation, making it easier for developers to ensure compliance.",
18
+ "reference_article": "Google integrates AI tool Checks into its privacy-focused products",
19
+ "reference_text": "After Google cut all but three of the projects at its in-house incubator Area 120 and shifted it to work on AI projects across Google, one of the legacy efforts β€” coincidentally also an AI project β€” is now officially exiting to Google. Checks, an AI-powered tool to check mobile apps for compliance with various privacy rules and regulations, is moving into Google proper as a privacy product aimed at mobile developers..."
20
+ },
21
+ {
22
+ "question": "What acquisition has Databricks recently announced, and what is the focus of the acquired company's technology?",
23
+ "answer": "Databricks has recently acquired Okera, a data governance platform with a focus on AI. Okera's technology uses an AI-powered system to automatically discover, classify, and apply rules to personally identifiable information, with a particular emphasis on metadata. Additionally, Okera's isolation technology enforces governance control on arbitrary workloads without significant overhead. Databricks plans to integrate Okera's technology into its Unity Catalog, enhancing its existing governance solution for data and AI assets.",
24
+ "reference_article": "Databricks acquires AI-focused data governance platform Okera",
25
+ "reference_text": "Databricks today announced that it has acquired Okera, a data governance platform with a focus on AI. The two companies did not disclose the purchase price. According to Crunchbase, Okera previously raised just under $30 million. Investors include Felicis, Bessemer Venture Partners, Cyber Mentor Fund, ClearSky and Emergent Ventures."
26
+ },
27
+ {
28
+ "question": "What is the latest evolution in Slack's platform, particularly concerning AI, as announced at the Salesforce World Tour event?",
29
+ "answer": "Slack has advanced from a pure communications platform to one facilitating direct integration with enterprise applications. At the Salesforce World Tour event in NYC, the company unveiled plans to place AI at the forefront of the user experience, aiming to enhance information retrieval and workflow creation. Notably, these features are still in development. The incorporation of AI into Slack involves various integrations, including SlackGPT, the company's generative AI built on the Slack platform. SlackGPT leverages the wealth of institutional knowledge within Slack's messages, files, and shared content to enable users and developers to build AI-driven experiences. The goal is to bring AI natively into the user experience with features like AI-powered conversation summaries and writing assistance directly available in Slack. Additionally, developers can integrate AI into workflows, tapping into external apps and large language models. EinsteinGPT, Salesforce's generative AI, will also be integrated into Slack, allowing employees to ask questions directly related to Salesforce content, enhancing teams' understanding of customer data. While these capabilities are still in development, Slack aims to provide users with flexibility and choice in incorporating AI into their work. SlackGPT and EinsteinGPT integration are in the development phase, but developers can already build custom integrations with various large language models (LLMs). Workflow Builder with SlackGPT AI connectors will be available this summer, allowing customers to connect ChatGPT or Claude to workflows or build custom connectors for their own LLMs.",
30
+ "reference_article": "Slack integrates AI into its platform, unveiling plans for AI-driven experiences",
31
+ "reference_text": "Slack has evolved from a pure communications platform to one that enables companies to link directly to enterprise applications without having to resort to dreaded task switching. Today, at the Salesforce World Tour event in NYC, the company announced the next step in its platform’s evolution where it will be putting AI at the forefront of the user experience, making it easier to get information and build workflows...\n"
32
+ },
33
+ {
34
+ "question": "What are the two new products announced by Nova, the startup building generative AI tools to protect brand integrity?",
35
+ "answer": "Nova has announced two new products: BrandGuard and BrandGPT. BrandGuard ingests a company's brand guidelines and style guide, using a series of models to check content against those rules for compliance, quality, adherence to style, and alignment with campaign goals. BrandGPT serves as an interface for asking questions about a brand's content rules in a ChatGPT-style interaction. These tools are designed to help brands safeguard their brand integrity when incorporating generative AI into their creative workflows.",
36
+ "reference_article": "Nova introduces BrandGuard and BrandGPT to protect brand integrity in AI-generated content",
37
+ "reference_text": "Nova is an early-stage startup building a suite of generative AI tools designed to protect brand integrity, and today, the company is announcing two new products to help brands police AI-generated content: BrandGuard and BrandGPT."
38
+ },
39
+ {
40
+ "question": "What is the startup Spawning AI doing to address the legal issues between artists and companies training AI on their artwork?",
41
+ "answer": "Spawning AI, co-founded by Jordan Meyer and Mathew Dryhurst, has created HaveIBeenTrained, a website that allows creators to opt out of the training dataset for one art-generating AI model called Stable Diffusion v3. Spawning raised $3 million in a seed round led by True Ventures to further develop IP standards for the AI era, establish more robust opt-out and opt-in standards, and build the consent layer for AI. The company aims to make it easier for AI model trainers to honor opt-out requests, offer more services to organizations protecting artists' work, and grow to address different domains in the AI economy.",
42
+ "reference_article": "Spawning raises $3M to help artists opt out of AI training data",
43
+ "reference_text": "In an effort to grant artists more control over how β€” and where β€” their art’s used, Jordan Meyer and Mathew Dryhurst co-founded the startup Spawning AI. Spawning created HaveIBeenTrained, a website that allows creators to opt out of the training dataset for one art-generating AI model, Stable Diffusion v3, due to be released in the coming months."
44
+ },
45
+ {
46
+ "question": "What is the U.K.'s Competition and Markets Authority (CMA) reviewing regarding AI?",
47
+ "answer": "The CMA is conducting an initial review of 'AI foundational models,' which include large language models (LLMs) like OpenAI's ChatGPT and Microsoft's New Bing. The review aims to explore competition and consumer protection considerations in the development and use of AI foundational models. The CMA will examine how competitive markets for these models could evolve, explore opportunities and risks for competition and consumer protection, and produce guiding principles to support competition and protect consumers as AI foundation models develop. The review is in line with the U.K. government's instructions to regulators to analyze potential enforcements related to dangerous, unfair, and unaccountable applications of AI.",
48
+ "reference_article": "U.K. competition watchdog launches review of AI foundational models",
49
+ "reference_text": "The U.K.’s competition watchdog has announced an initial review of β€œAI foundational models”, such as the large language models (LLMs) which underpin OpenAI’s ChatGPT and Microsoft’s New Bing. Generative AI models which power AI art platforms such as OpenAI’s DALL-E or Midjourney will also likely fall in scope."
50
+ },
51
+ {
52
+ "question": "What is StarCoder and who developed it?",
53
+ "answer": "StarCoder is a free alternative to code-generating AI systems, similar to GitHub's Copilot, developed by AI startup Hugging Face and ServiceNow Research, ServiceNow’s R&D division. It is part of Hugging Face's and ServiceNow’s BigCode project, which involves over 600 contributors. StarCoder is licensed for royalty-free use and was trained on over 80 programming languages using text from GitHub repositories. It integrates with Microsoft's Visual Studio Code code editor and claims to match or outperform the AI model from OpenAI used in the initial versions of Copilot.",
54
+ "reference_article": "Hugging Face and ServiceNow Research release StarCoder, a free alternative to GitHub Copilot",
55
+ "reference_text": "AI startup Hugging Face and ServiceNow Research, ServiceNow’s R&D division, have released StarCoder, a free alternative to code-generating AI systems along the lines of GitHub’s Copilot."
56
+ },
57
+ {
58
+ "question": "What are the new features coming to Bing, and how does Microsoft plan to enhance its search experience?",
59
+ "answer": "Microsoft is introducing new features to enhance Bing's search experience, focusing on AI and visual elements. Bing Chat, powered by OpenAI's GPT-4 and DALL-E 2 models, will offer more image- and graphic-centric answers. The chatbot will become more visual and personalized, allowing users to export their chat histories and integrate content from third-party plugins. Bing Chat will answer questions within the context of images. Bing will also improve transparency by providing citations for fact-based responses. The Bing Image Creator tool will understand more languages, and Bing Chat will gain the ability to create charts and graphs. Microsoft aims to make Bing more multimodal, allowing users to upload images for related searches. New chat features include chat history storage, export and share functionalities, and the addition of plugins from partners like OpenTable and Wolfram Alpha. Edge, Microsoft's browser, will also receive updates, featuring rounded corners, improved design elements, and actions that translate Bing Chat prompts into automations within the browser.",
60
+ "reference_article": "Microsoft doubles down on AI with new Bing features",
61
+ "reference_text": "Microsoft is embarking on the next phase of Bing’s expansion. And β€” no surprise β€” it heavily revolves around AI."
62
+ }
63
+ ]
main.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import autogen
4
+ from src.mapper.e5map import E5Mapper
5
+ from src.mapper.scimap import scimap
6
+ from src.mapper.parser import MapperParser
7
+ from src.datatonic.dataloader import DataLoader
8
+ from src.teams.agentteam import codingteam, covid19team, financeteam, debateteam, homeworkteam, consultingteam
9
+
10
+ title = """# Welcome to πŸ‘©πŸ»β€πŸ”¬πŸ§ͺSciTonic
11
+ this is a highly adaptive technical operator that will listen to your query and load datasets and multi-agent teams based on those. Simply describe your problem in detail, ask a question and provide a reasoning method to get started:
12
+ """
13
+
14
def update_config_file(api_key):
    """Persist *api_key* into every entry of the OAI config list on disk.

    Reads ./config/OAI_CONFIG_LIST.json (a list of model-config dicts),
    overwrites each entry's "api_key" field, and writes the file back with
    4-space indentation.

    Parameters:
        api_key: the OpenAI API key supplied by the user through the UI.
    """
    # Local import: `json` is used here but was never imported at module
    # level in this file, which made the original raise NameError.
    import json

    config_path = "./config/OAI_CONFIG_LIST.json"
    with open(config_path, "r") as file:
        config = json.load(file)

    for item in config:
        item["api_key"] = api_key

    with open(config_path, "w") as file:
        json.dump(config, file, indent=4)
24
+
25
+
26
def process_audio_image_input(input_type, input_data, MODEL_ID):
    """Run audio transcription or image captioning through a Clarifai model.

    Parameters:
        input_type: "audio" or "image"; anything else raises ValueError.
        input_data: raw payload bytes. The audio branch forwards the payload
            as-is while the image branch base64-encodes it first —
            NOTE(review): confirm the audio payload does not also need
            base64 encoding like the image branch.
        MODEL_ID: Clarifai model identifier to invoke.

    Returns:
        The raw text output of the first model response.

    Raises:
        ValueError: if CLARIFAI_PAT is unset or input_type is unknown.
        Exception: if the Clarifai call reports a non-success status.

    NOTE(review): ClarifaiChannel, service_pb2_grpc, service_pb2,
    resources_pb2 and status_code_pb2 are never imported in this module
    (they come from `clarifai_grpc`); until those imports are added at
    module level this function raises NameError.
    """
    # Local import: base64 is stdlib but not imported at module level here.
    import base64

    PAT = os.getenv("CLARIFAI_PAT")
    if not PAT:
        raise ValueError("Clarifai Personal Access Token not set in environment variables")

    if input_type == "audio":
        file_bytes = input_data
    elif input_type == "image":
        file_bytes = base64.b64encode(input_data).decode("utf-8")
    else:
        # The original fell through here with `file_bytes` unbound, producing
        # a confusing UnboundLocalError; fail fast with a clear message.
        raise ValueError(f"Unsupported input_type: {input_type!r}")

    channel = ClarifaiChannel.get_grpc_channel()
    stub = service_pb2_grpc.V2Stub(channel)
    metadata = (("authorization", "Key " + PAT),)

    post_model_outputs_response = stub.PostModelOutputs(
        service_pb2.PostModelOutputsRequest(
            model_id=MODEL_ID,
            inputs=[
                resources_pb2.Input(
                    data=resources_pb2.Data(
                        audio=resources_pb2.Audio(base64=file_bytes) if input_type == "audio" else None,
                        image=resources_pb2.Image(base64=file_bytes) if input_type == "image" else None,
                    )
                )
            ],
        ),
        metadata=metadata,
    )

    if post_model_outputs_response.status.code != status_code_pb2.SUCCESS:
        print(post_model_outputs_response.status)
        raise Exception(
            "Post model outputs failed, status: "
            + post_model_outputs_response.status.description
        )

    output = post_model_outputs_response.outputs[0]
    return output.data.text.raw
64
+
65
+
66
def process_query(oai_key, query, max_auto_reply):
    """Map a user query to a task/team, load matching data, and run the team.

    Parameters:
        oai_key: OpenAI API key; written into the config file and environment.
        query: free-text problem description from the user.
        max_auto_reply: accepted for interface compatibility but currently
            unused — NOTE(review): wire this into the team/agent configs.

    Returns:
        Whatever the selected team function returns, or a "no team found"
        message when the mapped team name is unknown.
    """
    # Local import to fix a NameError: AgentsFactory is defined in
    # src/agentics/agents.py but was never imported by this module.
    from src.agentics.agents import AgentsFactory

    update_config_file(oai_key)
    os.environ['OAI_KEY'] = oai_key
    llm_config = autogen.config_list_from_json(
        env_or_file="./config/OAI_CONFIG_LIST.json",
        filter_dict={"model": {"gpt-4", "gpt-3.5-turbo-16k", "gpt-4-1106-preview"}}
    )

    # Map the query to a task (dataset choice) and a team (agent lineup).
    taskmapper = E5Mapper(oai_key)
    teammapper = scimap(oai_key)
    taskmap_response = taskmapper.get_completion(query)
    teammap_response = teammapper.get_completion(query)
    task = MapperParser.parse_taskmapper_response(taskmap_response)
    team = MapperParser.parse_teammapper_response(teammap_response)

    # Load the dataset for the mapped task and persist it for retrieval.
    data_loader = DataLoader()
    dataset = data_loader.load_and_process(task.lower())
    # NOTE(review): this is a directory, not a .json file — confirm that
    # DataLoader.save_to_json expects a directory path.
    json_file_path = "./src/datatonic"
    data_loader.save_to_json(dataset, json_file_path)

    # Build the retrieval-capable assistant over the saved data.
    agents_factory = AgentsFactory(llm_config, json_file_path)
    # NOTE(review): currently unused by the team functions below — confirm
    # whether the teams are supposed to receive this agent.
    boss_assistant = agents_factory.scitonic()

    # Dispatch to the team implementation mapped from the query.
    team_function = {
        "CodingTeam": codingteam,
        "Covid19Team": covid19team,
        "FinanceTeam": financeteam,
        "DebateTeam": debateteam,
        "HomeworkTeam": homeworkteam,
        "ConsultingTeam": consultingteam,
    }

    team_action = team_function.get(team, lambda: "No appropriate team found for the given input.")
    return team_action()
112
+
113
def main():
    """Build and launch the Gradio UI for the SciTonic demo."""
    with gr.Blocks() as demo:
        gr.Markdown(title)
        with gr.Row():
            txt_oai_key = gr.Textbox(label="OpenAI API Key", type="password")
            txt_pat = gr.Textbox(label="Clarifai PAT", type="password", placeholder="Enter Clarifai PAT here")
        txt_query = gr.Textbox(label="Describe your problem in detail:")
        txt_max_auto_reply = gr.Number(label="Max Auto Replies", value=50)
        # `optional=True` is not a valid argument on Gradio input components
        # (inputs are optional by default) and raised a TypeError.
        audio_input = gr.Audio(label="Or speak your problem here:", type="numpy")
        image_input = gr.Image(label="Or upload an image related to your problem:", type="numpy")
        btn_submit = gr.Button("Submit")
        # `readonly` is not a gr.Textbox parameter; `interactive=False` is
        # the supported way to make an output box non-editable.
        output = gr.Textbox(label="Output", interactive=False)

        def process_and_submit(oai_key, pat, query, max_auto_reply, audio, image):
            # Persist credentials for downstream helpers that read the env.
            os.environ['CLARIFAI_PAT'] = pat
            os.environ['OAI_KEY'] = oai_key

            # Voice or image input, when provided, replaces the typed query.
            if audio is not None:
                query = process_audio_image_input("audio", audio, "asr-wav2vec2-base-960h-english")
            elif image is not None:
                query = process_audio_image_input("image", image, "general-english-image-caption-blip")
            return process_query(oai_key, query, max_auto_reply)

        btn_submit.click(
            process_and_submit,
            inputs=[txt_oai_key, txt_pat, txt_query, txt_max_auto_reply, audio_input, image_input],
            outputs=output
        )

    demo.launch()
143
+
144
# Script entry point: build and launch the Gradio demo when run directly.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ gradio
3
+ datasets
4
+ autogen
5
+ chromadb
6
+ semantic-kernel
7
+ llama-index
8
+ llama-hub
9
+ langchain
10
+ huggingface_hub
11
+ openai
src/add_your_files_here/example.db ADDED
File without changes
src/agentics/Image_agent.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import time
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
6
+
7
+ import matplotlib.pyplot as plt
8
+ import requests
9
+ from PIL import Image
10
+ from termcolor import colored
11
+
12
+ import autogen
13
+ from autogen import Agent, AssistantAgent, ConversableAgent, UserProxyAgent
14
+ from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent
15
+
16
# Config entries for the GPT-4 Vision model, used by the two explainer agents.
config_list_4v = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-4-vision-preview"],
    },
)


# Config entries for text-only GPT-4 variants (for group-chat management).
config_list_gpt4 = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"],
    },
)

# cache_seed pins autogen's response cache so repeated runs reuse results.
gpt4_llm_config = {"config_list": config_list_gpt4, "cache_seed": 42}

# Two multimodal explainer agents with deliberately contrasting styles.
agent1 = MultimodalConversableAgent(
    name="image-explainer-1",
    max_consecutive_auto_reply=10,
    llm_config={"config_list": config_list_4v, "temperature": 0.5, "max_tokens": 300},
    system_message="Your image description is poetic and engaging.",
)
agent2 = MultimodalConversableAgent(
    name="image-explainer-2",
    max_consecutive_auto_reply=10,
    llm_config={"config_list": config_list_4v, "temperature": 0.5, "max_tokens": 300},
    system_message="Your image description is factual and to the point.",
)
45
+
46
def imagechat():
    """Prepare a fresh image-description chat: build the user proxy and reset all agents."""
    # Build the proxy FIRST: the original called _reset_agents() before
    # user_proxy was assigned, so `user_proxy.reset()` raised NameError.
    user_proxy = autogen.UserProxyAgent(
        name="User_proxy",
        system_message="Ask both image explainer 1 and 2 for their description.",
        human_input_mode="TERMINATE",  # Try between ALWAYS, NEVER, and TERMINATE
        max_consecutive_auto_reply=10,
        code_execution_config={
            "use_docker": False
        },
    )

    def _reset_agents():
        # Drop any prior conversation state from both explainers and the proxy.
        agent1.reset()
        agent2.reset()
        user_proxy.reset()

    _reset_agents()

    # # Group Chat setup
    # groupchat = autogen.GroupChat(agents=[agent1, agent2, user_proxy], messages=[], max_round=5)
    # group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=gpt4_llm_config)

    # # Initiate the chat with a message
    # user_proxy.initiate_chat(
    #     group_chat_manager,
    #     message="""Describe the image:
    #     <img path/url of image>."""
    # )
src/agentics/agents.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import autogen
2
+ from autogen import AssistantAgent
3
+ from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
4
+ import chromadb
5
+
6
# Load model configs from OAI_CONFIG_LIST, keeping only the chat models below.
# Fixed: file_location was ".src/config/" (a hidden ".src" directory) instead
# of the intended "./src/config/" where OAI_CONFIG_LIST.json actually lives.
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    file_location="./src/config/",
    filter_dict={
        "model": ["gpt-3.5-turbo", "gpt-35-turbo", "gpt-35-turbo-0613", "gpt-4", "gpt4", "gpt-4-32k"],
    },
)

print("LLM models: ", [entry["model"] for entry in config_list])

# Shared LLM settings: deterministic (temperature 0) with a fixed cache seed.
llm_config = {
    "timeout": 60,
    "cache_seed": 42,
    "config_list": config_list,
    "temperature": 0,
}
22
+
23
class AgentsFactory:
    """Builds the user-proxy and assistant agents used by the SciTonic teams.

    Parameters:
        llm_config: autogen-style LLM configuration dict; must contain a
            non-empty "config_list" (used to pick the retrieval model).
        db_path: path to the documents used by the retrieval assistant.
    """

    def __init__(self, llm_config, db_path):
        self.llm_config = llm_config
        self.db_path = db_path

    def termination_msg(self, x):
        """Return True when message dict *x*'s content ends with TERMINATE."""
        return isinstance(x, dict) and "TERMINATE" == str(x.get("content", ""))[-9:].upper()

    def tonic(self):
        """User proxy 'Boss' agent that assigns tasks and never asks for human input."""
        return autogen.UserProxyAgent(
            name="Boss",
            # Fixed: was the bare name `termination_msg` (NameError) — the
            # predicate is a method on this instance.
            is_termination_msg=self.termination_msg,
            human_input_mode="NEVER",
            system_message="The boss who asks questions and gives tasks.",
            code_execution_config=False,
            default_auto_reply="Reply `TERMINATE` if the task is done.",
        )

    # Create the RetrieveUserProxyAgent (Boss Assistant)
    def scitonic(self):
        """Retrieval-augmented 'Boss_Assistant' backed by a Chroma collection."""
        return RetrieveUserProxyAgent(
            name="Boss_Assistant",
            is_termination_msg=self.termination_msg,
            system_message="Assistant who has extra content retrieval power for solving difficult problems.",
            human_input_mode="NEVER",
            max_consecutive_auto_reply=3,
            retrieve_config={
                "task": "QuoraRetrieval",
                # Fixed: the original had a stray second comma after this
                # entry ("self.db_path, ,"), which is a SyntaxError.
                "docs_path": self.db_path,
                "chunk_token_size": 1000,
                # Fixed: use the config handed to this factory rather than
                # the module-level `llm_config`.
                "model": self.llm_config["config_list"][0]["model"],
                "client": chromadb.PersistentClient(path="/tmp/chromadb"),
                "collection_name": "groupchat",
                "get_or_create": True,
            },
            code_execution_config=False,
        )

    # --- assistants used by the team functions ---------------------------

    def _assistant(self, name, system_message):
        """Shared helper: build an AssistantAgent with this factory's LLM config.

        Fixed: every method below previously used the module-level
        `llm_config` instead of the one stored by __init__.
        """
        return AssistantAgent(name=name, system_message=system_message, llm_config=self.llm_config)

    def coder(self):
        return self._assistant("Coder", "You are a coder. Help in writing and reviewing code.")

    def pm(self):
        return self._assistant("Project_Manager", "You are a project manager. Coordinate tasks and ensure project success.")

    def reviewer(self):
        return self._assistant("Reviewer", "You are a code reviewer. Provide feedback on code quality.")

    def finance_expert(self):
        return self._assistant("Finance_Expert", "You are a finance expert. Provide insights on financial matters.")

    def debate_champion(self):
        return self._assistant("Debate_Champion", "You are a debate champion. Contribute to meaningful debates.")

    def academic_whiz(self):
        return self._assistant("Academic_Whiz", "You are an academic whiz. Offer solutions to academic challenges.")

    def consulting_pro(self):
        return self._assistant("Consulting_Pro", "You are a consulting professional. Offer professional advice and solutions.")

    def covid19_scientist(self):
        return self._assistant("Covid19_Scientist", "You are a scientist studying Covid-19 trends. Provide analysis and insights.")

    def healthcare_expert(self):
        return self._assistant("Healthcare_Expert", "You are a healthcare expert focused on managing and mitigating the impact of Covid-19.")

    def finance_analyst(self):
        return self._assistant("Finance_Analyst", "You are a finance analyst. Provide insights on the economic impact of Covid-19.")

    def debate_expert(self):
        return self._assistant("Debate_Expert", "You are an expert in debate strategies and communication. Participate in meaningful debates.")

    def academic_expert(self):
        return self._assistant("Academic_Expert", "You are an academic expert. Provide assistance and insights for academic challenges.")
src/config/OAI_CONFIG_LIST.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "gpt-3.5-turbo-preview",
4
+ "api_key": "your OpenAI Key goes here",
5
+ "base_url": "https://api.openai.com/v1",
6
+ "api_version": "2023-06-01-preview"
7
+ },
8
+ {
9
+ "model": "gpt-4-preview",
10
+ "api_key": "your OpenAI Key goes here",
11
+ "base_url": "https://api.openai.com/v1",
12
+ "api_version": "2023-06-01-preview"
13
+ },
14
+ {
15
+ "model": "gpt-4-vision-preview",
16
+ "api_key": "your OpenAI Key goes here",
17
+ "base_url": "https://api.openai.com/v1",
18
+ "api_version": "2023-06-01-preview"
19
+ },
20
+ {
21
+ "model": "dall-e-3",
22
+ "api_key": "your OpenAI Key goes here",
23
+ "base_url": "https://api.openai.com/v1",
24
+ "api_version": "2023-06-01-preview"
25
+ }//,
26
+ // {
27
+ // "model": "e5",
28
+ // "api_key": "NULL",
29
+ // "base_url": "https://tonic-e5.hf.space/--replicas/7o447/",
30
+ // "api_type": "openai"
31
+ // }
32
+ //,{
33
+ // "model": "gemini-pro-vision",
34
+ // "api_key": "your Google's GenAI Key goes here",
35
+ // "base_url": "https://genai.google.com/v1",
36
+ // "api_type": "google"
37
+ // }
38
+ ]
src/config/QDRANT_CONFIG.yaml ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level: INFO
2
+
3
+ storage:
4
+ # Where to store all the data
5
+ storage_path: ./storage
6
+
7
+ # Where to store snapshots
8
+ snapshots_path: ./snapshots
9
+
10
+ # Where to store temporary files
11
+ # If null, temporary snapshots are stored in: storage/snapshots_temp/
12
+ temp_path: null
13
+
14
+ # If true - point's payload will not be stored in memory.
15
+ # It will be read from the disk every time it is requested.
16
+ # This setting saves RAM by (slightly) increasing the response time.
17
+ # Note: those payload values that are involved in filtering and are indexed - remain in RAM.
18
+ on_disk_payload: true
19
+
20
+ # Maximum number of concurrent updates to shard replicas
21
+ # If `null` - maximum concurrency is used.
22
+ update_concurrency: null
23
+
24
+ # Write-ahead-log related configuration
25
+ wal:
26
+ # Size of a single WAL segment
27
+ wal_capacity_mb: 32
28
+
29
+ # Number of WAL segments to create ahead of actual data requirement
30
+ wal_segments_ahead: 0
31
+
32
+ # Normal node - receives all updates and answers all queries
33
+ node_type: "Normal"
34
+
35
+ # Listener node - receives all updates, but does not answer search/read queries
36
+ # Useful for setting up a dedicated backup node
37
+ # node_type: "Listener"
38
+
39
+ performance:
40
+ # Number of parallel threads used for search operations. If 0 - auto selection.
41
+ max_search_threads: 0
42
+ # Max total number of threads, which can be used for running optimization processes across all collections.
43
+ # Note: Each optimization thread will also use `max_indexing_threads` for index building.
44
+ # So total number of threads used for optimization will be `max_optimization_threads * max_indexing_threads`
45
+ max_optimization_threads: 1
46
+
47
+ # Prevent DDoS of too many concurrent updates in distributed mode.
48
+ # One external update usually triggers multiple internal updates, which breaks internal
49
+ # timings. For example, the health check timing and consensus timing.
50
+ # If null - auto selection.
51
+ update_rate_limit: null
52
+
53
+ optimizers:
54
+ # The minimal fraction of deleted vectors in a segment, required to perform segment optimization
55
+ deleted_threshold: 0.2
56
+
57
+ # The minimal number of vectors in a segment, required to perform segment optimization
58
+ vacuum_min_vector_number: 1000
59
+
60
+ # Target amount of segments optimizer will try to keep.
61
+ # Real amount of segments may vary depending on multiple parameters:
62
+ # - Amount of stored points
63
+ # - Current write RPS
64
+ #
65
+ # It is recommended to select default number of segments as a factor of the number of search threads,
66
+ # so that each segment would be handled evenly by one of the threads.
67
+ # If `default_segment_number = 0`, will be automatically selected by the number of available CPUs
68
+ default_segment_number: 0
69
+
70
+ # Do not create segments larger than this size (in KiloBytes).
71
+ # Large segments might require disproportionately long indexation times,
72
+ # therefore it makes sense to limit the size of segments.
73
+ #
74
+ # If indexing speed has higher priority for you - make this parameter lower.
75
+ # If search speed is more important - make this parameter higher.
76
+ # Note: 1Kb = 1 vector of size 256
77
+ # If not set, will be automatically selected considering the number of available CPUs.
78
+ max_segment_size_kb: null
79
+
80
+ # Maximum size (in KiloBytes) of vectors to store in-memory per segment.
81
+ # Segments larger than this threshold will be stored as read-only memmaped file.
82
+ # To enable memmap storage, lower the threshold
83
+ # Note: 1Kb = 1 vector of size 256
84
+ # To explicitly disable mmap optimization, set to `0`.
85
+ # If not set, will be disabled by default.
86
+ memmap_threshold_kb: null
87
+
88
+ # Maximum size (in KiloBytes) of vectors allowed for plain index.
89
+ # Default value based on https://github.com/google-research/google-research/blob/master/scann/docs/algorithms.md
90
+ # Note: 1Kb = 1 vector of size 256
91
+ # To explicitly disable vector indexing, set to `0`.
92
+ # If not set, the default value will be used.
93
+ indexing_threshold_kb: 20000
94
+
95
+ # Interval between forced flushes.
96
+ flush_interval_sec: 5
97
+
98
+ # Max number of threads, which can be used for optimization per collection.
99
+ # Note: Each optimization thread will also use `max_indexing_threads` for index building.
100
+ # So total number of threads used for optimization will be `max_optimization_threads * max_indexing_threads`
101
+ # If `max_optimization_threads = 0`, optimization will be disabled.
102
+ max_optimization_threads: 1
103
+
104
+ # Default parameters of HNSW Index. Could be overridden for each collection or named vector individually
105
+ hnsw_index:
106
+ # Number of edges per node in the index graph. Larger the value - more accurate the search, more space required.
107
+ m: 16
108
+ # Number of neighbours to consider during the index building. Larger the value - more accurate the search, more time required to build index.
109
+ ef_construct: 100
110
+ # Minimal size (in KiloBytes) of vectors for additional payload-based indexing.
111
+ # If payload chunk is smaller than `full_scan_threshold_kb` additional indexing won't be used -
112
+ # in this case full-scan search should be preferred by query planner and additional indexing is not required.
113
+ # Note: 1Kb = 1 vector of size 256
114
+ full_scan_threshold_kb: 10000
115
+ # Number of parallel threads used for background index building. If 0 - auto selection.
116
+ max_indexing_threads: 0
117
+ # Store HNSW index on disk. If set to false, index will be stored in RAM. Default: false
118
+ on_disk: false
119
+ # Custom M param for hnsw graph built for payload index. If not set, default M will be used.
120
+ payload_m: null
121
+
122
+
123
+ service:
124
+
125
+ # Maximum size of POST data in a single request in megabytes
126
+ max_request_size_mb: 32
127
+
128
+ # Number of parallel workers used for serving the api. If 0 - equal to the number of available cores.
129
+ # If missing - Same as storage.max_search_threads
130
+ max_workers: 0
131
+
132
+ # Host to bind the service on
133
+ host: 0.0.0.0
134
+
135
+ # HTTP(S) port to bind the service on
136
+ http_port: 6333
137
+
138
+ # gRPC port to bind the service on.
139
+ # If `null` - gRPC is disabled. Default: null
140
+ # Comment to disable gRPC:
141
+ grpc_port: 6334
142
+
143
+ # Enable CORS headers in REST API.
144
+ # If enabled, browsers would be allowed to query REST endpoints regardless of query origin.
145
+ # More info: https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS
146
+ # Default: true
147
+ enable_cors: true
148
+
149
+ # Enable HTTPS for the REST and gRPC API
150
+ enable_tls: false
151
+
152
+ # Check user HTTPS client certificate against CA file specified in tls config
153
+ verify_https_client_certificate: false
154
+
155
+ # Set an api-key.
156
+ # If set, all requests must include a header with the api-key.
157
+ # example header: `api-key: <API-KEY>`
158
+ #
159
+ # If you enable this you should also enable TLS.
160
+ # (Either above or via an external service like nginx.)
161
+ # Sending an api-key over an unencrypted channel is insecure.
162
+ #
163
+ # Uncomment to enable.
164
+ # api_key: your_secret_api_key_here
165
+
166
+ # Set an api-key for read-only operations.
167
+ # If set, all requests must include a header with the api-key.
168
+ # example header: `api-key: <API-KEY>`
169
+ #
170
+ # If you enable this you should also enable TLS.
171
+ # (Either above or via an external service like nginx.)
172
+ # Sending an api-key over an unencrypted channel is insecure.
173
+ #
174
+ # Uncomment to enable.
175
+ # read_only_api_key: your_secret_read_only_api_key_here
176
+
177
+ cluster:
178
+ # Use `enabled: true` to run Qdrant in distributed deployment mode
179
+ enabled: false
180
+
181
+ # Configuration of the inter-cluster communication
182
+ p2p:
183
+ # Port for internal communication between peers
184
+ port: 6335
185
+
186
+ # Use TLS for communication between peers
187
+ enable_tls: false
188
+
189
+ # Configuration related to distributed consensus algorithm
190
+ consensus:
191
+ # How frequently peers should ping each other.
192
+ # Setting this parameter to lower value will allow consensus
193
+ # to detect disconnected nodes earlier, but too frequent
194
+ # tick period may create significant network and CPU overhead.
195
+ # We encourage you NOT to change this parameter unless you know what you are doing.
196
+ tick_period_ms: 100
197
+
198
+
199
+ # Set to true to prevent service from sending usage statistics to the developers.
200
+ # Read more: https://qdrant.tech/documentation/guides/telemetry
201
+ telemetry_disabled: false
202
+
203
+
204
+ # TLS configuration.
205
+ # Required if either service.enable_tls or cluster.p2p.enable_tls is true.
206
+ tls:
207
+ # Server certificate chain file
208
+ cert: ./tls/cert.pem
209
+
210
+ # Server private key file
211
+ key: ./tls/key.pem
212
+
213
+ # Certificate authority certificate file.
214
+ # This certificate will be used to validate the certificates
215
+ # presented by other nodes during inter-cluster communication.
216
+ #
217
+ # If verify_https_client_certificate is true, it will verify
218
+ # HTTPS client certificate
219
+ #
220
+ # Required if cluster.p2p.enable_tls is true.
221
+ ca_cert: ./tls/cacert.pem
222
+
223
+ # TTL in seconds to reload certificate from disk, useful for certificate rotations.
224
+ # Only works for HTTPS endpoints. Does not support gRPC (and intra-cluster communication).
225
+ # If `null` - TTL is disabled.
226
+ cert_ttl: 3600
src/datatonic/dataloader.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/datatonic/dataloader.py
2
+
3
+ from datasets import load_dataset
4
+ import json
5
+
6
+ class DataLoader:
7
+ def __init__(self):
8
+ self.datasets = {
9
+ "gpl-fiqa": self.load_gpl_fiqa,
10
+ "msmarco": self.load_msmarco,
11
+ "nfcorpus": self.load_nfcorpus,
12
+ "covid19": self.load_covid19,
13
+ "gpl-webis-touche2020": self.load_gpl_webis_touche2020,
14
+ "gpl-hotpotqa": self.load_gpl_hotpotqa,
15
+ "gpl-nq": self.load_gpl_nq,
16
+ "gpl-fever": self.load_gpl_fever,
17
+ "gpl-scidocs": self.load_gpl_scidocs,
18
+ "gpl-scifact": self.load_gpl_scifact,
19
+ "gpl-cqadupstack": self.load_gpl_cqadupstack,
20
+ "gpl-arguana": self.load_gpl_arguana,
21
+ "gpl-climate-fever": self.load_gpl_climate_fever,
22
+ "gpl-dbpedia-entity": self.load_gpl_dbpedia_entity,
23
+ "gpl-all-mix-450k": self.load_gpl_all_mix_450k,
24
+ }
25
+
26
+ def load_dataset_generic(self, dataset_name):
27
+ dataset = load_dataset(dataset_name)
28
+ return self.process_dataset(dataset)
29
+
30
+ def load_gpl_fiqa(self):
31
+ return self.load_dataset_generic("nthakur/gpl-fiqa")
32
+
33
+ def load_msmarco(self):
34
+ return self.load_dataset_generic("nthakur/msmarco-passage-sampled-100k")
35
+
36
+ def load_nfcorpus(self):
37
+ return self.load_dataset_generic("nthakur/gpl-nfcorpus")
38
+
39
+ def load_covid19(self):
40
+ return self.load_dataset_generic("nthakur/gpl-trec-covid")
41
+
42
+ def load_gpl_webis_touche2020(self):
43
+ return self.load_dataset_generic("nthakur/gpl-webis-touche2020")
44
+
45
+ def load_gpl_hotpotqa(self):
46
+ return self.load_dataset_generic("nthakur/gpl-hotpotqa")
47
+
48
+ def load_gpl_nq(self):
49
+ return self.load_dataset_generic("nthakur/gpl-nq")
50
+
51
+ def load_gpl_fever(self):
52
+ return self.load_dataset_generic("nthakur/gpl-fever")
53
+
54
+ def load_gpl_scidocs(self):
55
+ return self.load_dataset_generic("nthakur/gpl-scidocs")
56
+
57
+ def load_gpl_scifact(self):
58
+ return self.load_dataset_generic("nthakur/gpl-scifact")
59
+
60
+ def load_gpl_cqadupstack(self):
61
+ return self.load_dataset_generic("nthakur/gpl-cqadupstack")
62
+
63
+ def load_gpl_arguana(self):
64
+ return self.load_dataset_generic("nthakur/gpl-arguana")
65
+
66
+ def load_gpl_climate_fever(self):
67
+ return self.load_dataset_generic("nthakur/gpl-climate-fever")
68
+
69
+ def load_gpl_dbpedia_entity(self):
70
+ return self.load_dataset_generic("nthakur/gpl-dbpedia-entity")
71
+
72
+ def load_gpl_all_mix_450k(self):
73
+ return self.load_dataset_generic("nthakur/gpl-all-mix-450k")
74
+
75
+ def process_dataset(self, dataset):
76
+ # Process the dataset to fit the required JSON structure
77
+ processed_data = []
78
+ for entry in dataset['train']:
79
+ # Adjust the processing based on the actual structure of each dataset
80
+ processed_entry = {
81
+ "query": entry.get("query", ""),
82
+ "positive_passages": entry.get("positive_passages", []),
83
+ "negative_passages": entry.get("negative_passages", [])
84
+ }
85
+ processed_data.append(processed_entry)
86
+ return processed_data
87
+
88
+ def load_and_process(self, dataset_name):
89
+ if dataset_name in self.datasets:
90
+ return self.datasets[dataset_name]()
91
+ else:
92
+ raise ValueError(f"Dataset {dataset_name} not supported.")
93
+
94
+ def save_to_json(self, data, file_name):
95
+ with open(file_name, 'w') as f:
96
+ json.dump(data, f, indent=4)
src/documentation/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CODE_OF_CONDUCT.md
2
+
3
+ ## Our Pledge
4
+ In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a futuristic, harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
5
+
6
+ ## Our Standards
7
+ Here are some behaviors that contribute to creating a positive environment:
8
+
9
+ - **Be Friendly and Patient**: We understand that everyone has different experiences and backgrounds, so be kind and patient with others.
10
+ - **Be Welcoming**: We strive to be a community that welcomes and supports people of all backgrounds and identities.
11
+ - **Be Considerate**: Your work will be used by other people, and you in turn will depend on the work of others. Any decision you take will affect users and colleagues, and you should take those consequences into account when making decisions.
12
+ - **Be Respectful**: Not all of us will agree all the time, but disagreement is no excuse for poor behavior and poor manners. We might all experience some frustration now and then, but we cannot allow that frustration to turn into a personal attack.
13
+ - **Be Creative**: Think outside the box. We're here to solve problems, set trends, and lead the way in the industry. Fresh ideas and innovative thinking are the currencies of our realm.
14
+ - **Focus on What's Best for the Community**: We're building this project for the future, so always keep the long-term health of the community in mind.
15
+
16
+ ## Our Responsibilities
17
+ Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
18
+
19
+ Maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
20
+
21
+ ## Scope
22
+ This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event.
23
+
24
+ ## Conflict Resolution
25
+ We believe peer to peer discussions, feedback, and corrections can help build a stronger, safer, and more welcoming community. If you see someone who is making an extra effort to ensure our community is welcoming, friendly, and encourages all participants to contribute to the fullest extent, a friendly nod in their direction is always appreciated.
26
+
27
+ If you have a dispute with one or more users, we encourage you to first try to resolve it with the people involved. If you are unable to resolve the conflict, and you feel it breaches the Code of Conduct, open an issue or contact one of the project maintainers directly.
28
+
29
+ ## Reporting Issues
30
+ If you experience or witness unacceptable behavior—or have any other concerns—please report it by contacting the project team at [[email protected]]. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident.
31
+
32
+ ## Enforcement
33
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [[email protected]]. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
34
+
35
+ Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
36
+
37
+ ## Attribution
38
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
src/documentation/CONTRIBUTING.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CONTRIBUTING GUIDE
2
+
3
+ ## Introduction
4
+ Welcome to the `multitonic` repository! This guide is designed to provide a streamlined process for contributing to our project. We value your input and are excited to collaborate with you.
5
+
6
+ ## Prerequisites
7
+ Before contributing, make sure you have a GitHub account. You should also join our Tonic-AI Discord to communicate with other contributors and the core team.
8
+
9
+ ## How to Contribute
10
+
11
+ ### Reporting Issues
12
+ - **Create an Issue**: If you find a bug or have a feature request, please create an issue to report it. Use clear and descriptive titles and provide as much information as possible.
13
+ - **Use the Issue Template**: Follow the issue template provided to ensure all relevant information is included.
14
+ - **Discuss in Discord**: For immediate feedback or discussion, bring up your issue in the `#multitonic-discussion` channel on Discord.
15
+
16
+ ### Making Changes
17
+ - **Fork the Repository**: Start by forking the repository to your own GitHub account.
18
+ - **Create a Branch**: Create a branch in your forked repository for your proposed changes. Name the branch something relevant to the changes you're making (e.g., `feature-add-login` or `bugfix-header-alignment`).
19
+ ```bash
20
+ git checkout -b your-branch-name
21
+ ```
22
+ - **Make Your Changes**: Perform the necessary changes to the codebase or documentation.
23
+ - **Commit Your Changes**: Use meaningful commit messages that describe what you've done.
24
+
25
+ ```bash
26
+ git commit -m "Your detailed commit message"
27
+ ```
28
+
29
+ - **Push to Your Fork**: Push your changes to your forked repository on GitHub.
30
+
31
+ ```bash
32
+ git push origin your-branch-name
33
+ ```
34
+
35
+ ### Submitting a Pull Request
36
+ - **Pull Request (PR)**: Go to the original `multitonic` repository and click on "Pull Request" to start the process.
37
+ - **PR Template**: Fill in the PR template with all the necessary details, linking the issue you're addressing.
38
+ - **Code Review**: Wait for the core team or community to review your PR. Be responsive to feedback.
39
+ - **Merge**: Once your PR has been approved and passes all checks, it will be merged into the main codebase.
40
+
41
+ ## Code of Conduct
42
+ Please adhere to the Code of Conduct laid out in the `CODE_OF_CONDUCT.md` [file](src/documentation/CODE_OF_CONDUCT.md) file. Respectful collaboration is key to a healthy open-source environment.
43
+
44
+ ## Questions or Additional Help
45
+ If you need further assistance or have any questions, please don't hesitate to ask in our Discord community or directly in GitHub issues.
46
+
47
+ Thank you for contributing to `multitonic`!
src/documentation/INSTALL.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Getting Started 🚦
3
+
4
+
5
+ ### Installation πŸ“₯
6
+ ```bash
7
+ # Clone the repository
8
+ git clone https://github.com/Tonic-AI/scitonic.git
9
+
10
+ # Navigate to the repository
11
+ cd scitonic
12
+
13
+ # Install dependencies
14
+ pip install -r requirements.txt
15
+ ```
16
+
17
+ ## Usage πŸ–₯️
18
+ - **Set Up Your Database**: Follow our step-by-step guide to initialize your databases.
19
+ - **Import Data**: Utilize our tools to import and cleanse your data.
20
+ - **Analyze and Query**: Access a range of pre-built queries or create your own for tailored insights.
21
+ - **Visualize Results**: Use our multimedia tools to visualize and present your findings.
src/documentation/PROJECT.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🌟 Sci-Tonic: Your Ultimate Technical Research Assistant πŸš€
2
+
3
+ Welcome to **Sci-Tonic** πŸŽ‰, the groundbreaking technical research assistant designed for professionals, researchers, and enthusiasts alike! If you're looking to dive deep into the world of data, ranging from financial figures πŸ“ˆ to scientific articles 🧬, and transform them into insightful, long-form multimedia outputs πŸ“ŠπŸ“š, you've just found your new best friend! πŸ€–πŸ‘©β€πŸ”¬πŸ‘¨β€πŸ’Ό
4
+
5
+ ## Features 🌈
6
+
7
+ Sci-Tonic is packed with amazing features:
8
+
9
+ - **Data Retrieval**: Effortlessly fetch data from a vast array of sources. Financial reports, scientific papers, complex texts - you name it, Sci-Tonic retrieves it! πŸŒπŸ”
10
+ - **Advanced Analysis**: Using cutting-edge AI, Sci-Tonic analyzes and interprets your data, providing you with deep insights. πŸ§ πŸ’‘
11
+ - **Multimedia Output**: Get your results the way you want them. Text, infographics, video summaries - Sci-Tonic does it all! πŸ“πŸŽ₯πŸ“Š
12
+ - **User-Friendly Interface**: Whether you're a tech guru or a newbie, our intuitive interface makes your research journey smooth and enjoyable. πŸ–₯️😊
13
+ - **Collaboration Tools**: Teamwork makes the dream work! Collaborate seamlessly with colleagues or classmates. πŸ‘₯🀝
src/mapper/e5map.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+
3
+ class E5Mapper:
4
+ def __init__(self, api_key):
5
+ self.client = openai.OpenAI(api_key=api_key)
6
+
7
+ def get_completion(self, user_input, temperature=1, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0):
8
+ messages = [
9
+ {
10
+ "role": "system",
11
+ "content": "You are a subject matter technical expert. You select ONLY ONE from the list provided. ALWAYS respond in complete JSON. Always respond with the best possible task selected with YES or NO. ONLY\nselect ONE TASK:\n \"task\": {\n \"ArguAna\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a claim, find documents that refute the claim\"\n },\n \"ClimateFEVER\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a claim about climate change, retrieve documents that support or refute the claim\"\n },\n \"DBPedia\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a query, retrieve relevant entity descriptions from DBPedia\"\n },\n \"FEVER\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a claim, retrieve documents that support or refute the claim\"\n },\n \"FiQA2018\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a financial question, retrieve user replies that best answer the question\"\n },\n \"HotpotQA\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a multi-hop question, retrieve documents that can help answer the question\"\n },\n \"MSMARCO\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a web search query, retrieve relevant passages that answer the query\"\n },\n \"NFCorpus\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a question, retrieve relevant documents that best answer the question\"\n },\n \"NQ\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a question, retrieve Wikipedia passages that answer the question\"\n },\n \"QuoraRetrieval\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a question, retrieve 
questions that are semantically equivalent to the given question\"\n },\n \"SCIDOCS\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a scientific paper title, retrieve paper abstracts that are cited by the given paper\"\n },\n \"SciFact\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a scientific claim, retrieve documents that support or refute the claim\"\n },\n \"Touche2020\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a question, retrieve detailed and persuasive arguments that answer the question\"\n },\n \"TRECCOVID\": {\n \"type\": \"boolean\",\n \"description\": \"select this task if it requires that given a query on COVID-19, retrieve documents that answer the query\"\n },\n },\n \"required\": [\"ArguAna\", \"ClimateFEVER\" , \"DBPedia\", \"FEVER\" , \"FiQA2018\" , \"HotpotQA\" , \"MSMARCO\" , \"NFCorpus\", \"NQ\", \"QuoraRetrieval\", \"SCIDOCS\", \"SciFact\", \"Touche2020\" , \"TRECCOVID\"]\n }\n }\n }\n]"
12
+ },
13
+ {
14
+ "role": "user",
15
+ "content": user_input
16
+ },
17
+ {
18
+ "role": "assistant",
19
+ "content": "This tool is a function called \"Choose the most appropriate specialty.\" It is used to select a specific task based on a given set of options. The function requires parameters such as \"ArguAna\", \"ClimateFEVER\", \"DBPedia\", \"FEVER\", \"FiQA2018\", \"HotpotQA\", \"MSMARCO\", \"NFCorpus\", \"NQ\", \"QuoraRetrieval\", \"SCIDOCS\", \"SciFact\", \"Touche2020\", and \"TRECCOVID\", each with a boolean type and description. The required response is either \"YES\" or \"NO\" for each task. The function is designed for subject matter technical experts to select the best possible task from the provided list."
20
+ }
21
+ ]
22
+
23
+ response = self.client.chat.completions.create(
24
+ model="gpt-4-1106-preview",
25
+ messages=messages,
26
+ temperature=temperature,
27
+ max_tokens=max_tokens,
28
+ top_p=top_p,
29
+ frequency_penalty=frequency_penalty,
30
+ presence_penalty=presence_penalty
31
+ )
32
+ return response
33
+
34
+ # ### Example Response :
35
+
36
+ # ```json
37
+ # {
38
+ # "task": {
39
+ # "DBPedia": "YES"
40
+ # }
41
+ # }
42
+ # ```
src/mapper/parser.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class MapperParser:
2
+ @staticmethod
3
+ def parse_taskmapper_response(response):
4
+ """Parses the response from the taskmapper and returns the task name."""
5
+ if not response or 'task' not in response:
6
+ return "No task identified"
7
+ task_info = response['task']
8
+ for task, is_selected in task_info.items():
9
+ if is_selected == "YES":
10
+ return task
11
+ return "No task identified"
12
+
13
+ @staticmethod
14
+ def parse_teammapper_response(response):
15
+ """Parses the response from the teammapper and returns the team name."""
16
+ if not response or 'Team' not in response:
17
+ return "No team identified"
18
+ team_info = response['Team']
19
+ for team, is_selected in team_info.items():
20
+ if is_selected:
21
+ return team
22
+ return "No team identified"
src/mapper/scimap.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+
3
+ class scimap:
4
+ def __init__(self, api_key):
5
+ self.client = openai.OpenAI(api_key=api_key)
6
+
7
+ def get_completion(self, user_input, temperature=1, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0):
8
+ messages = [
9
+ {
10
+ "role": "system",
11
+ "content": "You are a subject matter technical expert. You select ONLY ONE from the list provided. ALWAYS respond in complete JSON. Always respond with the best possible team selected with YES or NO. ONLY\nselect ONE TEAM:\n \"Team\": {\n \"ClimateTeam\": {\n \"type\": \"boolean\",\n \"description\": \"select this team if it the task might relate to climate and environmental science\"\n },\n \"Covid19Team\": {\n \"type\": \"boolean\",\n \"description\": \"select this team if the user requires engaging with covid19topics\"\n },\n \"FinanceTeam\": {\n \"type\": \"boolean\",\n \"description\": \"select this team the user requires analysis and advice about financials and financial literature\"\n },\n \"CodingTeam\": {\n \"type\": \"boolean\",\n \"description\": \"select this team if the taskk requires producing code or technology\"\n },\n \"DebateTeam\": {\n \"type\": \"boolean\",\n \"description\": \"select this team if the user requires debating a topic thoroughly\"\n },\n \"HomeworkTeam\": {\n \"type\": \"boolean\",\n \"description\": \"select this team if the user requires help with homework or coursework\"\n },\n \"ConsultingTeam\": {\n \"type\": \"boolean\",\n \"description\": \"select this team if the user requires business consulting\"\n }\n },\n \"required\": [\"ClimateTeam\", \"Covid19Team\" , \"FinanceTeam\", \"CodingTeam\" , \"DebateTeam\" , \"HomeworkTeam\" , \"ConsultingTeam\"]\n }\n }\n }\n]"
12
+ },
13
+ {
14
+ "role": "user",
15
+ "content": user_input
16
+ },
17
+ {
18
+ "role": "assistant",
19
+ "content": "This tool is a function called \"Choose the most appropriate Team.\" It is used to select a specific Team based on a given set of tasks. The function requires parameters such as \"ClimateTeam\", \"Covid19Team\", \"FinanceTeam\", \"CodingTeam\", \"DebateTeam\", \"HomeworkTeam\", \"ConsultingTeam\", each with a boolean type and description. The required response is either \"YES\" or \"NO\" for each task. The function is designed for subject matter technical experts to select the best possible task from the provided list."
20
+ }
21
+ ]
22
+
23
+ response = self.client.chat.completions.create(
24
+ model="gpt-4-1106-preview",
25
+ messages=messages,
26
+ temperature=temperature,
27
+ max_tokens=max_tokens,
28
+ top_p=top_p,
29
+ frequency_penalty=frequency_penalty,
30
+ presence_penalty=presence_penalty
31
+ )
32
+ return response
33
+
34
+ # ### Example Response :
35
+ # ```json
36
+ # {
37
+ # "Team": {
38
+ # "ClimateTeam": false,
39
+ # "Covid19Team": false,
40
+ # "FinanceTeam": false,
41
+ # "CodingTeam": true,
42
+ # "DebateTeam": false,
43
+ # "HomeworkTeam": false,
44
+ # "ConsultingTeam": false
45
+ # }
46
+ # }
47
+ # ```
src/memory/imvectorstore.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from chromadb import Client, ClientAPI
2
+
3
+ class Chroma:
4
+ """
5
+ Chroma class to instantiate a vector db in memory.
6
+ """
7
+ def __init__(self, default_database: str = "default", first_collection_name: str = "test", top_k: int = 1):
8
+ self.api: ClientAPI = Client()
9
+ self.collection_pointer = self.api.create_collection(first_collection_name)
10
+ self.top_k = top_k
11
+
12
+ def new_collection(self, name: str, **kwargs):
13
+ try:
14
+ self.api.create_collection(name, **kwargs)
15
+ except Exception as e:
16
+ print(e)
17
+
18
+ def add_data_to(self, data):
19
+ try:
20
+ self.collection_pointer.add(
21
+ embeddings=data.get("embeddings"),
22
+ documents=data.get("contents"),
23
+ metadatas=data.get("metadatas"),
24
+ ids=data.get("ids")
25
+ )
26
+ except Exception as e:
27
+ print(e)
28
+
29
+ def switch_collection(self, new_pointer: str):
30
+ try:
31
+ self.collection_pointer = self.api.get_collection(new_pointer)
32
+ except Exception as e:
33
+ print(e)
src/teams/agentteam.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import autogen
2
+ from ..agentics.agents import AgentsFactory
3
+
4
+ # agents_factory = AgentsFactory()
5
+
6
+ # Function to reset agents
7
def _reset_agents():
    """Clear the retrieval boss agent's state before starting a new team chat."""
    boss_aid.reset()
9
+
10
+
11
+ # Define functions for each team
12
def codingteam():
    """Assemble the coding team (coder, PM, reviewer) and run a round-robin
    group chat on the module-level PROBLEM via the retrieval boss agent."""
    _reset_agents()
    members = [scitonic, coder, pm, reviewer]
    team = autogen.GroupChat(
        agents=members,
        messages=[],
        max_round=12,
        speaker_selection_method="round_robin"
    )

    chat_manager = autogen.GroupChatManager(groupchat=team, llm_config=llm_config)
    boss_aid.initiate_chat(chat_manager, problem=PROBLEM, n_results=3)
23
+
24
def covid19team():
    """Assemble the COVID-19 team (scientist, healthcare expert, finance
    analyst) and run a group chat on COVID19_PROBLEM via the boss agent."""
    _reset_agents()
    team = autogen.GroupChat(
        agents=[scitonic, covid19_scientist, healthcare_expert, finance_analyst],
        messages=[],
        max_round=12,
        # Consistency fix: every sibling team selects speakers round-robin;
        # this one omitted it, presumably by accident — TODO confirm intent.
        speaker_selection_method="round_robin"
    )

    manager = autogen.GroupChatManager(groupchat=team, llm_config=llm_config)
    # Bug fix: initiate_chat folds extra kwargs into the chat context, and
    # the retrieval agent reads the task from the `problem` key (as
    # codingteam does) — `covid19_problem=` would never be picked up.
    boss_aid.initiate_chat(manager, problem=COVID19_PROBLEM, n_results=3)
34
+
35
def financeteam():
    """Assemble the finance team (analyst, PM, reviewer, finance expert)
    and run a round-robin group chat on FINANCE_PROBLEM via the boss agent."""
    _reset_agents()
    team = autogen.GroupChat(
        agents=[scitonic, finance_analyst, pm, reviewer, finance_expert],
        messages=[],
        max_round=12,
        speaker_selection_method="round_robin"
    )

    manager = autogen.GroupChatManager(groupchat=team, llm_config=llm_config)
    # Bug fix: the retrieval agent reads the task from the `problem` context
    # key (as codingteam does) — `finance_problem=` would never be picked up.
    boss_aid.initiate_chat(manager, problem=FINANCE_PROBLEM, n_results=3)
46
+
47
def debateteam():
    """Assemble the debate team (debate expert, PM, reviewer, debate
    champion) and run a round-robin chat on DEBATE_PROBLEM via the boss agent."""
    _reset_agents()
    team = autogen.GroupChat(
        agents=[scitonic, debate_expert, pm, reviewer, debate_champion],
        messages=[],
        max_round=12,
        speaker_selection_method="round_robin"
    )

    manager = autogen.GroupChatManager(groupchat=team, llm_config=llm_config)
    # Bug fix: the retrieval agent reads the task from the `problem` context
    # key (as codingteam does) — `debate_problem=` would never be picked up.
    boss_aid.initiate_chat(manager, problem=DEBATE_PROBLEM, n_results=3)
58
+
59
def homeworkteam():
    """Assemble the homework team (academic expert, PM, reviewer, academic
    whiz) and run a round-robin chat on HOMEWORK_PROBLEM via the boss agent."""
    _reset_agents()
    team = autogen.GroupChat(
        agents=[scitonic, academic_expert, pm, reviewer, academic_whiz],
        messages=[],
        max_round=12,
        speaker_selection_method="round_robin"
    )

    manager = autogen.GroupChatManager(groupchat=team, llm_config=llm_config)
    # Bug fix: the retrieval agent reads the task from the `problem` context
    # key (as codingteam does) — `homework_problem=` would never be picked up.
    boss_aid.initiate_chat(manager, problem=HOMEWORK_PROBLEM, n_results=3)
70
+
71
def consultingteam():
    """Assemble the consulting team (consultant, PM, reviewer, consulting
    pro) and run a round-robin chat on CONSULTING_PROBLEM via the boss agent."""
    _reset_agents()
    team = autogen.GroupChat(
        agents=[scitonic, consultant, pm, reviewer, consulting_pro],
        messages=[],
        max_round=12,
        speaker_selection_method="round_robin"
    )

    manager = autogen.GroupChatManager(groupchat=team, llm_config=llm_config)
    # Bug fix: the retrieval agent reads the task from the `problem` context
    # key (as codingteam does) — `consulting_problem=` would never be picked up.
    boss_aid.initiate_chat(manager, problem=CONSULTING_PROBLEM, n_results=3)