import gzip
import io
import os

import gradio as gr
import hnswlib
import pandas as pd
from cryptography.fernet import Fernet
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, CrossEncoder
from together import Together

# Module-level setup: everything below runs once at import time and builds the
# shared objects (LLM client, corpus, encoders, vector index) used by the
# request handlers further down.

# Turn off parallelism in the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Client for the Together chat-completion API used by search_summary().
client = Together(api_key=os.environ.get("API_KEY"))

# Key for decrypting the corpus. Index the environment directly so that a
# missing KEY2 fails with a KeyError naming the variable, rather than an
# AttributeError on None.
fernet = Fernet(os.environ["KEY2"].encode("utf-8"))

# corpus.gz is read as gzip, then Fernet-decrypted, then parsed as parquet.
with gzip.open("corpus.gz", "rb") as f:
    bytes_enc = f.read()

pq_bytes = fernet.decrypt(bytes_enc)
pq_file = io.BytesIO(pq_bytes)
corpus = pd.read_parquet(pq_file)

# Bi-encoder: embeds the query for the nearest-neighbour lookup in search().
biencoder = SentenceTransformer("intfloat/multilingual-e5-small", device="cpu")
embedding_size = biencoder.get_sentence_embedding_dimension()

# Cross-encoder: re-scores (query, text chunk) pairs in search().
crossencoder = CrossEncoder("KennethTM/MiniLM-L6-danish-reranker", device="cpu")

# Pre-built HNSW index; search() uses the returned ids as row positions in
# `corpus`.
index = hnswlib.Index(space="cosine", dim=embedding_size)
index.load_index("corpus.index")
index.set_ef(40)

# Hand-over between search() (writes "context" and "query") and
# search_summary() (reads them).
# NOTE(review): this dict is module-global, so it is shared by every user of
# the app; concurrent sessions can overwrite each other's entries.
state = {}

# Display names for the corpus "source" keys.
source_label = {"wiki": "Wikipedia", "lex": "lex.dk", "mfkn": "MFKN", "dce": "DCE"}


def format_markdown(results):
    """Render search hits as a numbered Markdown reference list.

    Args:
        results: Mapping-like object (e.g. a DataFrame) with the columns
            "title", "source", "url" and "text_chunks", iterated in lockstep.

    Returns:
        A Markdown string: a "## Referencer:" heading followed by one
        "### n. [title](url) (source):" entry per hit with the quoted text.
    """
    result_template = '### {idx}. [{title}]({url}) ({source}):\n"{text}"'
    rows = zip(results["title"], results["source"], results["url"], results["text_chunks"])

    result_join = "\n\n".join(
        result_template.format(
            idx=idx,
            # Fall back to the raw key so one unmapped source does not abort
            # the whole search with a KeyError.
            source=source_label.get(source, source),
            title=title,
            url=url,
            text=text,
        )
        for idx, (title, source, url, text) in enumerate(rows, start=1)
    )

    return f"## Referencer:\n\n{result_join}"


def format_context(results):
    """Build the numbered source context that is sent to the language model.

    Args:
        results: Mapping-like object (e.g. a DataFrame) with a "text_chunks"
            column.

    Returns:
        The chunks as "Kilde <n>:" sections (numbered from 1) separated by
        blank lines; an empty string when there are no chunks.
    """
    return "\n\n".join(
        f"Kilde {idx}:\n{text}"
        for idx, text in enumerate(results["text_chunks"], start=1)
    )


def search(query, top_k):
    """Retrieve and re-rank corpus chunks for a query.

    Fetches ``2 * top_k`` nearest neighbours from the HNSW index, re-scores
    them with the cross-encoder, and keeps the ``top_k`` highest-scoring rows.
    As a side effect, stores the formatted context and the query in the
    module-level ``state`` dict for ``search_summary`` to read.

    Args:
        query: The user's question.
        top_k: Number of references to return (anything ``int()`` accepts).

    Returns:
        The selected references rendered as Markdown.
    """
    # NOTE(review): the number field presumably yields None when cleared —
    # confirm for the installed Gradio version. 5 mirrors the field's default.
    # Values below 1 are raised to 1 so the index is never asked for k <= 0.
    top_k = 5 if top_k is None else max(int(top_k), 1)

    query_embedding = biencoder.encode(query, prompt="query: ")

    # Over-fetch so the cross-encoder has candidates to choose from.
    biencoder_hits = top_k * 2
    ids, _ = index.knn_query(query_embedding, k=biencoder_hits)

    # Single query, so the ids of interest are the first row of the result.
    results = corpus.iloc[ids[0]].copy()
    results["scores"] = crossencoder.predict(
        [(query, chunk) for chunk in results["text_chunks"]]
    )
    results = results.sort_values("scores", ascending=False).iloc[:top_k]

    results_markdown = format_markdown(results)
    results_context = format_context(results)

    # NOTE(review): `state` is shared by all sessions; concurrent users can
    # overwrite each other's context between this call and search_summary().
    state["context"] = results_context
    state["query"] = query

    return results_markdown


def search_summary():
    """Stream an LLM answer for the most recent search.

    Reads the "context" and "query" entries that ``search`` stored in the
    module-level ``state`` dict and sends them to the Together chat API.

    Yields:
        The answer accumulated so far, once per received stream chunk. A
        single empty string is yielded when no search result is available.
    """
    context = state.get("context")
    query = state.get("query")

    # Nothing to summarise if no search has stored its result yet.
    if context is None or query is None:
        yield ""
        return

    prompt = [
        {"role": "system", "content": "Svar på spørgsmålet. Anvend kilderne i konteksten hvis de kan bruges til besvarelsen. Besvar kun på dansk."},
        {"role": "user", "content": f"Kontekst:\n{context}\n\nSpørgsmål:\n{query}"},
    ]

    stream = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=prompt,
        stream=True,
        max_tokens=1024,
    )

    partial_message = ""
    for chunk in stream:
        # Skip chunks that carry no choices instead of failing on [0].
        if not chunk.choices:
            continue
        # delta.content may be empty/None; treat that as no new text.
        partial_message += chunk.choices[0].delta.content or ""
        yield partial_message


# UI layout: question box + reference count + button, then the streamed
# answer, then the Markdown reference list.
with gr.Blocks() as demo:

    gr.Markdown("# Natur og miljø BOT")
    gr.Markdown("Dette er en simpel spørgsmål-svar applikation indenfor Danmarks natur og miljø. Svar genereres af en sprogmodel (LLAMA-3-8B) og anvender relevante referencer i en stor samling af dokumenter. Dette er blandt andet artikler fra [Wikipedia](https://da.wikipedia.org/wiki/Forside), rapporter fra [DCE - Nationalt Center for Miljø og Energi](https://dce.au.dk/udgivelser), [lex.dk - Den Store Danske](https://denstoredanske.lex.dk/) samt sager fra [Miljø og fødevareklagenævnet](https://mfkn.naevneneshus.dk).")

    with gr.Row():
        textbox = gr.Textbox(placeholder="Søg...", lines=1, scale=8, label="Spørgsmål")
        num = gr.Number(5, label="Referencer", scale=1, minimum=1, maximum=10)
        btn = gr.Button("Søg!", size="sm", scale=2)

    with gr.Row():
        summary = gr.Textbox(interactive=False, lines=10, label="Svar")

    with gr.Row():
        results = gr.Markdown()

    # NOTE(review): the address below looks like e-mail obfuscation residue;
    # restore the real address if it is known.
    gr.Markdown("*Applikation lavet af Kenneth Thorø Martinsen (email: [email protected])*")

    # Button click and Enter in the textbox trigger the same pipeline.
    # .success() (rather than .then()) starts the summary only when search()
    # completed without error, so a failed search never produces an answer
    # from the previous query's state.
    for trigger in (btn.click, textbox.submit):
        trigger(fn=search, inputs=[textbox, num], outputs=results).success(
            search_summary, inputs=None, outputs=summary
        )

demo.queue().launch()