KennethTM committed on
Commit
ddf1044
1 Parent(s): 0d59acc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import hnswlib
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer, CrossEncoder
5
+ import os
6
+ from together import Together
7
+ from dotenv import load_dotenv
8
+ from cryptography.fernet import Fernet
9
+ import gzip
10
+ import io
11
+
12
# Turn off parallelism in the Hugging Face `tokenizers` library before any
# model is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

client = Together(api_key=os.environ.get("API_KEY"))


def _require_env(name):
    """Return environment variable *name*, failing fast with a clear message."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Required environment variable {name!r} is not set")
    return value


def _load_corpus(path, cipher):
    """Read, decrypt and parse the corpus file at *path* into a DataFrame.

    The file is a gzip stream whose payload is a Fernet token; the decrypted
    token is a parquet file. The intermediate byte buffers are locals so they
    are released on return instead of living on as module globals.
    """
    with gzip.open(path, "rb") as f:
        encrypted = f.read()
    return pd.read_parquet(io.BytesIO(cipher.decrypt(encrypted)))


# read data
fernet = Fernet(_require_env("KEY2").encode("utf-8"))
corpus = _load_corpus("corpus.gz", fernet)

# Bi-encoder used to embed queries for the nearest-neighbour lookup.
biencoder = SentenceTransformer("intfloat/multilingual-e5-small", device="cpu")
embedding_size = biencoder.get_sentence_embedding_dimension()

# Cross-encoder used to re-score the retrieved candidates.
crossencoder = CrossEncoder("KennethTM/MiniLM-L6-danish-reranker", device="cpu")

# Pre-built HNSW index over the corpus; its dimension must match the bi-encoder.
index = hnswlib.Index(space="cosine", dim=embedding_size)
index.load_index("corpus.index")
index.set_ef(40)

# Hand-off between search() (writer) and search_summary() (reader).
# NOTE(review): this dict is module-level, so it is presumably shared by every
# visitor served from this process -- confirm whether per-session state
# (e.g. gr.State) is needed.
state = {}

# Display names for the values found in the corpus "source" column.
source_label = {"wiki": "Wikipedia", "lex": "lex.dk", "mfkn": "MFKN", "dce": "DCE"}
39
+
40
def format_markdown(results):
    """Render search hits as a numbered Markdown reference list.

    Args:
        results: Mapping-like table (a pandas DataFrame in this app) whose
            "title", "source", "url" and "text_chunks" columns are iterated
            in lockstep, one entry per hit, in the order given.

    Returns:
        A Markdown string: a "## Referencer:" heading followed by one "###"
        entry per hit with a linked title, the source's display name and the
        quoted text chunk.
    """
    entry_template = '### {idx}. [{title}]({url}) ({source}):\n"{text}"'
    rows = zip(
        results["title"],
        results["source"],
        results["url"],
        results["text_chunks"],
    )
    entries = [
        entry_template.format(
            idx=idx,
            title=title,
            url=url,
            # Fall back to the raw key so an unmapped source does not abort
            # the whole search with a KeyError.
            source=source_label.get(source, source),
            text=text,
        )
        for idx, (title, source, url, text) in enumerate(rows, start=1)
    ]
    joined = "\n\n".join(entries)
    return f"## Referencer:\n\n{joined}"
47
+
48
def format_context(results):
    """Build the plain-text context block handed to the language model.

    Args:
        results: Mapping-like table exposing a "text_chunks" column.

    Returns:
        The chunks as "Kilde <n>:" sections numbered from 1 and separated by
        blank lines; an empty string when there are no chunks.
    """
    sections = []
    for number, chunk in enumerate(results["text_chunks"], start=1):
        sections.append(f"Kilde {number}:\n{chunk}")
    return "\n\n".join(sections)
53
+
54
def search(query, top_k):
    """Retrieve and re-rank corpus passages for *query*.

    Twice as many candidates as requested are fetched from the HNSW index
    using the bi-encoder embedding, re-scored with the cross-encoder, and the
    best ``top_k`` are kept.

    Side effects:
        Stores the formatted context and the query in the module-level
        ``state`` dict for ``search_summary`` to read.
        NOTE(review): ``state`` is a plain module global -- confirm it is not
        shared between concurrent users.

    Args:
        query: The question text.
        top_k: Number of references to return; converted with ``int()``.

    Returns:
        The selected references rendered as Markdown.
    """
    # Stage 1: approximate nearest-neighbour lookup on the query embedding.
    query_vector = biencoder.encode(query, prompt="query: ")
    n_results = int(top_k)
    neighbour_ids, _ = index.knn_query(query_vector, k=n_results * 2)
    # A single query was passed, so the labels of interest are in row 0.
    candidates = corpus.iloc[neighbour_ids[0]].copy()

    # Stage 2: score every (query, passage) pair and keep the best ones.
    pairs = [(query, chunk) for chunk in candidates["text_chunks"]]
    candidates["scores"] = crossencoder.predict(pairs)
    ranked = candidates.sort_values("scores", ascending=False)
    top_hits = ranked[:n_results]

    results_markdown = format_markdown(top_hits)
    results_context = format_context(top_hits)

    state["context"] = results_context
    state["query"] = query

    return results_markdown
74
+
75
def search_summary():
    """Stream a model-written answer for the most recent search.

    Reads the "context" and "query" entries that ``search`` stored in the
    module-level ``state`` dict, sends them to the chat-completion endpoint
    and yields the answer accumulated so far after every streamed chunk.

    Yields:
        The partial answer text, growing with each chunk.

    Raises:
        KeyError: if ``state`` has no "context" or "query" entry yet.
    """
    retrieved_context = state["context"]
    question = state["query"]

    system_message = {
        "role": "system",
        "content": "Svar på spørgsmålet. Du er ekspert i spørgsmål indenfor natur og miljø. Anvend kilderne i konteksten hvis de kan bruges til besvarelsen. Besvar kun på dansk.",
    }
    user_message = {
        "role": "user",
        "content": f"Kontekst:\n{retrieved_context}\n\nSpørgsmål:\n{question}",
    }

    response_stream = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[system_message, user_message],
        stream=True,
        max_tokens=1024,
    )

    answer_so_far = ""
    for event in response_stream:
        piece = event.choices[0].delta.content
        # A chunk may carry no text; still emit the current answer.
        if piece:
            answer_so_far += piece
        yield answer_so_far
93
+
94
+
95
# Gradio UI: a question box, a reference-count selector and a search button on
# top, the streamed answer in the middle and the Markdown reference list below.
with gr.Blocks() as demo:

    gr.Markdown("# Natur og miljø BOT")
    gr.Markdown("Dette er en simpel spørgsmål-svar applikation indenfor Danmarks natur og miljø. Svar genereres af en sprogmodel (LLAMA-3-8B) og anvender relevante referencer i en stor samling af dokumenter. Dette er blandt andet artikler fra [Wikipedia](https://da.wikipedia.org/wiki/Forside), rapporter fra [DCE - Nationalt Center for Miljø og Energi](https://dce.au.dk/udgivelser), [lex.dk - Den Store Danske](https://denstoredanske.lex.dk/) samt sager fra [Miljø og fødevareklagenævnet](https://mfkn.naevneneshus.dk).")

    with gr.Row():
        textbox = gr.Textbox(placeholder="Søg...", lines=1, scale=8, label="Spørgsmål")
        num = gr.Number(5, label="Referencer", scale=1, minimum=1, maximum=10)
        btn = gr.Button("Søg!", size="sm", scale=2)

    with gr.Row():
        summary = gr.Textbox(interactive=False, lines=10, label="Svar")

    with gr.Row():
        results = gr.Markdown()

    # NOTE(review): the address below reads "[email protected]", which looks
    # like an obfuscation placeholder rather than a real address -- confirm.
    gr.Markdown("*Applikation lavet af Kenneth Thorø Martinsen (email: [email protected])*")

    # Clicking the button and pressing Enter in the textbox run the same
    # pipeline. The summary step is chained with .success() instead of .then()
    # so it only runs when search() completed; otherwise it would answer from
    # whatever an earlier search left in `state`, or fail with KeyError.
    for trigger in (btn.click, textbox.submit):
        trigger(fn=search, inputs=[textbox, num], outputs=results).success(
            search_summary, inputs=None, outputs=summary
        )

demo.queue().launch()