derek-thomas HF staff commited on
Commit
9ae1b66
1 Parent(s): 1a53147

Init commit

Browse files
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import gradio as gr
5
+ from huggingface_hub import WebhookPayload, WebhooksServer
6
+
7
+ from src.utilities import load_datasets, merge_and_update_datasets
8
+ from src.my_logger import setup_logger
9
+ from src.visualize_logs import log_file_to_html_string
10
+
11
# Directory that contains this file. `__file__` is used rather than
# `__name__`: `__name__` is the module name (e.g. "__main__"), so building a
# Path from it never pointed at the project directory.
proj_dir = Path(__file__).parent

logger = setup_logger(__name__)

# Required configuration: a missing variable raises KeyError at import time.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
# NOTE(review): when HF_WEBHOOK_SECRET is unset this falls back to the
# well-known literal 'secret'; confirm the variable is always set in deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

# Markdown shown at the top of the "Application" tab.
intro_md = """
# Processing BORU
This space is triggered by a webhook for changes on
[derek-thomas/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-bestofredditorupdates).
It then takes the updates from that dataset and get embeddings and puts the results in
[https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed)
"""

# UI: the intro text plus an HTML view of the log file, re-rendered every second.
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        output = gr.HTML(log_file_to_html_string, every=1)

# Webhook server that also serves the Gradio UI defined above.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)
36
+
37
+
38
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Rebuild and push the processed dataset when a repo-scoped webhook arrives.

    Events whose scope does not start with "repo" are ignored. Otherwise the
    processed and original datasets are loaded, merged (missing embeddings are
    filled in by `merge_and_update_datasets`), and the result is pushed to
    `PROCESSED_DATASET` on the Hugging Face Hub.

    Args:
        payload: The webhook payload delivered by the Hugging Face Hub.
    """
    # Guard clause: only repo-level events trigger processing.
    if not payload.event.scope.startswith("repo"):
        return

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")

    logger.info("Loading new dataset...")
    # `load_datasets` is a coroutine function; without `await` the tuple
    # unpacking below would fail on the un-awaited coroutine object.
    dataset, original_dataset = await load_datasets()
    logger.info("Loaded new dataset")

    logger.info("Merging and Updating row...")
    # NOTE(review): this call and the push below are synchronous, so they
    # presumably block the event loop while they run - confirm that is acceptable.
    dataset = merge_and_update_datasets(dataset, original_dataset)

    # Push the augmented dataset to the Hugging Face hub
    logger.debug("Pushing processed data to the Hugging Face Hub...")
    dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
    logger.info("Pushed processed data to the Hugging Face Hub")
56
+
57
if __name__ == '__main__':
    # Start the webhook server (which also serves the Gradio UI) on all
    # interfaces, port 7860. To run the UI alone, without webhooks, launch
    # `ui.queue()` with the same arguments instead.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
media/automatic-embeddings-cost.png DELETED
Binary file (68 kB)
 
notebooks/automated_embeddings.ipynb DELETED
@@ -1,749 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "5d9aca72-957a-4ee2-862f-e011b9cd3a62",
6
- "metadata": {},
7
- "source": [
8
- "# Introduction\n",
9
- "## Goal\n",
10
- "I have a dataset I want to embed for semantic search (or QA, or RAG), and I want the easiest way to embed it and put the result in a new dataset.\n",
11
- "\n",
12
- "## Approach\n",
13
- "I'm using a dataset from my favorite subreddit, [r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/). Since it has such long entries, I will use the new [jinaai/jina-embeddings-v2-base-en](https://huggingface.co/jinaai/jina-embeddings-v2-base-en), which has an 8k context length. Since I'm GPU-poor, I will deploy this using an [Inference Endpoint](https://huggingface.co/inference-endpoints) to save money and time. To follow along you will need to add a payment method. To make it even easier, I'll make this fully API based."
14
- ]
15
- },
16
- {
17
- "cell_type": "markdown",
18
- "id": "d2534669-003d-490c-9d7a-32607fa5f404",
19
- "metadata": {},
20
- "source": [
21
- "# Setup"
22
- ]
23
- },
24
- {
25
- "cell_type": "markdown",
26
- "id": "3c830114-dd88-45a9-81b9-78b0e3da7384",
27
- "metadata": {},
28
- "source": [
29
- "## Requirements"
30
- ]
31
- },
32
- {
33
- "cell_type": "code",
34
- "execution_count": null,
35
- "id": "35386f72-32cb-49fa-a108-3aa504e20429",
36
- "metadata": {},
37
- "outputs": [],
38
- "source": [
39
- "!pip install -q -r ../requirements.txt"
40
- ]
41
- },
42
- {
43
- "cell_type": "markdown",
44
- "id": "b6f72042-173d-4a72-ade1-9304b43b528d",
45
- "metadata": {},
46
- "source": [
47
- "## Imports"
48
- ]
49
- },
50
- {
51
- "cell_type": "code",
52
- "execution_count": 1,
53
- "id": "e2beecdd-d033-4736-bd45-6754ec53b4ac",
54
- "metadata": {
55
- "tags": []
56
- },
57
- "outputs": [],
58
- "source": [
59
- "import asyncio\n",
60
- "from getpass import getpass\n",
61
- "import json\n",
62
- "from pathlib import Path\n",
63
- "import time\n",
64
- "\n",
65
- "from aiohttp import ClientSession, ClientTimeout\n",
66
- "from datasets import load_dataset, Dataset, DatasetDict\n",
67
- "from huggingface_hub import notebook_login\n",
68
- "import pandas as pd\n",
69
- "import requests\n",
70
- "from tqdm.auto import tqdm"
71
- ]
72
- },
73
- {
74
- "cell_type": "markdown",
75
- "id": "5eece903-64ce-435d-a2fd-096c0ff650bf",
76
- "metadata": {},
77
- "source": [
78
- "## Config\n",
79
- "You need to fill this in with your desired repos. Note I used 5 for the `MAX_WORKERS` since `jina-embeddings-v2` are quite memory hungry. "
80
- ]
81
- },
82
- {
83
- "cell_type": "code",
84
- "execution_count": 2,
85
- "id": "dcd7daed-6aca-4fe7-85ce-534bdcd8bc87",
86
- "metadata": {
87
- "tags": []
88
- },
89
- "outputs": [],
90
- "source": [
91
- "dataset_in = 'derek-thomas/dataset-creator-reddit-bestofredditorupdates'\n",
92
- "dataset_out = \"processed-bestofredditorupdates\"\n",
93
- "endpoint_name = \"boru-jina-embeddings-demo\"\n",
94
- "\n",
95
- "MAX_WORKERS = 5 "
96
- ]
97
- },
98
- {
99
- "cell_type": "code",
100
- "execution_count": 3,
101
- "id": "88cdbd73-5923-4ae9-9940-b6be935f70fa",
102
- "metadata": {
103
- "tags": []
104
- },
105
- "outputs": [
106
- {
107
- "name": "stdin",
108
- "output_type": "stream",
109
- "text": [
110
- "What is your Hugging Face 🤗 username? (with a credit card) ········\n",
111
- "What is your Hugging Face 🤗 token? ········\n"
112
- ]
113
- }
114
- ],
115
- "source": [
116
- "username = getpass(prompt=\"What is your Hugging Face 🤗 username? (with an added payment method)\")\n",
117
- "hf_token = getpass(prompt='What is your Hugging Face 🤗 token?')"
118
- ]
119
- },
120
- {
121
- "cell_type": "markdown",
122
- "id": "b972a719-2aed-4d2e-a24f-fae7776d5fa4",
123
- "metadata": {},
124
- "source": [
125
- "## Get Dataset"
126
- ]
127
- },
128
- {
129
- "cell_type": "code",
130
- "execution_count": 4,
131
- "id": "27835fa4-3a4f-44b1-a02a-5e31584a1bba",
132
- "metadata": {
133
- "tags": []
134
- },
135
- "outputs": [
136
- {
137
- "data": {
138
- "text/plain": [
139
- "Dataset({\n",
140
- " features: ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'content_length', 'score'],\n",
141
- " num_rows: 9991\n",
142
- "})"
143
- ]
144
- },
145
- "execution_count": 4,
146
- "metadata": {},
147
- "output_type": "execute_result"
148
- }
149
- ],
150
- "source": [
151
- "dataset = load_dataset(dataset_in, token=hf_token)\n",
152
- "dataset['train']"
153
- ]
154
- },
155
- {
156
- "cell_type": "code",
157
- "execution_count": 5,
158
- "id": "8846087e-4d0d-4c0e-8aeb-ea95d9e97126",
159
- "metadata": {
160
- "tags": []
161
- },
162
- "outputs": [
163
- {
164
- "data": {
165
- "text/plain": [
166
- "(9991,\n",
167
- " {'date_utc': Timestamp('2022-12-31 18:16:22'),\n",
168
- " 'title': 'To All BORU contributors, Thank you :)',\n",
169
- " 'flair': 'CONCLUDED',\n",
170
- " 'content': '[removed]',\n",
171
- " 'poster': 'IsItAcOnSeQuEnCe',\n",
172
- " 'permalink': '/r/BestofRedditorUpdates/comments/10004zw/to_all_boru_contributors_thank_you/',\n",
173
- " 'id': '10004zw',\n",
174
- " 'content_length': 9,\n",
175
- " 'score': 1})"
176
- ]
177
- },
178
- "execution_count": 5,
179
- "metadata": {},
180
- "output_type": "execute_result"
181
- }
182
- ],
183
- "source": [
184
- "documents = dataset['train'].to_pandas().to_dict('records')\n",
185
- "len(documents), documents[0]"
186
- ]
187
- },
188
- {
189
- "cell_type": "markdown",
190
- "id": "93096cbc-81c6-4137-a283-6afb0f48fbb9",
191
- "metadata": {},
192
- "source": [
193
- "# Inference Endpoints\n",
194
- "## Create Inference Endpoint\n",
195
- "We are going to use the [API](https://huggingface.co/docs/inference-endpoints/api_reference) to create an [Inference Endpoint](https://huggingface.co/inference-endpoints). This should provide a few main benefits:\n",
196
- "- It's convenient (No clicking)\n",
197
- "- It's repeatable (We have the code to run it easily)\n",
198
- "- It's cheaper (No time spent waiting for it to load, and automatically shut it down)"
199
- ]
200
- },
201
- {
202
- "cell_type": "code",
203
- "execution_count": 6,
204
- "id": "3a8f67b9-6ac6-4b5e-91ee-e48463191e1b",
205
- "metadata": {
206
- "tags": []
207
- },
208
- "outputs": [],
209
- "source": [
210
- "headers = {\n",
211
- "\t\"Authorization\": f\"Bearer {hf_token}\",\n",
212
- "\t\"Content-Type\": \"application/json\"\n",
213
- "}\n",
214
- "base_url = f\"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}\"\n",
215
- "endpoint_url = f\"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}/{endpoint_name}\""
216
- ]
217
- },
218
- {
219
- "cell_type": "markdown",
220
- "id": "0f2c97dc-34e8-49e9-b60e-f5b7366294c0",
221
- "metadata": {},
222
- "source": [
223
- "There are a few design choices here:\n",
224
- "- I'm using the `g5.2xlarge` since it is big and `jina-embeddings-v2` are memory hungry (remember the 8k context length). \n",
225
- "- I didn't alter the default `MAX_BATCH_TOKENS` or `MAX_CONCURRENT_REQUESTS`\n",
226
- " - You should consider this if you are making this production ready\n",
227
- " - You will need to restrict these to match the HW you are running on\n",
228
- "- As mentioned before, I chose the repo and the corresponding revision\n"
229
- ]
230
- },
231
- {
232
- "cell_type": "code",
233
- "execution_count": 7,
234
- "id": "f1ea29cb-b69d-4340-859f-3646d650c68e",
235
- "metadata": {
236
- "tags": []
237
- },
238
- "outputs": [
239
- {
240
- "name": "stdout",
241
- "output_type": "stream",
242
- "text": [
243
- "202\n"
244
- ]
245
- }
246
- ],
247
- "source": [
248
- "data = {\n",
249
- " \"accountId\": None,\n",
250
- " \"compute\": {\n",
251
- " \"accelerator\": \"gpu\",\n",
252
- " \"instanceType\": \"g5.2xlarge\",\n",
253
- " \"instanceSize\": \"medium\",\n",
254
- " \"scaling\": {\n",
255
- " \"maxReplica\": 1,\n",
256
- " \"minReplica\": 1\n",
257
- " }\n",
258
- " },\n",
259
- " \"model\": {\n",
260
- " \"framework\": \"pytorch\",\n",
261
- " \"image\": {\n",
262
- " \"custom\": {\n",
263
- " \"url\": \"ghcr.io/huggingface/text-embeddings-inference:0.3.0\",\n",
264
- " \"health_route\": \"/health\",\n",
265
- " \"env\": {\n",
266
- " \"MAX_BATCH_TOKENS\": \"16384\",\n",
267
- " \"MAX_CONCURRENT_REQUESTS\": \"512\",\n",
268
- " \"MODEL_ID\": \"/repository\"\n",
269
- " }\n",
270
- " }\n",
271
- " },\n",
272
- " \"repository\": \"jinaai/jina-embeddings-v2-base-en\",\n",
273
- " \"revision\": \"8705ed9657208b2d5220fffad1c3a30980d279d0\",\n",
274
- " \"task\": \"sentence-embeddings\",\n",
275
- " },\n",
276
- " \"name\": endpoint_name,\n",
277
- " \"provider\": {\n",
278
- " \"region\": \"us-east-1\",\n",
279
- " \"vendor\": \"aws\"\n",
280
- " },\n",
281
- " \"type\": \"protected\"\n",
282
- "}\n",
283
- "\n",
284
- "response = requests.post(base_url, headers={**headers, 'accept': 'application/json'}, json=data)\n",
285
- "\n",
286
- "\n",
287
- "print(response.status_code)"
288
- ]
289
- },
290
- {
291
- "cell_type": "markdown",
292
- "id": "96d173b2-8980-4554-9039-c62843d3fc7d",
293
- "metadata": {},
294
- "source": [
295
- "## Wait until it's running\n",
296
- "Here we use `tqdm` as a pretty way of displaying our status. It took about ~30s for this model to get the Inference Endpoint running."
297
- ]
298
- },
299
- {
300
- "cell_type": "code",
301
- "execution_count": 8,
302
- "id": "b8aa66a9-3c8a-4040-9465-382c744f36cf",
303
- "metadata": {
304
- "tags": []
305
- },
306
- "outputs": [
307
- {
308
- "data": {
309
- "application/vnd.jupyter.widget-view+json": {
310
- "model_id": "a6f27d86f68b4000aa40e09ae079c6b0",
311
- "version_major": 2,
312
- "version_minor": 0
313
- },
314
- "text/plain": [
315
- "Waiting for status to change: 0s [00:00, ?s/s]"
316
- ]
317
- },
318
- "metadata": {},
319
- "output_type": "display_data"
320
- },
321
- {
322
- "name": "stdout",
323
- "output_type": "stream",
324
- "text": [
325
- "Status is 'running'.\n"
326
- ]
327
- }
328
- ],
329
- "source": [
330
- "with tqdm(desc=\"Waiting for status to change\", unit=\"s\") as pbar:\n",
331
- " while True:\n",
332
- " response_json = requests.get(endpoint_url, headers=headers).json()\n",
333
- " current_status = response_json['status']['state']\n",
334
- "\n",
335
- " if current_status == 'running':\n",
336
- " print(\"Status is 'running'.\")\n",
337
- " break\n",
338
- "\n",
339
- " pbar.set_description(f\"Status: {current_status}\")\n",
340
- " time.sleep(2)\n",
341
- " pbar.update(1)\n",
342
- "\n",
343
- "embedding_url = response_json['status']['url']"
344
- ]
345
- },
346
- {
347
- "cell_type": "markdown",
348
- "id": "063fa066-e4d0-4a65-a82d-cf17db4af8d8",
349
- "metadata": {},
350
- "source": [
351
- "I found that even though the status is running, I want to get a test message to run first before running our batch in parallel."
352
- ]
353
- },
354
- {
355
- "cell_type": "code",
356
- "execution_count": 9,
357
- "id": "66e00960-1d3d-490d-bedc-3eaf1924db76",
358
- "metadata": {},
359
- "outputs": [
360
- {
361
- "data": {
362
- "application/vnd.jupyter.widget-view+json": {
363
- "model_id": "4e03e5a3d07a498ca6b3631605724b62",
364
- "version_major": 2,
365
- "version_minor": 0
366
- },
367
- "text/plain": [
368
- "Waiting for endpoint to accept requests: 0s [00:00, ?s/s]"
369
- ]
370
- },
371
- "metadata": {},
372
- "output_type": "display_data"
373
- },
374
- {
375
- "name": "stdout",
376
- "output_type": "stream",
377
- "text": [
378
- "Endpoint is accepting requests\n"
379
- ]
380
- }
381
- ],
382
- "source": [
383
- "payload = {\"inputs\": \"This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music!\"}\n",
384
- "\n",
385
- "with tqdm(desc=\"Waiting for endpoint to accept requests\", unit=\"s\") as pbar:\n",
386
- " while True:\n",
387
- " try:\n",
388
- " response_json = requests.post(embedding_url, headers=headers, json=payload).json()\n",
389
- "\n",
390
- " # Assuming the successful response has a specific structure\n",
391
- " if len(response_json[0]) == 768:\n",
392
- " print(\"Endpoint is accepting requests\")\n",
393
- " break\n",
394
- "\n",
395
- " except requests.ConnectionError as e:\n",
396
- " pass\n",
397
- "\n",
398
- " # Delay between retries\n",
399
- " time.sleep(5)\n",
400
- " pbar.update(1)\n"
401
- ]
402
- },
403
- {
404
- "cell_type": "markdown",
405
- "id": "f7186126-ef6a-47d0-b158-112810649cd9",
406
- "metadata": {},
407
- "source": [
408
- "# Get Embeddings"
409
- ]
410
- },
411
- {
412
- "cell_type": "markdown",
413
- "id": "1dadfd68-6d46-4ce8-a165-bfeb43b1f114",
414
- "metadata": {},
415
- "source": [
416
- "Here I send a document, update it with the embedding, and return it. This happens in parallel with `MAX_WORKERS`."
417
- ]
418
- },
419
- {
420
- "cell_type": "code",
421
- "execution_count": 10,
422
- "id": "ad3193fb-3def-42a8-968e-c63f2b864ca8",
423
- "metadata": {
424
- "tags": []
425
- },
426
- "outputs": [],
427
- "source": [
428
- "async def request(document, semaphore):\n",
429
- " # Semaphore guard\n",
430
- " async with semaphore:\n",
431
- " payload = {\n",
432
- " \"inputs\": document['content'] or document['title'] or '[deleted]',\n",
433
- " \"truncate\": True\n",
434
- " }\n",
435
- " \n",
436
- " timeout = ClientTimeout(total=10) # Set a timeout for requests (10 seconds here)\n",
437
- "\n",
438
- " async with ClientSession(timeout=timeout, headers=headers) as session:\n",
439
- " async with session.post(embedding_url, json=payload) as resp:\n",
440
- " if resp.status != 200:\n",
441
- " raise RuntimeError(await resp.text())\n",
442
- " result = await resp.json()\n",
443
- " \n",
444
- " document['embedding'] = result[0] # Assuming the API's output can be directly assigned\n",
445
- " return document\n",
446
- "\n",
447
- "async def main(documents):\n",
448
- " # Semaphore to limit concurrent requests. Adjust the number as needed.\n",
449
- " semaphore = asyncio.BoundedSemaphore(MAX_WORKERS)\n",
450
- "\n",
451
- " # Creating a list of tasks\n",
452
- " tasks = [request(document, semaphore) for document in documents]\n",
453
- " \n",
454
- " # Using tqdm to show progress. It's been integrated into the async loop.\n",
455
- " for f in tqdm(asyncio.as_completed(tasks), total=len(documents)):\n",
456
- " await f"
457
- ]
458
- },
459
- {
460
- "cell_type": "code",
461
- "execution_count": 11,
462
- "id": "ec4983af-65eb-4841-808a-3738fb4d682d",
463
- "metadata": {
464
- "tags": []
465
- },
466
- "outputs": [
467
- {
468
- "data": {
469
- "application/vnd.jupyter.widget-view+json": {
470
- "model_id": "cb73af52244e40d2aab8bdac3a55d443",
471
- "version_major": 2,
472
- "version_minor": 0
473
- },
474
- "text/plain": [
475
- " 0%| | 0/9991 [00:00<?, ?it/s]"
476
- ]
477
- },
478
- "metadata": {},
479
- "output_type": "display_data"
480
- },
481
- {
482
- "name": "stdout",
483
- "output_type": "stream",
484
- "text": [
485
- "Embeddings = 9991 documents = 9991\n",
486
- "32 min 14.53 sec\n"
487
- ]
488
- }
489
- ],
490
- "source": [
491
- "start = time.perf_counter()\n",
492
- "\n",
493
- "# Get embeddings\n",
494
- "await main(documents)\n",
495
- "\n",
496
- "# Make sure we got it all\n",
497
- "count = 0\n",
498
- "for document in documents:\n",
499
- " if document['embedding'] and len(document['embedding']) == 768:\n",
500
- " count += 1\n",
501
- "print(f'Embeddings = {count} documents = {len(documents)}')\n",
502
- "\n",
503
- " \n",
504
- "# Print elapsed time\n",
505
- "elapsed_time = time.perf_counter() - start\n",
506
- "minutes, seconds = divmod(elapsed_time, 60)\n",
507
- "print(f\"{int(minutes)} min {seconds:.2f} sec\")"
508
- ]
509
- },
510
- {
511
- "cell_type": "markdown",
512
- "id": "bab97c7b-7bac-4bf5-9752-b528294dadc7",
513
- "metadata": {},
514
- "source": [
515
- "## Pause Inference Endpoint\n",
516
- "Now that we have finished, let's pause the endpoint so we don't incur any extra charges. This will also allow us to analyze the cost."
517
- ]
518
- },
519
- {
520
- "cell_type": "code",
521
- "execution_count": 12,
522
- "id": "540a0978-7670-4ce3-95c1-3823cc113b85",
523
- "metadata": {
524
- "tags": []
525
- },
526
- "outputs": [
527
- {
528
- "name": "stdout",
529
- "output_type": "stream",
530
- "text": [
531
- "200\n",
532
- "paused\n"
533
- ]
534
- }
535
- ],
536
- "source": [
537
- "response = requests.post(endpoint_url + '/pause', headers=headers)\n",
538
- "\n",
539
- "print(response.status_code)\n",
540
- "print(response.json()['status']['state'])"
541
- ]
542
- },
543
- {
544
- "cell_type": "markdown",
545
- "id": "45ad65b7-3da2-4113-9b95-8fb4e21ae793",
546
- "metadata": {},
547
- "source": [
548
- "# Push updated dataset to Hub\n",
549
- "We now have our documents updated with the embeddings we wanted. First we need to convert them back to a `Dataset` format. I find it's easiest to go from list of dicts -> `pd.DataFrame` -> `Dataset`."
550
- ]
551
- },
552
- {
553
- "cell_type": "code",
554
- "execution_count": 13,
555
- "id": "9bb993f8-d624-4192-9626-8e9ed9888a1b",
556
- "metadata": {
557
- "tags": []
558
- },
559
- "outputs": [],
560
- "source": [
561
- "df = pd.DataFrame(documents)\n",
562
- "dd = DatasetDict({'train': Dataset.from_pandas(df)})"
563
- ]
564
- },
565
- {
566
- "cell_type": "code",
567
- "execution_count": 14,
568
- "id": "f48e7c55-d5b7-4ed6-8516-272ae38716b1",
569
- "metadata": {
570
- "tags": []
571
- },
572
- "outputs": [
573
- {
574
- "data": {
575
- "application/vnd.jupyter.widget-view+json": {
576
- "model_id": "84a481e0cf74494cb2eb9d9857701212",
577
- "version_major": 2,
578
- "version_minor": 0
579
- },
580
- "text/plain": [
581
- "Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
582
- ]
583
- },
584
- "metadata": {},
585
- "output_type": "display_data"
586
- },
587
- {
588
- "data": {
589
- "application/vnd.jupyter.widget-view+json": {
590
- "model_id": "b8f128dfe7c546bcbc8f04817e3ca48c",
591
- "version_major": 2,
592
- "version_minor": 0
593
- },
594
- "text/plain": [
595
- "Creating parquet from Arrow format: 0%| | 0/10 [00:00<?, ?ba/s]"
596
- ]
597
- },
598
- "metadata": {},
599
- "output_type": "display_data"
600
- },
601
- {
602
- "data": {
603
- "application/vnd.jupyter.widget-view+json": {
604
- "model_id": "2dcc1d54036a49f1a1346a6be64e765a",
605
- "version_major": 2,
606
- "version_minor": 0
607
- },
608
- "text/plain": [
609
- "Upload 1 LFS files: 0%| | 0/1 [00:00<?, ?it/s]"
610
- ]
611
- },
612
- "metadata": {},
613
- "output_type": "display_data"
614
- }
615
- ],
616
- "source": [
617
- "dd.push_to_hub(dataset_out, token=hf_token)"
618
- ]
619
- },
620
- {
621
- "cell_type": "markdown",
622
- "id": "41abea64-379d-49de-8d9a-355c2f4ce1ac",
623
- "metadata": {},
624
- "source": [
625
- "# Analyze Usage\n",
626
- "1. Go to your `dashboard_url` printed below\n",
627
- "1. Click on the Usage & Cost tab\n",
628
- "1. See how much you have spent"
629
- ]
630
- },
631
- {
632
- "cell_type": "code",
633
- "execution_count": 15,
634
- "id": "16815445-3079-43da-b14e-b54176a07a62",
635
- "metadata": {},
636
- "outputs": [
637
- {
638
- "name": "stdout",
639
- "output_type": "stream",
640
- "text": [
641
- "https://ui.endpoints.huggingface.co/HF-test-lab/endpoints/boru-jina-embeddings-demo\n"
642
- ]
643
- }
644
- ],
645
- "source": [
646
- "dashboard_url = f'https://ui.endpoints.huggingface.co/{username}/endpoints/{endpoint_name}'\n",
647
- "print(dashboard_url)"
648
- ]
649
- },
650
- {
651
- "cell_type": "code",
652
- "execution_count": 16,
653
- "id": "81096c6f-d12f-4781-84ec-9066cfa465b3",
654
- "metadata": {},
655
- "outputs": [
656
- {
657
- "name": "stdin",
658
- "output_type": "stream",
659
- "text": [
660
- "Hit enter to continue with the notebook \n"
661
- ]
662
- },
663
- {
664
- "data": {
665
- "text/plain": [
666
- "''"
667
- ]
668
- },
669
- "execution_count": 16,
670
- "metadata": {},
671
- "output_type": "execute_result"
672
- }
673
- ],
674
- "source": [
675
- "input(\"Hit enter to continue with the notebook\")"
676
- ]
677
- },
678
- {
679
- "cell_type": "markdown",
680
- "id": "847d524e-9aa6-4a6f-a275-8a552e289818",
681
- "metadata": {},
682
- "source": [
683
- "We can see that it only took `$0.71` to pay for this!\n",
684
- "\n",
685
- "![Cost](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates/resolve/main/media/automatic-embeddings-cost.png)"
686
- ]
687
- },
688
- {
689
- "cell_type": "markdown",
690
- "id": "b953d5be-2494-4ff8-be42-9daf00c99c41",
691
- "metadata": {},
692
- "source": [
693
- "# Delete Endpoint\n",
694
- "We should see a `200` if everything went correctly."
695
- ]
696
- },
697
- {
698
- "cell_type": "code",
699
- "execution_count": 17,
700
- "id": "c310c0f3-6f12-4d5c-838b-3a4c1f2e54ad",
701
- "metadata": {
702
- "tags": []
703
- },
704
- "outputs": [
705
- {
706
- "name": "stdout",
707
- "output_type": "stream",
708
- "text": [
709
- "200\n"
710
- ]
711
- }
712
- ],
713
- "source": [
714
- "response = requests.delete(endpoint_url, headers=headers)\n",
715
- "\n",
716
- "print(response.status_code)"
717
- ]
718
- },
719
- {
720
- "cell_type": "code",
721
- "execution_count": null,
722
- "id": "5db1b1c3-16c3-403a-9472-a97e730826d5",
723
- "metadata": {},
724
- "outputs": [],
725
- "source": []
726
- }
727
- ],
728
- "metadata": {
729
- "kernelspec": {
730
- "display_name": "Python 3 (ipykernel)",
731
- "language": "python",
732
- "name": "python3"
733
- },
734
- "language_info": {
735
- "codemirror_mode": {
736
- "name": "ipython",
737
- "version": 3
738
- },
739
- "file_extension": ".py",
740
- "mimetype": "text/x-python",
741
- "name": "python",
742
- "nbconvert_exporter": "python",
743
- "pygments_lexer": "ipython3",
744
- "version": "3.10.8"
745
- }
746
- },
747
- "nbformat": 4,
748
- "nbformat_minor": 5
749
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  aiohttp==3.8.3
2
  datasets==2.14.6
3
- huggingface-hub==0.15.1
4
  pandas==1.5.3
5
  requests==2.31.0
6
  tqdm==4.66.1
 
1
  aiohttp==3.8.3
2
  datasets==2.14.6
3
+ huggingface-hub==0.19.4
4
  pandas==1.5.3
5
  requests==2.31.0
6
  tqdm==4.66.1
src/my_logger.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+
4
def setup_logger(name: str) -> logging.Logger:
    """Return a DEBUG-level logger that writes to 'mylog.log' and the console.

    The call is idempotent: if the named logger already has handlers attached,
    it is returned as-is, so repeated calls with the same name do not stack
    duplicate handlers (which would emit every record several times).

    Args:
        name: Logger name, typically the calling module's ``__name__``.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so handlers
    # added by an earlier call are still attached; do not add them again.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Create a file handler to write logs to a file
    file_handler = logging.FileHandler('mylog.log')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Create a stream handler to write logs to the console
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    return logger
src/utilities.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datasets import Dataset, DownloadMode, load_dataset
6
+ from gradio_client import Client
7
+
8
+ from src.my_logger import setup_logger
9
+
10
# Required environment configuration (KeyError at import time if unset).
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]

# Source dataset repo id, derived from the user and subreddit names.
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

# Gradio client for the Space that serves the embedding endpoint.
client = Client("derek-thomas/nomic-embeddings")
logger = setup_logger(__name__)
17
+
18
+
19
async def load_datasets():
    """Download fresh copies of the processed and original datasets.

    Both datasets are fetched with a forced re-download, the processed one
    first, with a debug log line before and after each fetch.

    Returns:
        A ``(dataset, original_dataset)`` tuple: the result of loading
        ``PROCESSED_DATASET`` followed by the result of loading ``OG_DATASET``.
    """

    def _fetch(repo_id):
        # Download one dataset, logging around the call.
        logger.debug(f"Trying to download {repo_id}")
        loaded = load_dataset(repo_id, download_mode=DownloadMode.FORCE_REDOWNLOAD)
        logger.debug(f"Loaded {repo_id}")
        return loaded

    dataset = _fetch(PROCESSED_DATASET)
    original_dataset = _fetch(OG_DATASET)
    return dataset, original_dataset
29
+
30
+
31
def merge_and_update_datasets(dataset, original_dataset):
    """Merge the original data into the processed dataset and fill in embeddings.

    Rows are matched on ``id``. A row's embedding is recomputed when its
    content differs between the two datasets (including rows that exist only
    in the original dataset) or when no embedding is present after the merge.

    Args:
        dataset: Processed dataset; its ``'train'`` split must provide the
            ``id``, ``content`` and ``embedding`` columns.
        original_dataset: Source dataset; its ``'train'`` split must provide
            at least ``id`` and ``content``.

    Returns:
        ``dataset`` with its ``'train'`` split replaced by the merged data.
    """
    odf = original_dataset['train'].to_pandas()
    df = dataset['train'].to_pandas()

    # Step 1: Left-merge df onto odf so every original row is kept. Columns
    # from odf that clash get the '_odf' suffix; df's keep their plain names.
    merged_df = pd.merge(odf, df[['id', 'content', 'embedding']], on='id', how='left', suffixes=('_odf', ''))
    content_changed = merged_df['content_odf'] != merged_df['content']
    updated_rows = int(content_changed.sum())

    # Step 2: Invalidate the embedding wherever the content changed (rows that
    # are new in odf have NaN content from df, so they compare as changed).
    merged_df['embedding'] = np.where(content_changed, None, merged_df['embedding'])

    # Step 3: Cleanup - keep odf's content and drop df's stale copy.
    # 'new' and 'updated' are presumably bookkeeping columns of the original
    # dataset (TODO confirm); errors='ignore' avoids a KeyError when they are
    # not present.
    merged_df = merged_df.drop(columns=['content', 'new', 'updated'], errors='ignore')
    merged_df.rename(columns={'content_odf': 'content'}, inplace=True)

    logger.info(f"Updating {updated_rows} rows...")
    # Compute an embedding for each row that currently has none
    for index, row in merged_df[merged_df['embedding'].isnull()].iterrows():
        merged_df.at[index, 'embedding'] = update_embeddings(row['content'])

    dataset['train'] = Dataset.from_pandas(merged_df)
    logger.info(f"Updated {updated_rows} rows")
    return dataset
58
+
59
+
60
def update_embeddings(content):
    """Return the embedding for ``content`` as a NumPy array.

    The embedding is requested from the module-level Gradio ``client`` via
    its "/embed" API.
    """
    return np.array(client.predict(content, api_name="/embed"))
src/visualize_logs.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from rich.console import Console
3
+ from rich.syntax import Syntax
4
+
5
+
6
def log_file_to_html_string():
    """Render the tail of 'mylog.log' as syntax-highlighted, scrollable HTML.

    The last ``num_lines_visualize`` lines of the log file are highlighted
    with rich (Python lexer, Monokai theme), exported to HTML, and
    post-processed so the ``<pre>`` block uses a scrollable CSS class.

    Returns:
        The prettified HTML document as a string.
    """
    # Local import keeps this block self-contained.
    from collections import deque

    log_file = "mylog.log"
    num_lines_visualize = 50

    console = Console(record=True, width=150, style="#272822")
    with open(log_file, "rt") as f:
        # Keep only the last `num_lines_visualize` lines while streaming the
        # file, instead of holding every line in memory.
        lines = deque(f, maxlen=num_lines_visualize)

    # Syntax-highlight the retained lines using the Python lexer and Monokai style
    output = "".join(lines)
    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)

    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'lxml')

    # Modify the <pre> tag: swap its inline style for the .scrollable class
    pre_tag = soup.pre
    pre_tag['class'] = 'scrollable'
    del pre_tag['style']

    # Add the custom styles and the .scrollable CSS to the <style> tag
    style_tag = soup.style
    style_content = """
    pre, code {
        background-color: #272822;
    }
    .scrollable {
        font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
        height: 500px;
        overflow: auto;
    }
    """
    style_tag.append(style_content)

    return soup.prettify()