File size: 15,399 Bytes
a6c26b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is the repo dir /Users/kwasia/Documents/Projects/ai-starter-kit\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "current_dir = os.getcwd()\n",
    "kit_dir = os.path.abspath(os.path.join(current_dir, '..'))\n",
    "repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))\n",
    "\n",
    "sys.path.append(kit_dir)\n",
    "sys.path.append(repo_dir)\n",
    "\n",
    "print(f'This is the repo dir {repo_dir}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load DotEnv\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv('../../.env')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils.parsing.sambaparse import SambaParse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Case 1 - Process a Single File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-06-20 16:15:20,971 - INFO - Deleting contents of output directory: ./output\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-06-20 16:15:20,995 - INFO - Running command: unstructured-ingest local --output-dir ./output --num-processes 2 --strategy auto --ocr-languages eng --encoding utf-8 --fields-include element_id,text,type,metadata,embeddings --metadata-exclude  --metadata-include  --pdf-infer-table-structure --input-path \"./test_docs/samba_turbo.pdf\" --recursive --verbose --partition-by-api --api-key ******* --partition-endpoint http://localhost:8005 --pdf-infer-table-structure --chunking-strategy basic --chunk-max-characters 1500 --chunk-overlap 300\n",
      "2024-06-20 16:15:20,996 - INFO - This may take some time depending on the size of your data. Please be patient...\n",
      "2024-06-20 16:15:20,996 - INFO - This may take some time depending on the size of your data. Please be patient...\n",
      "/Users/kwasia/.pyenv/versions/sambaparse/lib/python3.10/site-packages/dataclasses_json/core.py:201: RuntimeWarning: 'NoneType' object value of non-optional type additional_partition_args detected when decoding CliPartitionConfig.\n",
      "  warnings.warn(\n",
      "2024-06-20 16:15:22,908 MainProcess INFO     running pipeline: DocFactory -> Reader -> Partitioner -> Chunker -> Copier with config: {\"reprocess\": false, \"verbose\": true, \"work_dir\": \"/Users/kwasia/.cache/unstructured/ingest/pipeline\", \"output_dir\": \"./output\", \"num_processes\": 2, \"raise_on_error\": false}\n",
      "2024-06-20 16:15:24,658 MainProcess INFO     Running doc factory to generate ingest docs. Source connector: {\"processor_config\": {\"reprocess\": false, \"verbose\": true, \"work_dir\": \"/Users/kwasia/.cache/unstructured/ingest/pipeline\", \"output_dir\": \"./output\", \"num_processes\": 2, \"raise_on_error\": false}, \"read_config\": {\"download_dir\": null, \"re_download\": false, \"preserve_downloads\": false, \"download_only\": false, \"max_docs\": null}, \"connector_config\": {\"input_path\": \"./test_docs/samba_turbo.pdf\", \"recursive\": true, \"file_glob\": null}}\n",
      "2024-06-20 16:15:24,661 MainProcess INFO     processing 1 docs via 2 processes\n",
      "2024-06-20 16:15:24,661 MainProcess INFO     Calling Reader with 1 docs\n",
      "2024-06-20 16:15:24,661 MainProcess INFO     Running source node to download data associated with ingest docs\n",
      "2024-06-20 16:15:26,511 SpawnPoolWorker-3 INFO     File exists: test_docs/samba_turbo.pdf, skipping download\n",
      "2024-06-20 16:15:26,522 MainProcess INFO     Calling Partitioner with 1 docs\n",
      "2024-06-20 16:15:26,523 MainProcess INFO     Running partition node to extract content from json files. Config: {\"pdf_infer_table_structure\": true, \"strategy\": \"auto\", \"ocr_languages\": [\"eng\"], \"encoding\": \"utf-8\", \"additional_partition_args\": null, \"skip_infer_table_types\": null, \"fields_include\": [\"element_id\", \"text\", \"type\", \"metadata\", \"embeddings\"], \"flatten_metadata\": false, \"metadata_exclude\": [\"--metadata-include\"], \"metadata_include\": [], \"partition_endpoint\": \"http://localhost:8005\", \"partition_by_api\": true, \"api_key\": \"*******\", \"hi_res_model_name\": null}, partition kwargs: {}]\n",
      "2024-06-20 16:15:26,523 MainProcess INFO     Creating /Users/kwasia/.cache/unstructured/ingest/pipeline/partitioned\n",
      "2024-06-20 16:15:28,387 SpawnPoolWorker-4 INFO     Processing test_docs/samba_turbo.pdf\n",
      "2024-06-20 16:15:29,836 SpawnPoolWorker-4 DEBUG    Using remote partition (http://localhost:8005)\n",
      "2024-06-20 16:15:40,244 SpawnPoolWorker-4 INFO     writing partitioned content to /Users/kwasia/.cache/unstructured/ingest/pipeline/partitioned/eb87c25354d57b8c7434994ca9c3f796.json\n",
      "2024-06-20 16:15:40,254 MainProcess INFO     Calling Chunker with 1 docs\n",
      "2024-06-20 16:15:40,255 MainProcess INFO     Running chunking node. Chunking config: {\"chunking_strategy\": \"basic\", \"combine_text_under_n_chars\": null, \"include_orig_elements\": true, \"max_characters\": 1500, \"multipage_sections\": true, \"new_after_n_chars\": null, \"overlap\": 300, \"overlap_all\": false}]\n",
      "2024-06-20 16:15:40,255 MainProcess INFO     Creating /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked\n",
      "2024-06-20 16:15:42,318 SpawnPoolWorker-6 INFO     writing chunking content to /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked/df2636b5a36c11e91958dfd7ae81ddb1.json\n",
      "2024-06-20 16:15:42,323 MainProcess INFO     Calling Copier with 1 docs\n",
      "2024-06-20 16:15:42,323 MainProcess INFO     Running copy node to move content to desired output location\n",
      "2024-06-20 16:15:44,114 SpawnPoolWorker-9 INFO     Copying /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked/df2636b5a36c11e91958dfd7ae81ddb1.json -> output/samba_turbo.pdf.json\n",
      "2024-06-20 16:15:44,320 - INFO - Ingest process completed successfully!\n",
      "2024-06-20 16:15:44,321 - INFO - Performing additional processing...\n",
      "2024-06-20 16:15:44,324 - INFO - Additional processing completed.\n"
     ]
    }
   ],
   "source": [
    "config_yaml = './config.yaml'\n",
    "sambaparse = SambaParse(config_yaml)\n",
    "\n",
    "source_type = 'local'\n",
    "input_path = './test_docs/samba_turbo.pdf'\n",
    "additional_metadata = {'key': 'value'}\n",
    "\n",
    "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
    "    source_type, input_path=input_path, additional_metadata=additional_metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is the length of the langchain docs 5\n",
      "This is an example langchain doc \n",
      "\n",
      " page_content=\"6/20/24, 3:23 PM\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nG\\\\SambaNovar\\n\\nEN\\n\\nBACK TO RESOURCES\\n\\n<\\n\\nPREVIOUS | NEXT\\n\\n>\\n\\nMay 29, 2024\\n\\njn\\n\\nNX\\n\\nfF\\n\\nBS\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nSambaNova is the clear winner of the latest large language model LLM benchmark by Artificial Analysis. Topping the Leaderboad at over 1000 tokens per second (t/s), Samba-1 Turbo sets a new record for Llama 3 8B performance on a single SN40L node and with full precision.\\n\\nWith speeds like this, enterprises can expect to accelerate an array of use cases and will enable innovation around unblocking agentic workflow, copilot, and synthetic data, to name a few. This breakthrough in AI technology is possible because the purpose-built SambaNova SN40L Reconfigurable Dataflow Unit RDU can hold hundreds of models at the same time and can switch between them in microseconds.\\n\\nSpeed for today and tomorrow\" metadata={'filename': 'samba_turbo.pdf', 'filetype': 'application/pdf', 'languages': 'eng', 'page_number': '1', 'orig_elements': 
'eJzVl21v2zYQx7/KwW+2AV7DJ1FUMQxI22wrlqZFHrYCbVHw4WhzkSVBkut63b77jvYejCJF7BdDkleCyBN597v/Hak3nyZY4wKb8X0Kk8cwKWzl0TimbZBSVEWprZZRSx20YbJwkylMFjjaYEdL9p8mMdXY2AXmjwe7cPb9uOxd+6gLMdvm6XHdbaZt19XJ2zG1zdHf07VtZks7w4Hm30ywmU3e0WhHI++b5cJhT+P8Txoa8eOY19BHgh0JNQX5WEh49SIv8s/6P6EN9AWZfx5VjIWpKvRWRFlZ6dEJxr3SElEFr8xdR3WRtzhrP1iY2wFc315jA+McgTPGYDyiMdv3CfvHsJqvIY1fDWDBpRkEtDXEtgcKFvuuTwPC8fP9qJQlE8YV0ToeXamKqiAuKI3RJTNFFe6CymakPyBzuxh/fPv2X5L9LoPLNNZ4EwIUpQqxRKYL4RwtaQxHdMYrFSup7lzuu0E8X5DdTUGEGENRVbIqfCU1s0wFGazTVMGFJnZ3HcTJ2X56VFgq6jWoKMOlM0FykmMMEjk6LoW4D3q8lfVu4E+On/4Mly/h/OTi5dX505OLvTQZjUWunZeclwKDFJ4JpqMzUflKlXgfMNyaql0M3+2GfdXQ9jhr+/Q7hstscQMCxlUUUikUSjkjnePCCGFtYJE77x+eEl6dn/zy/OXVBfwBZyevL/fSQSWo+3tqzRELaniyUpoTFKkq5JWLd9KbPodwa6J2IXx/sA5Ky6pYeMZZITPlUAmnOTFBI6lC2IND8MKuQVRTEEyoXRpndLyTKx/wSyQsL5UpiAcLwUWLRisTSsmCcVYUgd8HEgdVxG/NXkVQBCtj0JJjLA1TvhROBaF9QR3IBV48uLjPXu8VNzeVQDR05SF5Oead08JjENFLr6R4ePmOP+wVty+rUvsouLHcM7qHlbzAwAXngU4Zey8Ov8PuAPsd+nQniq5iOieawi4dD5F5Q1WvqkKZ+ODi/j9/ab58dQqs0oXWBT0tp0opQwx0I6GyQS+re3Fe3JrpmymmYUPP12h7WKWmwR7auBmr6SAdRnr0s/yy9QkWbcAaTk9fgMPGzxe2vwa3huN+TDH5RIyPG1uvhzQ8gsu261Iz26x2urmfu9YGsCO0H2ifbcpyBgfo6H1A3zYBvqYsfjOFjY/fcrjM5GhuzIlscAU9mfVhk8rT2i4sSDBP8gI0srCNR2gbMh1o5xrh4kyxU2jIa7C0+CqNc4jLuoaO1kkDYX900ImpSAUycG1N6Zig25NkvIraMvqLlE7ci4o6SAu/ZiJDhxgGqNM1UrbSMN2pkgG8bQA/duhHShdY7wkJUcpEIfNaZ8ksqZ68zeZbzoSY4naUAlIVSS1HRdbtkmaXjatbf521QZE0Y/KwavvrWLerKfi2S3U7TjfrDOuG1JMNMtBp3j/DpPxGXJHCyFfqAmivxzktPZvTZlTVMKKfN23dztZZ4V07DCl74uiPP/uZBdktexrHb90y1SP8VxFbwZxnLcY0W/abEJ7R5tk7uGrSCOfPrjZM5m0dYE4B9RkeMdgUx5AFnncYsqNjWmyVlz8YSH5+Tm6MK9z2rUV2eJF8327VPxymRsOYDFHooL1WwmjDCxc8liW6KnJu74MaD+vvWYib2h7bQMLK5MZ20fZ9u7qhV7/7C2wCbXA=', 'key': 'value', 'type': 'CompositeElement', 'element_id': '34922f62e3c3e7600d32eb0627b79202', 'page': '1'}\n"
     ]
    }
   ],
   "source": [
    "# Inspect the Output\n",
    "\n",
    "# 1. Number of Chunks\n",
    "print(f'This is the length of the langchain docs {len(langchain_docs)}')\n",
    "\n",
    "# 2. Example Chunk\n",
    "print(f'This is an example langchain doc \\n\\n {langchain_docs[0]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Case 2 - Process Whole Directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config_yaml = './config.yaml'\n",
    "sambaparse = SambaParse(config_yaml)\n",
    "\n",
    "source_type = 'local'\n",
    "input_path = './test_docs'\n",
    "additional_metadata = {'key': 'value'}\n",
    "\n",
    "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
    "    source_type, input_path=input_path, additional_metadata=additional_metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is the length of the langchain docs 44\n",
      "This is an example langchain doc \n",
      "\n",
      " page_content=\"6/20/24, 3:23 PM\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nG\\\\SambaNovar\\n\\nEN\\n\\nBACK TO RESOURCES\\n\\n<\\n\\nPREVIOUS | NEXT\\n\\n>\\n\\nMay 29, 2024\\n\\njn\\n\\nNX\\n\\nfF\\n\\nBS\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nSambaNova is the clear winner of the latest large language model LLM benchmark by Artificial Analysis. Topping the Leaderboad at over 1000 tokens per second (t/s), Samba-1 Turbo sets a new record for Llama 3 8B performance on a single SN40L node and with full precision.\\n\\nWith speeds like this, enterprises can expect to accelerate an array of use cases and will enable innovation around unblocking agentic workflow, copilot, and synthetic data, to name a few. This breakthrough in AI technology is possible because the purpose-built SambaNova SN40L Reconfigurable Dataflow Unit RDU can hold hundreds of models at the same time and can switch between them in microseconds.\\n\\nSpeed for today and tomorrow\" metadata={'filename': 'samba_turbo.pdf', 'filetype': 'application/pdf', 'languages': 'eng', 'page_number': '1', 'orig_elements': 
'eJzVl21v2zYQx7/KwW+2AV7DJ1FUMQxI22wrlqZFHrYCbVHw4WhzkSVBkut63b77jvYejCJF7BdDkleCyBN597v/Hak3nyZY4wKb8X0Kk8cwKWzl0TimbZBSVEWprZZRSx20YbJwkylMFjjaYEdL9p8mMdXY2AXmjwe7cPb9uOxd+6gLMdvm6XHdbaZt19XJ2zG1zdHf07VtZks7w4Hm30ywmU3e0WhHI++b5cJhT+P8Txoa8eOY19BHgh0JNQX5WEh49SIv8s/6P6EN9AWZfx5VjIWpKvRWRFlZ6dEJxr3SElEFr8xdR3WRtzhrP1iY2wFc315jA+McgTPGYDyiMdv3CfvHsJqvIY1fDWDBpRkEtDXEtgcKFvuuTwPC8fP9qJQlE8YV0ToeXamKqiAuKI3RJTNFFe6CymakPyBzuxh/fPv2X5L9LoPLNNZ4EwIUpQqxRKYL4RwtaQxHdMYrFSup7lzuu0E8X5DdTUGEGENRVbIqfCU1s0wFGazTVMGFJnZ3HcTJ2X56VFgq6jWoKMOlM0FykmMMEjk6LoW4D3q8lfVu4E+On/4Mly/h/OTi5dX505OLvTQZjUWunZeclwKDFJ4JpqMzUflKlXgfMNyaql0M3+2GfdXQ9jhr+/Q7hstscQMCxlUUUikUSjkjnePCCGFtYJE77x+eEl6dn/zy/OXVBfwBZyevL/fSQSWo+3tqzRELaniyUpoTFKkq5JWLd9KbPodwa6J2IXx/sA5Ky6pYeMZZITPlUAmnOTFBI6lC2IND8MKuQVRTEEyoXRpndLyTKx/wSyQsL5UpiAcLwUWLRisTSsmCcVYUgd8HEgdVxG/NXkVQBCtj0JJjLA1TvhROBaF9QR3IBV48uLjPXu8VNzeVQDR05SF5Oead08JjENFLr6R4ePmOP+wVty+rUvsouLHcM7qHlbzAwAXngU4Zey8Ov8PuAPsd+nQniq5iOieawi4dD5F5Q1WvqkKZ+ODi/j9/ab58dQqs0oXWBT0tp0opQwx0I6GyQS+re3Fe3JrpmymmYUPP12h7WKWmwR7auBmr6SAdRnr0s/yy9QkWbcAaTk9fgMPGzxe2vwa3huN+TDH5RIyPG1uvhzQ8gsu261Iz26x2urmfu9YGsCO0H2ifbcpyBgfo6H1A3zYBvqYsfjOFjY/fcrjM5GhuzIlscAU9mfVhk8rT2i4sSDBP8gI0srCNR2gbMh1o5xrh4kyxU2jIa7C0+CqNc4jLuoaO1kkDYX900ImpSAUycG1N6Zig25NkvIraMvqLlE7ci4o6SAu/ZiJDhxgGqNM1UrbSMN2pkgG8bQA/duhHShdY7wkJUcpEIfNaZ8ksqZ68zeZbzoSY4naUAlIVSS1HRdbtkmaXjatbf521QZE0Y/KwavvrWLerKfi2S3U7TjfrDOuG1JMNMtBp3j/DpPxGXJHCyFfqAmivxzktPZvTZlTVMKKfN23dztZZ4V07DCl74uiPP/uZBdktexrHb90y1SP8VxFbwZxnLcY0W/abEJ7R5tk7uGrSCOfPrjZM5m0dYE4B9RkeMdgUx5AFnncYsqNjWmyVlz8YSH5+Tm6MK9z2rUV2eJF8327VPxymRsOYDFHooL1WwmjDCxc8liW6KnJu74MaD+vvWYib2h7bQMLK5MZ20fZ9u7qhV7/7C2wCbXA=', 'key': 'value', 'type': 'CompositeElement', 'element_id': '34922f62e3c3e7600d32eb0627b79202', 'page': '1'}\n"
     ]
    }
   ],
   "source": [
    "# Inspect the Output\n",
    "\n",
    "# 1. Number of Chunks\n",
    "print(f'This is the length of the langchain docs {len(langchain_docs)}')\n",
    "\n",
    "# 2. Example Chunk\n",
    "print(f'This is an example langchain doc \\n\\n {langchain_docs[0]}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aisk-fine-tune-embeddings",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}