File size: 15,399 Bytes
a6c26b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is the repo dir /Users/kwasia/Documents/Projects/ai-starter-kit\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "current_dir = os.getcwd()\n",
    "kit_dir = os.path.abspath(os.path.join(current_dir, '..'))\n",
    "repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))\n",
    "\n",
    "sys.path.append(kit_dir)\n",
    "sys.path.append(repo_dir)\n",
    "\n",
    "print(f'This is the repo dir {repo_dir}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load DotEnv\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv('../../.env')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils.parsing.sambaparse import SambaParse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Case 1 - Process a Single File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-06-20 16:15:20,971 - INFO - Deleting contents of output directory: ./output\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-06-20 16:15:20,995 - INFO - Running command: unstructured-ingest local --output-dir ./output --num-processes 2 --strategy auto --ocr-languages eng --encoding utf-8 --fields-include element_id,text,type,metadata,embeddings --metadata-exclude  --metadata-include  --pdf-infer-table-structure --input-path \"./test_docs/samba_turbo.pdf\" --recursive --verbose --partition-by-api --api-key ******* --partition-endpoint http://localhost:8005 --pdf-infer-table-structure --chunking-strategy basic --chunk-max-characters 1500 --chunk-overlap 300\n",
      "2024-06-20 16:15:20,996 - INFO - This may take some time depending on the size of your data. Please be patient...\n",
      "2024-06-20 16:15:20,996 - INFO - This may take some time depending on the size of your data. Please be patient...\n",
      "/Users/kwasia/.pyenv/versions/sambaparse/lib/python3.10/site-packages/dataclasses_json/core.py:201: RuntimeWarning: 'NoneType' object value of non-optional type additional_partition_args detected when decoding CliPartitionConfig.\n",
      "  warnings.warn(\n",
      "2024-06-20 16:15:22,908 MainProcess INFO     running pipeline: DocFactory -> Reader -> Partitioner -> Chunker -> Copier with config: {\"reprocess\": false, \"verbose\": true, \"work_dir\": \"/Users/kwasia/.cache/unstructured/ingest/pipeline\", \"output_dir\": \"./output\", \"num_processes\": 2, \"raise_on_error\": false}\n",
      "2024-06-20 16:15:24,658 MainProcess INFO     Running doc factory to generate ingest docs. Source connector: {\"processor_config\": {\"reprocess\": false, \"verbose\": true, \"work_dir\": \"/Users/kwasia/.cache/unstructured/ingest/pipeline\", \"output_dir\": \"./output\", \"num_processes\": 2, \"raise_on_error\": false}, \"read_config\": {\"download_dir\": null, \"re_download\": false, \"preserve_downloads\": false, \"download_only\": false, \"max_docs\": null}, \"connector_config\": {\"input_path\": \"./test_docs/samba_turbo.pdf\", \"recursive\": true, \"file_glob\": null}}\n",
      "2024-06-20 16:15:24,661 MainProcess INFO     processing 1 docs via 2 processes\n",
      "2024-06-20 16:15:24,661 MainProcess INFO     Calling Reader with 1 docs\n",
      "2024-06-20 16:15:24,661 MainProcess INFO     Running source node to download data associated with ingest docs\n",
      "2024-06-20 16:15:26,511 SpawnPoolWorker-3 INFO     File exists: test_docs/samba_turbo.pdf, skipping download\n",
      "2024-06-20 16:15:26,522 MainProcess INFO     Calling Partitioner with 1 docs\n",
      "2024-06-20 16:15:26,523 MainProcess INFO     Running partition node to extract content from json files. Config: {\"pdf_infer_table_structure\": true, \"strategy\": \"auto\", \"ocr_languages\": [\"eng\"], \"encoding\": \"utf-8\", \"additional_partition_args\": null, \"skip_infer_table_types\": null, \"fields_include\": [\"element_id\", \"text\", \"type\", \"metadata\", \"embeddings\"], \"flatten_metadata\": false, \"metadata_exclude\": [\"--metadata-include\"], \"metadata_include\": [], \"partition_endpoint\": \"http://localhost:8005\", \"partition_by_api\": true, \"api_key\": \"*******\", \"hi_res_model_name\": null}, partition kwargs: {}]\n",
      "2024-06-20 16:15:26,523 MainProcess INFO     Creating /Users/kwasia/.cache/unstructured/ingest/pipeline/partitioned\n",
      "2024-06-20 16:15:28,387 SpawnPoolWorker-4 INFO     Processing test_docs/samba_turbo.pdf\n",
      "2024-06-20 16:15:29,836 SpawnPoolWorker-4 DEBUG    Using remote partition (http://localhost:8005)\n",
      "2024-06-20 16:15:40,244 SpawnPoolWorker-4 INFO     writing partitioned content to /Users/kwasia/.cache/unstructured/ingest/pipeline/partitioned/eb87c25354d57b8c7434994ca9c3f796.json\n",
      "2024-06-20 16:15:40,254 MainProcess INFO     Calling Chunker with 1 docs\n",
      "2024-06-20 16:15:40,255 MainProcess INFO     Running chunking node. Chunking config: {\"chunking_strategy\": \"basic\", \"combine_text_under_n_chars\": null, \"include_orig_elements\": true, \"max_characters\": 1500, \"multipage_sections\": true, \"new_after_n_chars\": null, \"overlap\": 300, \"overlap_all\": false}]\n",
      "2024-06-20 16:15:40,255 MainProcess INFO     Creating /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked\n",
      "2024-06-20 16:15:42,318 SpawnPoolWorker-6 INFO     writing chunking content to /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked/df2636b5a36c11e91958dfd7ae81ddb1.json\n",
      "2024-06-20 16:15:42,323 MainProcess INFO     Calling Copier with 1 docs\n",
      "2024-06-20 16:15:42,323 MainProcess INFO     Running copy node to move content to desired output location\n",
      "2024-06-20 16:15:44,114 SpawnPoolWorker-9 INFO     Copying /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked/df2636b5a36c11e91958dfd7ae81ddb1.json -> output/samba_turbo.pdf.json\n",
      "2024-06-20 16:15:44,320 - INFO - Ingest process completed successfully!\n",
      "2024-06-20 16:15:44,321 - INFO - Performing additional processing...\n",
      "2024-06-20 16:15:44,324 - INFO - Additional processing completed.\n"
     ]
    }
   ],
   "source": [
    "config_yaml = './config.yaml'\n",
    "sambaparse = SambaParse(config_yaml)\n",
    "\n",
    "source_type = 'local'\n",
    "input_path = './test_docs/samba_turbo.pdf'\n",
    "additional_metadata = {'key': 'value'}\n",
    "\n",
    "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
    "    source_type, input_path=input_path, additional_metadata=additional_metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is the length of the langchain docs 5\n",
      "This is an example langchain doc \n",
      "\n",
      " page_content=\"6/20/24, 3:23 PM\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nG\\\\SambaNovar\\n\\nEN\\n\\nBACK TO RESOURCES\\n\\n<\\n\\nPREVIOUS | NEXT\\n\\n>\\n\\nMay 29, 2024\\n\\njn\\n\\nNX\\n\\nfF\\n\\nBS\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nSambaNova is the clear winner of the latest large language model LLM benchmark by Artificial Analysis. Topping the Leaderboad at over 1000 tokens per second (t/s), Samba-1 Turbo sets a new record for Llama 3 8B performance on a single SN40L node and with full precision.\\n\\nWith speeds like this, enterprises can expect to accelerate an array of use cases and will enable innovation around unblocking agentic workflow, copilot, and synthetic data, to name a few. This breakthrough in AI technology is possible because the purpose-built SambaNova SN40L Reconfigurable Dataflow Unit RDU can hold hundreds of models at the same time and can switch between them in microseconds.\\n\\nSpeed for today and tomorrow\" metadata={'filename': 'samba_turbo.pdf', 'filetype': 'application/pdf', 'languages': 'eng', 'page_number': '1', 'orig_elements': 
'eJzVl21v2zYQx7/KwW+2AV7DJ1FUMQxI22wrlqZFHrYCbVHw4WhzkSVBkut63b77jvYejCJF7BdDkleCyBN597v/Hak3nyZY4wKb8X0Kk8cwKWzl0TimbZBSVEWprZZRSx20YbJwkylMFjjaYEdL9p8mMdXY2AXmjwe7cPb9uOxd+6gLMdvm6XHdbaZt19XJ2zG1zdHf07VtZks7w4Hm30ywmU3e0WhHI++b5cJhT+P8Txoa8eOY19BHgh0JNQX5WEh49SIv8s/6P6EN9AWZfx5VjIWpKvRWRFlZ6dEJxr3SElEFr8xdR3WRtzhrP1iY2wFc315jA+McgTPGYDyiMdv3CfvHsJqvIY1fDWDBpRkEtDXEtgcKFvuuTwPC8fP9qJQlE8YV0ToeXamKqiAuKI3RJTNFFe6CymakPyBzuxh/fPv2X5L9LoPLNNZ4EwIUpQqxRKYL4RwtaQxHdMYrFSup7lzuu0E8X5DdTUGEGENRVbIqfCU1s0wFGazTVMGFJnZ3HcTJ2X56VFgq6jWoKMOlM0FykmMMEjk6LoW4D3q8lfVu4E+On/4Mly/h/OTi5dX505OLvTQZjUWunZeclwKDFJ4JpqMzUflKlXgfMNyaql0M3+2GfdXQ9jhr+/Q7hstscQMCxlUUUikUSjkjnePCCGFtYJE77x+eEl6dn/zy/OXVBfwBZyevL/fSQSWo+3tqzRELaniyUpoTFKkq5JWLd9KbPodwa6J2IXx/sA5Ky6pYeMZZITPlUAmnOTFBI6lC2IND8MKuQVRTEEyoXRpndLyTKx/wSyQsL5UpiAcLwUWLRisTSsmCcVYUgd8HEgdVxG/NXkVQBCtj0JJjLA1TvhROBaF9QR3IBV48uLjPXu8VNzeVQDR05SF5Oead08JjENFLr6R4ePmOP+wVty+rUvsouLHcM7qHlbzAwAXngU4Zey8Ov8PuAPsd+nQniq5iOieawi4dD5F5Q1WvqkKZ+ODi/j9/ab58dQqs0oXWBT0tp0opQwx0I6GyQS+re3Fe3JrpmymmYUPP12h7WKWmwR7auBmr6SAdRnr0s/yy9QkWbcAaTk9fgMPGzxe2vwa3huN+TDH5RIyPG1uvhzQ8gsu261Iz26x2urmfu9YGsCO0H2ifbcpyBgfo6H1A3zYBvqYsfjOFjY/fcrjM5GhuzIlscAU9mfVhk8rT2i4sSDBP8gI0srCNR2gbMh1o5xrh4kyxU2jIa7C0+CqNc4jLuoaO1kkDYX900ImpSAUycG1N6Zig25NkvIraMvqLlE7ci4o6SAu/ZiJDhxgGqNM1UrbSMN2pkgG8bQA/duhHShdY7wkJUcpEIfNaZ8ksqZ68zeZbzoSY4naUAlIVSS1HRdbtkmaXjatbf521QZE0Y/KwavvrWLerKfi2S3U7TjfrDOuG1JMNMtBp3j/DpPxGXJHCyFfqAmivxzktPZvTZlTVMKKfN23dztZZ4V07DCl74uiPP/uZBdktexrHb90y1SP8VxFbwZxnLcY0W/abEJ7R5tk7uGrSCOfPrjZM5m0dYE4B9RkeMdgUx5AFnncYsqNjWmyVlz8YSH5+Tm6MK9z2rUV2eJF8327VPxymRsOYDFHooL1WwmjDCxc8liW6KnJu74MaD+vvWYib2h7bQMLK5MZ20fZ9u7qhV7/7C2wCbXA=', 'key': 'value', 'type': 'CompositeElement', 'element_id': '34922f62e3c3e7600d32eb0627b79202', 'page': '1'}\n"
     ]
    }
   ],
   "source": [
    "# Inspect the Output\n",
    "\n",
    "# 1. Number of Chunks\n",
    "print(f'This is the length of the langchain docs {len(langchain_docs)}')\n",
    "\n",
    "# 2. Example Chunk\n",
    "print(f'This is an example langchain doc \\n\\n {langchain_docs[0]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Case 2 - Process Whole Directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config_yaml = './config.yaml'\n",
    "sambaparse = SambaParse(config_yaml)\n",
    "\n",
    "source_type = 'local'\n",
    "input_path = './test_docs'\n",
    "additional_metadata = {'key': 'value'}\n",
    "\n",
    "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
    "    source_type, input_path=input_path, additional_metadata=additional_metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is the length of the langchain docs 44\n",
      "This is an example langchain doc \n",
      "\n",
      " page_content=\"6/20/24, 3:23 PM\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nG\\\\SambaNovar\\n\\nEN\\n\\nBACK TO RESOURCES\\n\\n<\\n\\nPREVIOUS | NEXT\\n\\n>\\n\\nMay 29, 2024\\n\\njn\\n\\nNX\\n\\nfF\\n\\nBS\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nSambaNova is the clear winner of the latest large language model LLM benchmark by Artificial Analysis. Topping the Leaderboad at over 1000 tokens per second (t/s), Samba-1 Turbo sets a new record for Llama 3 8B performance on a single SN40L node and with full precision.\\n\\nWith speeds like this, enterprises can expect to accelerate an array of use cases and will enable innovation around unblocking agentic workflow, copilot, and synthetic data, to name a few. This breakthrough in AI technology is possible because the purpose-built SambaNova SN40L Reconfigurable Dataflow Unit RDU can hold hundreds of models at the same time and can switch between them in microseconds.\\n\\nSpeed for today and tomorrow\" metadata={'filename': 'samba_turbo.pdf', 'filetype': 'application/pdf', 'languages': 'eng', 'page_number': '1', 'orig_elements': 
'eJzVl21v2zYQx7/KwW+2AV7DJ1FUMQxI22wrlqZFHrYCbVHw4WhzkSVBkut63b77jvYejCJF7BdDkleCyBN597v/Hak3nyZY4wKb8X0Kk8cwKWzl0TimbZBSVEWprZZRSx20YbJwkylMFjjaYEdL9p8mMdXY2AXmjwe7cPb9uOxd+6gLMdvm6XHdbaZt19XJ2zG1zdHf07VtZks7w4Hm30ywmU3e0WhHI++b5cJhT+P8Txoa8eOY19BHgh0JNQX5WEh49SIv8s/6P6EN9AWZfx5VjIWpKvRWRFlZ6dEJxr3SElEFr8xdR3WRtzhrP1iY2wFc315jA+McgTPGYDyiMdv3CfvHsJqvIY1fDWDBpRkEtDXEtgcKFvuuTwPC8fP9qJQlE8YV0ToeXamKqiAuKI3RJTNFFe6CymakPyBzuxh/fPv2X5L9LoPLNNZ4EwIUpQqxRKYL4RwtaQxHdMYrFSup7lzuu0E8X5DdTUGEGENRVbIqfCU1s0wFGazTVMGFJnZ3HcTJ2X56VFgq6jWoKMOlM0FykmMMEjk6LoW4D3q8lfVu4E+On/4Mly/h/OTi5dX505OLvTQZjUWunZeclwKDFJ4JpqMzUflKlXgfMNyaql0M3+2GfdXQ9jhr+/Q7hstscQMCxlUUUikUSjkjnePCCGFtYJE77x+eEl6dn/zy/OXVBfwBZyevL/fSQSWo+3tqzRELaniyUpoTFKkq5JWLd9KbPodwa6J2IXx/sA5Ky6pYeMZZITPlUAmnOTFBI6lC2IND8MKuQVRTEEyoXRpndLyTKx/wSyQsL5UpiAcLwUWLRisTSsmCcVYUgd8HEgdVxG/NXkVQBCtj0JJjLA1TvhROBaF9QR3IBV48uLjPXu8VNzeVQDR05SF5Oead08JjENFLr6R4ePmOP+wVty+rUvsouLHcM7qHlbzAwAXngU4Zey8Ov8PuAPsd+nQniq5iOieawi4dD5F5Q1WvqkKZ+ODi/j9/ab58dQqs0oXWBT0tp0opQwx0I6GyQS+re3Fe3JrpmymmYUPP12h7WKWmwR7auBmr6SAdRnr0s/yy9QkWbcAaTk9fgMPGzxe2vwa3huN+TDH5RIyPG1uvhzQ8gsu261Iz26x2urmfu9YGsCO0H2ifbcpyBgfo6H1A3zYBvqYsfjOFjY/fcrjM5GhuzIlscAU9mfVhk8rT2i4sSDBP8gI0srCNR2gbMh1o5xrh4kyxU2jIa7C0+CqNc4jLuoaO1kkDYX900ImpSAUycG1N6Zig25NkvIraMvqLlE7ci4o6SAu/ZiJDhxgGqNM1UrbSMN2pkgG8bQA/duhHShdY7wkJUcpEIfNaZ8ksqZ68zeZbzoSY4naUAlIVSS1HRdbtkmaXjatbf521QZE0Y/KwavvrWLerKfi2S3U7TjfrDOuG1JMNMtBp3j/DpPxGXJHCyFfqAmivxzktPZvTZlTVMKKfN23dztZZ4V07DCl74uiPP/uZBdktexrHb90y1SP8VxFbwZxnLcY0W/abEJ7R5tk7uGrSCOfPrjZM5m0dYE4B9RkeMdgUx5AFnncYsqNjWmyVlz8YSH5+Tm6MK9z2rUV2eJF8327VPxymRsOYDFHooL1WwmjDCxc8liW6KnJu74MaD+vvWYib2h7bQMLK5MZ20fZ9u7qhV7/7C2wCbXA=', 'key': 'value', 'type': 'CompositeElement', 'element_id': '34922f62e3c3e7600d32eb0627b79202', 'page': '1'}\n"
     ]
    }
   ],
   "source": [
    "# Inspect the Output\n",
    "\n",
    "# 1. Number of Chunks\n",
    "print(f'This is the length of the langchain docs {len(langchain_docs)}')\n",
    "\n",
    "# 2. Example Chunk\n",
    "print(f'This is an example langchain doc \\n\\n {langchain_docs[0]}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aisk-fine-tune-embeddings",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}