{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SambaParse usage examples\n",
    "\n",
    "This notebook shows how to call `SambaParse.run_ingest` on local input:\n",
    "\n",
    "1. **Use Case 1** processes a single file.\n",
    "2. **Use Case 2** processes a whole directory.\n",
    "\n",
    "Run the cells from top to bottom on a fresh kernel.\n",
    "\n",
    "**Before committing:** clear the cell outputs. The ingest logs echo the full command line, including the value passed to `--api-key`, as well as absolute local paths."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "from dotenv import load_dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# kit_dir is one level above the working directory, repo_dir is two levels above.\n",
    "current_dir = os.getcwd()\n",
    "kit_dir = os.path.abspath(os.path.join(current_dir, '..'))\n",
    "repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))\n",
    "\n",
    "# Add each directory only once so re-running this cell does not grow sys.path.\n",
    "for extra_path in (kit_dir, repo_dir):\n",
    "    if extra_path not in sys.path:\n",
    "        sys.path.append(extra_path)\n",
    "\n",
    "print(f'This is the repo dir {repo_dir}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load DotEnv\n",
    "\n",
    "# Reuse repo_dir instead of repeating the '../../' hops.\n",
    "# load_dotenv returns a bool, which is shown as this cell's output.\n",
    "load_dotenv(os.path.join(repo_dir, '.env'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported after the sys.path setup above (the added paths are expected to make `utils` importable).\n",
    "from utils.parsing.sambaparse import SambaParse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_ingest_summary(docs, preview_chars=500):\n",
    "    \"\"\"Print the number of langchain docs and a short preview of the first one.\n",
    "\n",
    "    Args:\n",
    "        docs: The `langchain_docs` list returned by `SambaParse.run_ingest`.\n",
    "            Each item is expected to expose `page_content` and `metadata`.\n",
    "        preview_chars: Maximum number of `page_content` characters to print.\n",
    "    \"\"\"\n",
    "    # 1. Number of Chunks\n",
    "    print(f'This is the length of the langchain docs {len(docs)}')\n",
    "\n",
    "    # Guard the empty case: indexing docs[0] would raise IndexError.\n",
    "    if not docs:\n",
    "        print('No docs were produced; check the input path and the ingest logs.')\n",
    "        return\n",
    "\n",
    "    # 2. Example Chunk\n",
    "    first_doc = docs[0]\n",
    "    text_preview = first_doc.page_content[:preview_chars]\n",
    "    # 'orig_elements' holds a very long encoded string; skip it to keep the output readable.\n",
    "    metadata_preview = {\n",
    "        key: value for key, value in first_doc.metadata.items() if key != 'orig_elements'\n",
    "    }\n",
    "    print('This is an example langchain doc \\n')\n",
    "    print(f'page_content={text_preview!r}')\n",
    "    print(f'metadata={metadata_preview}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use Case 1 - Process a Single File\n",
    "\n",
    "For reference, an earlier saved run of this notebook reported 5 docs for `samba_turbo.pdf`; the count will vary with the chunking configuration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config_yaml = './config.yaml'\n",
    "sambaparse = SambaParse(config_yaml)\n",
    "\n",
    "source_type = 'local'\n",
    "input_path = './test_docs/samba_turbo.pdf'\n",
    "# These key/value pairs show up in each doc's metadata (see the inspection cell below).\n",
    "additional_metadata = {'key': 'value'}\n",
    "\n",
    "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
    "    source_type, input_path=input_path, additional_metadata=additional_metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the Output\n",
    "show_ingest_summary(langchain_docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use Case 2 - Process Whole Directory\n",
    "\n",
    "For reference, an earlier saved run of this notebook reported 44 docs for `./test_docs`; the count depends on the directory contents and the chunking configuration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config_yaml = './config.yaml'\n",
    "sambaparse = SambaParse(config_yaml)\n",
    "\n",
    "source_type = 'local'\n",
    "input_path = './test_docs'\n",
    "additional_metadata = {'key': 'value'}\n",
    "\n",
    "# Note: this rebinds texts, metadata_list and langchain_docs from Use Case 1.\n",
    "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
    "    source_type, input_path=input_path, additional_metadata=additional_metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the Output\n",
    "show_ingest_summary(langchain_docs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aisk-fine-tune-embeddings",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}