---
# General processor settings.
# NOTE(review): key meanings are inferred from their names only — confirm
# against the code that loads this file.
processor:
  verbose: true
  output_dir: './output'
  num_processes: 2
  reprocess: false

# Per-source settings, one sub-mapping per source type.
# Values such as 'your_...' and 'path/to/...' are placeholders: replace them
# locally and keep real credentials out of version control.
sources:
  local:
    recursive: true
  confluence:
    api_token: 'your_confluence_api_token'
    # Placeholder address (the previous value was a garbled, unusable string).
    user_email: 'your_email@example.com'
    url: 'https://your-confluence-url.atlassian.net'
  github:
    # NOTE(review): written in 'owner/repo' form rather than as a full URL —
    # confirm this is what the consumer expects.
    url: 'owner/repo'
    branch: 'main'
  google_drive:
    service_account_key: 'path/to/service_account_key.json'
    recursive: true
    drive_id: 'your_drive_id'

# Partitioning settings.
# NOTE(review): option semantics are defined by the consuming code and the
# partitioning backend, not by this file — verify before changing values.
partitioning:
  skip_infer_table_types: []
  strategy: 'auto'
  hi_res_model_name: 'yolox'
  ocr_languages: ['eng']
  encoding: 'utf-8'
  fields_include: ['element_id', 'text', 'type', 'metadata', 'embeddings']
  flatten_metadata: false
  metadata_exclude: []
  metadata_include: []
  partition_endpoint: 'http://localhost'
  # NOTE(review): presumably the port used together with partition_endpoint —
  # confirm.
  unstructured_port: 8005
  partition_by_api: false
  # Quoted so the value always loads as a string, whatever characters the real
  # key contains. Looks like a placeholder; do not commit a real API key here.
  default_unstructured_api_key: '123456789abcde'

# Chunking settings.
# NOTE(review): the size-related values are presumably character counts, going
# by the key names — confirm against the consumer.
chunking:
  enabled: true
  strategy: 'by_title'
  chunk_max_characters: 1500
  chunk_overlap: 300
  combine_under_n_chars: 1500

# Embedding settings; disabled in this configuration (enabled: false).
embedding:
  enabled: false
  provider: 'langchain-huggingface'
  model_name: 'intfloat/e5-large-v2'

# Destination connector settings; disabled in this configuration.
# NOTE(review): `type` presumably selects which of the sub-mappings below
# (chroma / qdrant) is used — confirm against the consumer.
destination_connectors:
  enabled: false
  type: 'chroma'
  batch_size: 80
  chroma:
    host: 'localhost'
    port: 8004
    collection_name: 'snconf'
    tenant: 'default_tenant'
    database: 'default_database'
  qdrant:
    location: 'http://localhost:6333'
    collection_name: 'test'

# Additional processing flags.
# NOTE(review): `table_text_key` presumably names the metadata field used when
# `replace_table_text` is on — confirm against the consumer.
additional_processing:
  enabled: true
  extend_metadata: true
  replace_table_text: true
  table_text_key: 'text_as_html'
  return_langchain_docs: true
  convert_metadata_keys_to_string: true