File size: 1,632 Bytes
a6c26b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# General processor options.
# NOTE(review): key meanings are inferred from their names; confirm against
# the code that loads this file.
processor:
  verbose: true
  output_dir: './output'
  num_processes: 2
  reprocess: false

# Per-source connector settings, one sub-section per source.
# Values such as 'your_...', 'owner/repo' and 'path/to/...' are placeholders
# to be replaced per deployment. Keep real tokens and key files out of
# version control.
sources:
  local:
    recursive: true
  confluence:
    api_token: 'your_confluence_api_token'
    # Placeholder address; replace with the account email for the token above.
    user_email: 'your_email@example.com'
    url: 'https://your-confluence-url.atlassian.net'
  github:
    url: 'owner/repo'
    branch: 'main'
  google_drive:
    service_account_key: 'path/to/service_account_key.json'
    recursive: true
    drive_id: 'your_drive_id'

# Document partitioning options.
# NOTE(review): key names look like Unstructured partition parameters;
# confirm accepted values against the consumer.
partitioning:
  skip_infer_table_types: []
  strategy: 'auto'
  hi_res_model_name: 'yolox'
  ocr_languages: ['eng']
  encoding: 'utf-8'
  fields_include: ['element_id', 'text', 'type', 'metadata', 'embeddings']
  flatten_metadata: false
  metadata_exclude: []
  metadata_include: []
  partition_endpoint: 'http://localhost'
  unstructured_port: 8005
  # Set to true if using the API server.
  partition_by_api: false
  # Quoted so the key always loads as a string, even if it is replaced by an
  # all-digit value. The value here looks like a placeholder.
  default_unstructured_api_key: '123456789abcde'

# Chunking options; `enabled` switches this stage on or off.
# NOTE(review): sizes are presumably in characters, per the key names; confirm.
chunking:
  enabled: true
  strategy: 'by_title'
  chunk_max_characters: 1500
  chunk_overlap: 300
  combine_under_n_chars: 1500

# Embedding options; disabled in this configuration.
embedding:
  enabled: false
  provider: 'langchain-huggingface'
  model_name: 'intfloat/e5-large-v2'

# Destination (vector store) options; disabled in this configuration.
# NOTE(review): `type` presumably selects which sub-section below
# (`chroma` or `qdrant`) is used; confirm against the consumer.
destination_connectors:
  enabled: false
  type: 'chroma'
  batch_size: 80
  chroma:
    host: 'localhost'
    port: 8004
    collection_name: 'snconf'
    tenant: 'default_tenant'
    database: 'default_database'
  qdrant:
    location: 'http://localhost:6333'
    collection_name: 'test'

# Post-processing options; every flag is enabled in this configuration.
# NOTE(review): flag meanings are inferred from their names; confirm against
# the consumer.
additional_processing:
  enabled: true
  extend_metadata: true
  replace_table_text: true
  table_text_key: 'text_as_html'
  return_langchain_docs: true
  convert_metadata_keys_to_string: true