Spaces:

arslan-ahmed
/

talk-to-your-docs

Sleeping

App Files Files Community

talk-to-your-docs / ttyd_functions.py

arslan-ahmed

added mode type

ed9ad5e about 1 year ago

raw

history blame

10.2 kB


	import datetime
	import uuid
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import Chroma
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	import os
	from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
	from whatsapp_chat_custom import WhatsAppChatLoader # use this instead of from langchain.document_loaders import WhatsAppChatLoader

	from collections import deque
	import re
	from bs4 import BeautifulSoup
	import requests
	from urllib.parse import urlparse
	import mimetypes
	from pathlib import Path
	import tiktoken

	# Pattern used to recognise absolute http(s) links.
	HTTP_URL_PATTERN = r'^http[s]*://.+'

	# File extensions from the mimetypes table whose MIME type is image, video or audio.
	mimetypes.init()
	media_files = tuple(
	    ext
	    for ext, mime in mimetypes.types_map.items()
	    if mime.partition('/')[0] in ('image', 'video', 'audio')
	)

	# Links containing any of these substrings are discarded while collecting hyperlinks.
	filter_strings = ['/email-protection#']


	def transformApi(api_key=''):
	    """Resolve the OpenAI API key to use for a user-supplied value.

	    Args:
	        api_key: Value supplied by the user: a real API key, the shared
	            temporary password, an empty string or None.

	    Returns:
	        The string 'Null' when nothing was supplied, the value of the
	        OPENAI_API_KEY environment variable when ``api_key`` equals the
	        TEMP_PWD environment variable, otherwise ``api_key`` unchanged.
	    """
	    # Reject empty input first: os.getenv returns None when TEMP_PWD is unset,
	    # so comparing before this check would let None (or '' when TEMP_PWD is
	    # set to an empty string) unlock the server-side key.
	    if api_key is None or api_key == '':
	        return 'Null'
	    if api_key == os.getenv("TEMP_PWD"):
	        return os.getenv("OPENAI_API_KEY")
	    return api_key

	def get_hyperlinks(url, timeout=30):
	    """Fetch ``url`` and return the raw href of every anchor tag on the page.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        A list of href strings exactly as they appear in the page. An empty
	        list is returned when the request fails, the response status is
	        4xx/5xx, or the response is not ``text/html``.
	    """
	    try:
	        # Without a timeout, requests waits indefinitely on an unresponsive host.
	        reqs = requests.get(url, timeout=timeout)
	        # The header may be absent; treat that the same as "not HTML".
	        content_type = reqs.headers.get('Content-Type') or ''
	        if not content_type.startswith("text/html") or 400 <= reqs.status_code < 600:
	            return []
	        soup = BeautifulSoup(reqs.text, 'html.parser')
	    except Exception as e:
	        # Best-effort crawl: report the problem and skip this page.
	        print(e)
	        return []

	    return [link.get('href') for link in soup.find_all('a', href=True)]


	# Function to get the hyperlinks from a URL that are within the same domain
	def get_domain_hyperlinks(local_domain, url):
	    """Return the links found on ``url`` that belong to ``local_domain``.

	    Absolute http(s) links are kept only when their host equals
	    ``local_domain`` (ignoring any 'www.'). Other links are treated as
	    relative: they are appended to ``url`` when ``url`` contains
	    'wp-content/uploads', otherwise to ``https://<local_domain>/``.
	    Fragment, query-only and non-navigational links (mailto:, tel:,
	    javascript:, data:) are skipped, as is any link containing one of
	    ``filter_strings``.

	    Args:
	        local_domain: Host name that links must belong to.
	        url: Page whose hyperlinks are collected via ``get_hyperlinks``.

	    Returns:
	        A de-duplicated list of cleaned links, in no particular order.
	    """
	    # href prefixes that never lead to a crawlable page
	    skip_prefixes = ("#", "?", "mailto:", "tel:", "javascript:", "data:")
	    clean_links = set()
	    for link in set(get_hyperlinks(url)):
	        clean_link = None

	        # A scheme-relative link ("//host/path") is an absolute URL without its
	        # scheme; give it one so that it goes through the domain check below
	        # instead of being glued onto the local domain.
	        if link.startswith("//"):
	            link = "https:" + link

	        if re.search(HTTP_URL_PATTERN, link):
	            # Absolute URL: keep it only when the host matches
	            url_obj = urlparse(link)
	            if url_obj.netloc.replace('www.', '') == local_domain.replace('www.', ''):
	                clean_link = link
	        else:
	            # Relative link
	            if link.startswith("/"):
	                link = link[1:]
	            elif link.startswith(skip_prefixes):
	                continue

	            if 'wp-content/uploads' in url:
	                clean_link = url + "/" + link
	            else:
	                clean_link = "https://" + local_domain + "/" + link

	        if clean_link is not None:
	            clean_link = clean_link.strip().rstrip('/').replace('/../', '/')
	            if not any(x in clean_link for x in filter_strings):
	                clean_links.add(clean_link)

	    return list(clean_links)

	# this function will get you a list of all the URLs from the base URL
	def crawl(url, local_domain, prog=None, max_pages=100):
	    """Collect URLs reachable from ``url`` that stay within ``local_domain``.

	    Args:
	        url: Starting page.
	        local_domain: Host name that discovered links must belong to.
	        prog: Optional progress callback, invoked as ``prog(1, desc=...)``
	            after each page has been scanned.
	        max_pages: Crawling stops as soon as this many distinct URLs have
	            been seen. The check runs after a link is added.

	    Returns:
	        A set containing ``url`` and every link discovered before stopping.
	    """
	    pending = deque([url])
	    seen = {url}

	    while pending:
	        # deque.pop() takes the most recently added URL, so pages are visited
	        # in last-in-first-out order.
	        url_pop = pending.pop()
	        for link in get_domain_hyperlinks(local_domain, url_pop):
	            if link in seen:
	                continue
	            pending.append(link)
	            seen.add(link)
	            if len(seen) >= max_pages:
	                return seen
	        if prog is not None:
	            prog(1, desc=f'Crawling: {url_pop}')

	    return seen


	def ingestURL(documents, url, crawling=True, prog=None):
	    """Load a web page (optionally with the pages it links to) into ``documents``.

	    Args:
	        documents: List that loaded documents are appended to.
	        url: Address to ingest. It is ignored unless it starts with 'http'
	            and has a network location.
	        crawling: When True, ``crawl`` is used to gather further links from
	            the same domain; when False only ``url`` itself is loaded.
	        prog: Optional progress callback, called as ``prog(fraction, desc=...)``.

	    Returns:
	        The same ``documents`` list, extended with whatever was loaded.
	    """
	    url = url.rstrip('/')
	    local_domain = urlparse(url).netloc
	    if not local_domain or not url.startswith('http'):
	        return documents

	    print('Loading URL', url)
	    if not crawling:
	        links = {url}
	    else:
	        # gather the other pages reachable from this URL
	        if prog is not None:
	            prog(0, desc=f'Crawling: {url}')
	        links = crawl(url, local_domain, prog)
	        if prog is not None:
	            prog(1, desc=f'Crawling: {url}')

	    # PDFs get their own loader; links with a media-file extension are dropped
	    pdf_links = [x for x in links if x.endswith('.pdf')]
	    c_links = [x for x in links if not x.endswith('.pdf') and not x.endswith(media_files)]

	    if prog is not None:
	        prog(0.5, desc=f'Ingesting: {url}')
	    if c_links:
	        documents.extend(WebBaseLoader(c_links).load())

	    for pdf_link in pdf_links:
	        pdf_loader = PyMuPDFLoader(pdf_link)
	        pages = pdf_loader.load()
	        # record the loader's own source value on every page of the PDF
	        for page in pages:
	            page.metadata['source'] = pdf_loader.source
	        documents.extend(pages)

	    return documents

	def ingestFiles(documents, files_list, prog=None):
	    """Load the supported files in ``files_list`` and append them to ``documents``.

	    Handled inputs: ``.pdf``, ``.txt`` (unless the path contains
	    'WhatsApp Chat with'), ``.doc``/``.docx``, and ``.csv`` files whose path
	    contains 'WhatsApp Chat with'. Any other path is skipped.

	    Args:
	        documents: List that loaded documents are appended to.
	        files_list: Iterable of file paths given as strings.
	        prog: Optional progress callback, called as ``prog(1, desc=...)``
	            for each file that produced content.

	    Returns:
	        The same ``documents`` list, extended with the loaded documents.
	    """
	    for fPath in files_list:
	        is_whatsapp = 'WhatsApp Chat with' in fPath
	        doc = None
	        if fPath.endswith('.pdf'):
	            doc = PyMuPDFLoader(fPath).load()
	        elif fPath.endswith('.txt') and not is_whatsapp:
	            doc = TextLoader(fPath).load()
	        elif fPath.endswith(('.doc', '.docx')):
	            doc = Docx2txtLoader(fPath).load()
	        elif is_whatsapp and fPath.endswith('.csv'):  # Convert Whatsapp TXT files to CSV using https://whatstk.streamlit.app/
	            doc = WhatsAppChatLoader(fPath).load()

	        # doc is None for unsupported paths and may be an empty list when a
	        # loader extracted nothing; both cases are skipped.
	        if doc and doc[0].page_content:
	            if prog is not None:
	                # show only the file name, not the directories leading to it
	                prog(1, desc='Loaded file: ' + os.path.basename(fPath))
	            print('Loaded file:', fPath)
	            documents.extend(doc)
	    return documents


	def data_ingestion(inputDir=None, file_list=None, url_list=None, prog=None):
	    """Load documents from a directory, a list of files and/or a list of URLs.

	    Args:
	        inputDir: Optional directory; its direct entries are passed to
	            ``ingestFiles`` (without the progress callback).
	        file_list: Optional list of file paths passed to ``ingestFiles``.
	        url_list: Optional list of URLs, each passed to ``ingestURL``.
	        prog: Optional progress callback forwarded for files and URLs.

	    Returns:
	        A list of loaded documents. Unless a document's 'source' metadata
	        contains 'WhatsApp Chat with', its ``page_content`` is stripped and
	        its newlines are replaced by spaces.
	    """
	    documents = []
	    # Ingestion from Input Directory
	    if inputDir is not None:
	        # glob('*') rather than glob('*/'): from Python 3.11 a pattern ending
	        # in a path separator matches directories only, which would leave
	        # every file in the directory out.
	        files = [str(x) for x in Path(inputDir).glob('*')]
	        documents = ingestFiles(documents, files)
	    if file_list:
	        documents = ingestFiles(documents, file_list, prog)
	    # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
	    for url in url_list or []:
	        documents = ingestURL(documents, url, prog=prog)

	    # Cleanup documents
	    for x in documents:
	        if 'WhatsApp Chat with' not in x.metadata['source']:
	            # NOTE(review): the last replace swaps a space for a space as written
	            # (a no-op); it presumably targeted a double or non-breaking space
	            # that was lost in transit - confirm against the original file.
	            x.page_content = x.page_content.strip().replace('\n', ' ').replace('\\n', ' ').replace(' ', ' ')

	    return documents


	def split_docs(documents, chunk_size=2500, chunk_overlap=250):
	    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

	    Args:
	        documents: Documents to split.
	        chunk_size: Value passed to the splitter as ``chunk_size``.
	        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

	    Returns:
	        The chunks produced by ``split_documents``.
	    """
	    # Original author's note: a chunk size of 4000 makes around 1k tokens per
	    # doc; with k=4 that means 4k tokens of input to the LLM.
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	    return text_splitter.split_documents(documents)


	def getSourcesFromMetadata(metadata, sourceOnly=True, sepFileUrl=True):
	    """Summarise the distinct sources found in a list of document metadata.

	    Args:
	        metadata: Iterable of metadata dicts, one per document. ``None``
	            entries are ignored. Every dict must contain a 'source' key.
	        sourceOnly: When True each entry is just the source. When False the
	            entry is 'source: <source>' followed by any non-None 'page' and
	            'title' values.
	        sepFileUrl: When True, files and URLs are listed in separate,
	            individually numbered sections headed 'Files:' and 'URLs:'.

	    Returns:
	        A ``(text, count)`` tuple: the numbered, case-insensitively sorted
	        listing and the number of distinct entries. File sources are
	        reduced to the part after the last '/'.

	    Raises:
	        KeyError: If a metadata dict has no 'source' key.
	    """
	    def numbered(items):
	        ordered = sorted(items, key=str.casefold)
	        return '\n'.join(f"{i}) {x}" for i, x in enumerate(ordered, start=1))

	    # Rendered entry -> True when its source is a URL. The flag is decided
	    # from the start of the source itself, so a file whose name merely
	    # contains "http" is still listed as a file.
	    entries = {}
	    for meta in metadata:
	        if meta is None:
	            # nothing to report; skipping keeps an empty entry out of the count
	            continue
	        source = meta['source']
	        is_url = source.startswith(('http://', 'https://'))
	        if not is_url:
	            source = source.rsplit('/', 1)[-1]
	        if sourceOnly:
	            text = source
	        else:
	            extras = [f"{k}: {v}" for k, v in meta.items() if k in ('page', 'title') and v is not None]
	            text = ', '.join([f'source: {source}'] + extras)
	        entries[text] = is_url

	    if not sepFileUrl:
	        return numbered(entries), len(entries)

	    src_files = numbered(text for text, is_url in entries.items() if not is_url)
	    src_urls = numbered(text for text, is_url in entries.items() if is_url)

	    src_files = 'Files:\n' + src_files if src_files else ''
	    src_urls = 'URLs:\n' + src_urls if src_urls else ''
	    newLineSep = '\n\n' if src_files and src_urls else ''

	    return src_files + newLineSep + src_urls, len(entries)


	def getVsDict(embeddingFunc, docs, vsDict=None):
	    """Replace the contents of a Chroma vector store with ``docs``.

	    Args:
	        embeddingFunc: Embedding function handed to ``Chroma`` when a new
	            client has to be created.
	        docs: Documents to add to the store.
	        vsDict: Optional dict holding 'chromaClient' (and 'chromaDir'). When
	            it has no client, one is created under a new './vecstore/<uuid>'
	            directory and stored in the dict.

	    Returns:
	        ``vsDict`` (the passed dict, or a new one) with 'chromaClient' set.
	    """
	    # A fresh dict per call: a mutable default would be shared, so every caller
	    # that omits vsDict would reuse - and wipe - the same client.
	    if vsDict is None:
	        vsDict = {}
	    # create chroma client if doesnt exist
	    if vsDict.get('chromaClient') is None:
	        vsDict['chromaDir'] = './vecstore/' + str(uuid.uuid1())
	        vsDict['chromaClient'] = Chroma(embedding_function=embeddingFunc, persist_directory=vsDict['chromaDir'])
	    client = vsDict['chromaClient']
	    # clear chroma client before adding new docs
	    if client._collection.count() > 0:
	        client.delete(client.get()['ids'])
	    # add new docs to chroma client
	    client.add_documents(docs)
	    print('vectorstore count:', client._collection.count(), 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
	    return vsDict

	# used for Hardcoded documents only - not uploaded by user (userData_vecStore is separate function)
	def localData_vecStore(openApiKey=None, inputDir=None, file_list=None, url_list=None, vsDict=None):
	    """Ingest the given sources and load them into a Chroma vector store.

	    Args:
	        openApiKey: API key passed to ``OpenAIEmbeddings``.
	        inputDir: Optional directory of files to ingest.
	        file_list: Optional list of file paths to ingest.
	        url_list: Optional list of URLs to ingest.
	        vsDict: Optional vector-store dict to reuse (see ``getVsDict``).

	    Returns:
	        The vector-store dict returned by ``getVsDict``, or an empty dict
	        when no documents could be loaded.
	    """
	    # Build fresh containers per call instead of sharing mutable defaults;
	    # vsDict in particular is modified further down the call chain.
	    vsDict = {} if vsDict is None else vsDict
	    documents = data_ingestion(inputDir, file_list or [], url_list or [])
	    if not documents:
	        return {}
	    docs = split_docs(documents)
	    # Embeddings
	    embeddings = OpenAIEmbeddings(openai_api_key=openApiKey)
	    # create chroma client if doesnt exist
	    vsDict_hd = getVsDict(embeddings, docs, vsDict)
	    # report which sources ended up in the store
	    sources_text, sources_count = getSourcesFromMetadata(vsDict_hd['chromaClient'].get()['metadatas'])
	    print(str(sources_count) + ' source document(s) successfully loaded in vector store.' + '\n\n' + sources_text)
	    return vsDict_hd


	def num_tokens_from_string(string, encoding_name = "cl100k_base"):
	    """Count the tokens the named tiktoken encoding produces for ``string``."""
	    tokens = tiktoken.get_encoding(encoding_name).encode(string)
	    return len(tokens)