Spaces:
Sleeping
Sleeping
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores import Qdrant | |
from gh_issue_loader import GHLoader | |
from config import DB_CONFIG | |
CHUNK_SIZE = 500  # default ``chunk_size`` handed to the text splitter


def get_text_chunk(docs, chunk_size=CHUNK_SIZE, chunk_overlap=0):
    """Split documents into smaller chunks.

    Args:
        docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
            Defaults to ``CHUNK_SIZE``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 0.

    Returns:
        The result of ``split_documents(docs)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)
def store(texts):
    """Embed ``texts`` and upload them to the Qdrant collection in ``DB_CONFIG``.

    The embedding model is ``intfloat/multilingual-e5-large`` with
    ``normalize_embeddings`` disabled. It runs on CUDA when PyTorch reports
    a usable GPU and on CPU otherwise.

    Args:
        texts: Document chunks to index. An empty (falsy) value is a no-op.

    Returns:
        None.
    """
    if not texts:
        # Nothing to index: skip loading the model and contacting the DB.
        return

    # Function-scope import keeps the device probe local to this block.
    import torch

    # Previously hard-coded to "cuda", which fails on hosts without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": False},
    )

    # DB_CONFIG unpacks as (url, api key, collection name).
    db_url, db_api_key, db_collection_name = DB_CONFIG
    Qdrant.from_documents(
        texts,
        embeddings,
        url=db_url,
        api_key=db_api_key,
        collection_name=db_collection_name,
    )
def main(repo_name: str, path: str) -> None:
    """Load documents via ``GHLoader``, chunk them, and store the chunks.

    Args:
        repo_name: First argument passed to ``GHLoader``.
        path: Second argument passed to ``GHLoader``.
    """
    documents = GHLoader(repo_name, path).load()
    store(get_text_chunk(documents))
if __name__ == "__main__":
    # Usage:
    #   $ python store.py "REPO_NAME" "FILE_PATH"
    #   $ python store.py cocoa data/cocoa-issues.json
    import sys

    args = sys.argv
    if len(args) != 3:
        # Report misuse on stderr and exit non-zero so callers (shells, CI)
        # can tell the run did not happen; this also covers too many args.
        print(
            "Wrong number of args, you need two args for repo_name, json_file_path",
            file=sys.stderr,
        )
        sys.exit(1)

    repo_name = args[1]
    path = args[2]
    main(repo_name, path)