"""Factory helpers that build LangChain embedding models.

Each active factory returns an ``(embedding_model, dimension)`` pair so callers
can size downstream storage to match the model that was selected.
"""

import os
from typing import Tuple

import tiktoken
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import torch
from transformers import AutoModel, AutoTokenizer
from langchain_huggingface import HuggingFaceEmbeddings


# ---------------------------------------------------------------------------
# Disabled BGE factories, kept for reference only.
# NOTE: their return shapes are inconsistent with the active factories below:
# the bge-en-icl variant returns ``(model, 4096)`` while the other two return
# the bare model. Align them to ``(model, dimension)`` before re-enabling.
# ---------------------------------------------------------------------------
# def get_embeddings_model_bge_base_en_v1_5():
#     model_name = "BAAI/bge-base-en-v1.5"
#     model_kwargs = {'device': 'cpu'}
#     encode_kwargs = {'normalize_embeddings': False}
#     embedding_model = HuggingFaceBgeEmbeddings(
#         model_name=model_name,
#         model_kwargs=model_kwargs,
#         encode_kwargs=encode_kwargs
#     )
#     return embedding_model

# def get_embeddings_model_bge_en_icl():
#     model_name = "BAAI/bge-en-icl"
#     model_kwargs = {'device': 'cpu'}
#     encode_kwargs = {'normalize_embeddings': False}
#     embedding_model = HuggingFaceBgeEmbeddings(
#         model_name=model_name,
#         model_kwargs=model_kwargs,
#         encode_kwargs=encode_kwargs
#     )
#     return embedding_model , 4096

# def get_embeddings_model_bge_large_en():
#     model_name = "BAAI/bge-large-en"
#     model_kwargs = {'device': 'cpu'}
#     encode_kwargs = {'normalize_embeddings': False}
#     embedding_model = HuggingFaceBgeEmbeddings(
#         model_name=model_name,
#         model_kwargs=model_kwargs,
#         encode_kwargs=encode_kwargs
#     )
#     return embedding_model


def get_embeddings_openai_text_3_large() -> Tuple[OpenAIEmbeddings, int]:
    """Build the OpenAI ``text-embedding-3-large`` embedding model.

    Returns:
        An ``(embedding_model, dimension)`` tuple where ``embedding_model`` is
        an ``OpenAIEmbeddings`` instance and ``dimension`` is 3072.
    """
    # NOTE(review): credentials are resolved inside langchain_openai
    # (presumably via the OPENAI_API_KEY environment variable) -- confirm
    # against the deployment configuration.
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    dimension = 3072
    return embedding_model, dimension


# Disabled variant that loaded the raw transformers model from a local cache
# directory next to this file instead of going through LangChain.
# def get_embeddings_snowflake_arctic_embed_l():
#     current_dir = os.path.dirname(os.path.realpath(__file__))
#     model_name = "Snowflake/snowflake-arctic-embed-l"
#     tokenizer = AutoTokenizer.from_pretrained(f"{current_dir}/cache/tokenizer/{model_name}")
#     model = AutoModel.from_pretrained(f"{current_dir}/cache/model/{model_name}")
#     return model,1024


def get_embeddings_snowflake_arctic_embed_l() -> Tuple[HuggingFaceEmbeddings, int]:
    """Build the ``Snowflake/snowflake-arctic-embed-l`` embedding model.

    Returns:
        An ``(embedding_model, dimension)`` tuple where ``embedding_model`` is
        a ``HuggingFaceEmbeddings`` instance and ``dimension`` is 1024.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="Snowflake/snowflake-arctic-embed-l"
    )
    # Named like the OpenAI factory's value so both factories read the same.
    dimension = 1024
    return embedding_model, dimension