nsarrazin HF staff committed on
Commit
1061bc2
1 Parent(s): f88542b

Revert "Update embedding model for WebSearch (#437)"

Browse files

This reverts commit f88542b1121903c1df34b20bd7b8213148e85ca0.

src/lib/server/websearch/sentenceSimilarity.ts CHANGED
@@ -6,14 +6,16 @@ function innerProduct(tensor1: Tensor, tensor2: Tensor) {
6
  return 1.0 - dot(tensor1.data, tensor2.data);
7
  }
8
 
9
- const extractor = await pipeline("feature-extraction", "Xenova/gte-small");
10
 
11
  export async function findSimilarSentences(
12
  query: string,
13
  sentences: string[],
14
  { topK = 5 }: { topK: number }
15
  ) {
16
- const input = [query, ...sentences];
 
 
17
  const output: Tensor = await extractor(input, { pooling: "mean", normalize: true });
18
 
19
  const queryTensor: Tensor = output[0];
 
6
  return 1.0 - dot(tensor1.data, tensor2.data);
7
  }
8
 
9
+ const extractor = await pipeline("feature-extraction", "Xenova/e5-small-v2");
10
 
11
  export async function findSimilarSentences(
12
  query: string,
13
  sentences: string[],
14
  { topK = 5 }: { topK: number }
15
  ) {
16
+ // this preprocessing step is suggested for e5-small-v2 model
17
+ // see more: https://huggingface.co/intfloat/e5-small-v2/blob/main/README.md?code=true#L2631
18
+ const input = [`query: ${query}`, ...sentences.map((s) => `passage: ${s}`)];
19
  const output: Tensor = await extractor(input, { pooling: "mean", normalize: true });
20
 
21
  const queryTensor: Tensor = output[0];