ONNX-Demo
/
pyserini
/resources
/index-metadata
/faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md
miracl-v1.0-mdpr-tied-pft-msmarco-ft-miracl-${lang}
This index was generated on 2023/03/21 using tevatron with the following commands:
Create Train Directory
create_train_dir.py
"""Build the per-language MIRACL training file with BM25 hard negatives.

For every MIRACL training query, the annotated negative passages are merged
with those BM25 top-100 hits that are not already judged (positive or
negative), the merged list is shuffled, and the first 30 are kept as the
query's negatives. Queries that end up with no negatives are not written.

``${lang}`` is a template placeholder: replace it with the MIRACL language
code before running this script (Python does not expand it).
"""
import json
from random import shuffle

from datasets import load_dataset
from pyserini.search.lucene import LuceneSearcher
from tqdm import tqdm

searcher = LuceneSearcher.from_prebuilt_index('miracl-v1.0-${lang}')
searcher.set_language('${lang}')
miracl_train = load_dataset('miracl/miracl', '${lang}', split='train')

# ensure_ascii=False writes raw non-ASCII text, so pin the file encoding
# instead of relying on the platform's locale default.
with open('miracl_train_bm25_neg_top100_random30.${lang}.jsonl', 'w', encoding='utf-8') as f:
    for data in tqdm(miracl_train):
        query = data['query']
        negatives = data['negative_passages']

        # Every docid that already has a judgment for this query; a set keeps
        # the per-hit membership test O(1).
        judged_ids = {p['docid'] for p in data['positive_passages']}
        judged_ids.update(p['docid'] for p in negatives)

        # BM25 hits without a judgment are treated as additional negatives.
        bm25_negatives = []
        for hit in searcher.search(query, k=100):
            info = json.loads(hit.raw)
            if info['docid'] not in judged_ids:
                bm25_negatives.append(info)

        all_negatives = negatives + bm25_negatives
        shuffle(all_negatives)
        random_30_negatives = all_negatives[:30]
        data['negative_passages'] = random_30_negatives

        # Skip queries for which no negative passage is available at all.
        if random_30_negatives:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
# ${lang} inside create_train_dir.py is a placeholder; substitute the language code before running.
python create_train_dir.py
Train
# Fine-tune castorini/mdpr-tied-pft-msmarco on the per-language training file
# written by create_train_dir.py, on GPU 0, saving to model_miracl_${lang}.
# NOTE(review): --dataset_name Tevatron/msmarco-passage is passed alongside
# --train_dir; presumably it only selects the data format - confirm against Tevatron.
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \
--output_dir model_miracl_${lang} \
--model_name_or_path castorini/mdpr-tied-pft-msmarco \
--tokenizer_name bert-base-multilingual-cased \
--save_steps 20000 \
--dataset_name Tevatron/msmarco-passage \
--per_device_train_batch_size 64 \
--train_dir miracl_train_bm25_neg_top100_random30.${lang}.jsonl \
--train_n_passages 2 \
--learning_rate 1e-5 \
--q_max_len 32 \
--p_max_len 256 \
--num_train_epochs 40 \
--logging_steps 10 \
--overwrite_output_dir \
--fp16
Encode Corpus
# Encode the ${lang} MIRACL corpus with the fine-tuned model_miracl_${lang}
# checkpoint; the embeddings are written to the --encoded_save_path file,
# which convert_index.py later reads from the model_miracl_${lang}_corpus directory.
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.encode \
--output_dir=temp_out \
--model_name_or_path model_miracl_${lang} \
--fp16 \
--per_device_eval_batch_size 256 \
--dataset_name miracl/miracl-corpus:${lang} \
--p_max_len 256 \
--encoded_save_path model_miracl_${lang}_corpus/${lang}_corpus_emb.pt
Convert Index
convert_index.py
"""Merge pickled corpus-embedding shards into a flat inner-product Faiss index.

Every file in ``--input`` whose name contains ``corpus_emb`` is loaded and its
vectors are added to a 768-dimensional ``IndexFlatIP``. The index is written
to ``<output>/index`` and the matching document ids, one per line and in the
same order as the vectors, to ``<output>/docid``.
"""
import argparse
import os
import pickle

import faiss
import numpy as np
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, required=True)
parser.add_argument('--output', type=str, required=True)
args = parser.parse_args()


def pickle_load(path):
    """Load one embedding shard and return ``(vectors, ids)``.

    NOTE: ``pickle.load`` can execute arbitrary code from the file; only point
    this script at shards you produced yourself.
    """
    with open(path, 'rb') as f:
        reps, lookup = pickle.load(f)
    # Faiss operates on float32 matrices; cast explicitly so shards saved in
    # another precision are still accepted.
    return np.asarray(reps, dtype=np.float32), lookup


index = faiss.IndexFlatIP(768)
all_ids = []

# Sort the shard names so the vector order does not depend on the arbitrary
# order in which os.listdir returns directory entries.
shard_names = sorted(name for name in os.listdir(args.input) if 'corpus_emb' in name)
for name in tqdm(shard_names):
    reps, ids = pickle_load(os.path.join(args.input, name))
    # ids and vectors are appended together so row i of the index always
    # corresponds to line i of the docid file.
    all_ids.extend(ids)
    index.add(reps)

# The output directory may not exist yet; create it before writing into it.
os.makedirs(args.output, exist_ok=True)
faiss.write_index(index, os.path.join(args.output, 'index'))
with open(os.path.join(args.output, 'docid'), 'w', encoding='utf-8') as f:
    f.writelines(f'{i}\n' for i in all_ids)
# Build the Faiss index from the encoded corpus with the convert_index.py script shown above.
python convert_index.py --input=model_miracl_${lang}_corpus --output=${lang}_index
Index from Pyserini
We also verified that indexing directly with Pyserini from the same checkpoint, using the following command, yields the same score (run on basilisk; tested only on Swahili).
# Encode the corpus with pyserini.encode and write the embeddings directly as a Faiss index into $index_dir.
# $lang, $corpus, $shard_id and $shard_num are not set in this snippet and must be defined beforehand.
encoder=castorini/mdpr-tied-pft-msmarco-ft-miracl-$lang
index_dir=miracl-v1.0-$lang-mdpr-tied-pft-msmarco-ft-miracl-$lang
echo $index_dir
CUDA_VISIBLE_DEVICES=1 \
python -m pyserini.encode input --corpus $corpus \
--fields title text \
--delimiter "\n\n" \
--shard-id $shard_id \
--shard-num $shard_num \
output --embeddings $index_dir \
--to-faiss \
encoder --encoder $encoder \
--fields title text \
--batch 128 \
--encoder-class 'auto' \
--fp16